# Preprocessing

In [1]:
# # You may need to run those in your enviroment terminal.

%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import spacy
import nltk
from nltk.corpus import wordnet
import random
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from datasets import Dataset
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score
import torch


nltk.download('wordnet')
nltk.download('omw-1.4')

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hanee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hanee\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Creating data frame of the data and assigning them labels

In [3]:
texts = []
labels = []

base_path = Path('review_polarity/txt_sentoken')
pos_path = base_path / 'pos'
neg_path = base_path / 'neg'

# Assign label 1
if pos_path.exists():
    for file in pos_path.glob('*.txt'):
        with open(file, 'r', encoding = 'utf-8') as f:
            texts.append(f.read())
            labels.append(1)

# Assign label 0
if neg_path.exists():
    for file in neg_path.glob('*.txt'):
        with open(file, 'r', encoding = 'utf-8') as f:
            texts.append(f.read())
            labels.append(0)

df = pd.DataFrame({
    'label': labels,
    'text': texts
})

df.tail()

Unnamed: 0,label,text
1995,0,"if anything , "" stigmata "" should be taken as ..."
1996,0,"john boorman's "" zardoz "" is a goofy cinematic..."
1997,0,the kids in the hall are an acquired taste . \...
1998,0,there was a time when john carpenter was a gre...
1999,0,two party guys bob their heads to haddaway's d...


In [4]:
df[df.duplicated()]

Unnamed: 0,label,text


No duplicates were found

In [5]:
def spacy_pos_to_wordnet_pos(spacy_pos):
    if spacy_pos.startswith('N'):
        return wordnet.NOUN
    elif spacy_pos.startswith('V'):
        return wordnet.VERB
    elif spacy_pos.startswith('J'):
        return wordnet.ADJ
    elif spacy_pos.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default fallback

In [6]:
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    synonyms.discard(word)      # Remove the original word to avoid replacement with itself
    return list(synonyms)

In [7]:
def augment_text_with_synonyms(tokens, pos_tags, synonym_probability = 0.2):
    augmented_tokens = []

    for token, pos_tag in zip(tokens, pos_tags):
        if random.random() < synonym_probability:
            if pos_tag in ['n', 'v', 'a']:   #nouns adjectives and verbs
                synonyms = get_synonyms(token)
                if synonyms:
                    new_word = random.choice(synonyms)
                    augmented_tokens.append(new_word)
                    continue
        augmented_tokens.append(token)         # Add the original word if no augmentation is done

    return augmented_tokens

In [8]:
def apply_augmentation_to_dataset(word_tokens, pos_tags, texts, labels, sentence_tokens, synonym_probability = 0.2):
    all_word_tokens = word_tokens.copy()
    all_texts = texts.copy()  # Original text is preserved
    all_labels = labels.copy()
    all_sentence_tokens = sentence_tokens.copy()
    all_pos_tags = pos_tags.copy()

    for tokens, pos, label, sentence, text in zip(word_tokens, pos_tags, labels, sentence_tokens, texts):
        augmented_tokens = augment_text_with_synonyms(tokens, pos, synonym_probability)

        all_word_tokens.append(augmented_tokens)
        all_texts.append(text)  # Keep the original text
        all_labels.append(label)
        all_pos_tags.append(pos)
        all_sentence_tokens.append(sentence)  # Sentence tokens are not augmented

    return all_word_tokens, all_pos_tags, all_texts, all_labels, all_sentence_tokens

In [9]:
def apply_lemmatization(tokens):
    return [token.lemma_ for token in nlp(' '.join(tokens))]

In [10]:
def apply_stemming(text):

    words = text.split()
    return ' '.join([PorterStemmer().stem(word) for word in words])

In [11]:
def visualize_tf_idf_heatmap(tfidf_matrix, feature_names, n_top_features = 20, n_top_docs = 10):
    # Get the top features by summing TF-IDF scores across documents
    tfidf_array = tfidf_matrix.toarray()
    feature_importance = np.sum(tfidf_array, axis = 0)
    top_feature_indices = np.argsort(feature_importance)[-n_top_features:][::-1]
    top_features = [feature_names[i] for i in top_feature_indices]

    # Get the top documents by summing TF-IDF scores across features
    doc_importance = np.sum(tfidf_array, axis=1)
    top_doc_indices = np.argsort(doc_importance)[-n_top_docs:][::-1]

    # Extract the submatrix for visualization
    sub_matrix = tfidf_array[np.ix_(top_doc_indices, top_feature_indices)]

    plt.figure(figsize = (12, 8))
    sns.heatmap(
        sub_matrix,
        annot = True,          # Show values in cells
        fmt = '.3f',           # Format with 3 decimal places
        cmap = 'YlGnBu',       # Better colormap
        xticklabels = top_features,
        yticklabels = range(n_top_docs)
    )
    plt.title(f'TF-IDF Heatmap (Top {n_top_features} Features, First {n_top_docs} Documents)')
    plt.xlabel('Words')
    plt.ylabel('Documents')
    plt.xticks(rotation = 45, ha = 'right')
    plt.tight_layout()
    plt.savefig('tfidf_heatmap.png')
    plt.close()

    # Bar chart of top features across the corpus
    plt.figure(figsize=(12, 6))
    top_features_scores = [feature_importance[i] for i in top_feature_indices]
    plt.bar(top_features, top_features_scores)
    plt.title('Top TF-IDF Features Across All Documents')
    plt.xlabel('Features')
    plt.ylabel('Sum of TF-IDF Scores')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('tfidf_top_features.png')
    plt.close()

In [12]:
texts = df['text'].tolist()
labels = df['label'].tolist()
word_tokens = []
sentence_tokens = []
pos_tags = []

for text in texts:
    doc = nlp(text)
    sentence_tokens.append([sent.text.strip() for sent in doc.sents])
    tokens = [token for token in doc if not token.is_punct and not token.is_stop and not token.is_space]
    word_tokens.append([token.text.lower() for token in tokens])
    pos_tags.append([spacy_pos_to_wordnet_pos(token.tag_) for token in tokens])

lemmatized_tokens = [apply_lemmatization(t) for t in word_tokens]

word_tokens, pos_tags, texts, labels, sentence_tokens = apply_augmentation_to_dataset(
    lemmatized_tokens, pos_tags, texts, labels, sentence_tokens
)

joined_texts = [' '.join(tokens) for tokens in word_tokens]

augmented_df = pd.DataFrame({
    'label': labels,
    'text': joined_texts,
    'word_tokens': word_tokens
})

augmented_df = augmented_df.sample(frac = 1, random_state = 42).reset_index(drop = True)

train_texts, test_texts, train_labels, test_labels = train_test_split(
    augmented_df['text'], augmented_df['label'], test_size = 0.2, random_state = 42
)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
vectorizer.fit(train_texts)
xtrain_tfidf = vectorizer.transform(train_texts)
xtest_tfidf = vectorizer.transform(test_texts)

tfidf_matrix = vectorizer.fit_transform(joined_texts)
feature_names = vectorizer.get_feature_names_out()

print(f"\nTF-IDF Matrix Shape: {tfidf_matrix.shape}")
print(f"Number of features: {len(feature_names)}")
visualize_tf_idf_heatmap(tfidf_matrix, feature_names)


TF-IDF Matrix Shape: (4000, 42868)
Number of features: 42868


In [13]:
print("augmented_df.head():")
display(augmented_df.head())

print("\ndf.head():")
display(df.head())

augmented_df.head():


Unnamed: 0,label,text,word_tokens
0,1,harmless silly fun comedy dim witte wrestle fa...,"[harmless, silly, fun, comedy, dim, witte, wre..."
1,0,point movie staging opera go completely wrong ...,"[point, movie, staging, opera, go, completely,..."
2,1,sick life death bob flanagan supermasochist fe...,"[sick, life, death, bob, flanagan, supermasoch..."
3,0,everybody film think alicia documentary see cr...,"[everybody, film, think, alicia, documentary, ..."
4,1,lisa cholodenko heights art intelligent quiet ...,"[lisa, cholodenko, heights, art, intelligent, ..."



df.head():


Unnamed: 0,label,text
0,1,films adapted from comic books have had plenty...
1,1,every now and then a movie comes along from a ...
2,1,you've got mail works alot better than it dese...
3,1,""" jaws "" is a rare film that grabs your atten..."
4,1,moviemaking is a lot like being the general ma...


In [14]:
print("augmented_df.tail():")
display(augmented_df.tail())

print("\ndf.tail():")
display(df.tail())

augmented_df.tail():


Unnamed: 0,label,text,word_tokens
3995,0,understand clich hell earth truly mean recentl...,"[understand, clich, hell, earth, truly, mean, ..."
3996,0,1954 japanese monster film godzilla transform ...,"[1954, japanese, monster, film, godzilla, tran..."
3997,1,verdict spine chilling drama horror maestro st...,"[verdict, spine, chilling, drama, horror, maes..."
3998,0,midway anaconda documentary filmmaker terri fl...,"[midway, anaconda, documentary, filmmaker, ter..."
3999,0,starship trooper regretful moving_picture mean...,"[starship, trooper, regretful, moving_picture,..."



df.tail():


Unnamed: 0,label,text
1995,0,"if anything , "" stigmata "" should be taken as ..."
1996,0,"john boorman's "" zardoz "" is a goofy cinematic..."
1997,0,the kids in the hall are an acquired taste . \...
1998,0,there was a time when john carpenter was a gre...
1999,0,two party guys bob their heads to haddaway's d...


In [15]:
print("\ndf dimentions:")
display(df.shape)

print("\naugmented_df dimentions:")
display(augmented_df.shape)


df dimentions:


(2000, 2)


augmented_df dimentions:


(4000, 3)

# Modelling - Classifiers

####        •    ML -> Logistic Regression, Naive Bayes, SVM, Decision tree, and Random Forest.

In [16]:
def train_model(classifier, feature_vector_train, train_labels, feature_vector_test, test_labels, is_neural_net=False):
    classifier.fit(feature_vector_train, train_labels)

    train_predictions = classifier.predict(feature_vector_train)
    test_predictions = classifier.predict(feature_vector_test)
    train_accuracy = metrics.accuracy_score(train_labels, train_predictions)
    test_accuracy = metrics.accuracy_score(test_labels, test_predictions)

    print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
    print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

#### 1. Logistic Regression

In [17]:
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_labels, xtest_tfidf, test_labels)

Train Accuracy: 98.06%
Test Accuracy: 91.38%


#### 2. Naive Bayes

In [18]:
accuracy = train_model(naive_bayes.MultinomialNB(alpha = 0.0001), xtrain_tfidf, train_labels, xtest_tfidf,test_labels)

Train Accuracy: 99.94%
Test Accuracy: 92.12%


#### 3. SVM

In [19]:
accuracy = train_model(svm.SVC(kernel = 'linear', C = 1.0), xtrain_tfidf, train_labels, xtest_tfidf, test_labels)

Train Accuracy: 99.62%
Test Accuracy: 95.50%


In [20]:
accuracy = train_model(svm.SVC(kernel = 'rbf', C = 1.0), xtrain_tfidf, train_labels, xtest_tfidf, test_labels)

Train Accuracy: 99.94%
Test Accuracy: 96.38%


In [21]:
accuracy = train_model(svm.SVC(kernel = 'poly', C = 1.0, degree = 3), xtrain_tfidf, train_labels, xtest_tfidf, test_labels)

Train Accuracy: 100.00%
Test Accuracy: 92.88%


#### 4. Decision Tree

In [22]:
accuracy = train_model(DecisionTreeClassifier(), xtrain_tfidf, train_labels, xtest_tfidf, test_labels)

Train Accuracy: 100.00%
Test Accuracy: 79.75%


In [23]:
accuracy = train_model(RandomForestClassifier(), xtrain_tfidf, train_labels, xtest_tfidf, test_labels)

Train Accuracy: 100.00%
Test Accuracy: 94.88%


#### • DL -> BERT

In [28]:


# Use same train/test split for BERT
bert_train_texts = train_texts.tolist()
bert_test_texts = test_texts.tolist()
bert_train_labels = train_labels.tolist()
bert_test_labels = test_labels.tolist()

bert_train_ds = Dataset.from_dict({"text": bert_train_texts, "label": bert_train_labels})
bert_test_ds = Dataset.from_dict({"text": bert_test_texts, "label": bert_test_labels})


In [29]:


# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the datasets
def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=256)

bert_train_ds = bert_train_ds.map(tokenize_function, batched=True)
bert_test_ds = bert_test_ds.map(tokenize_function, batched=True)

# Set PyTorch format
bert_train_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])
bert_test_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])


Map: 100%|██████████| 3200/3200 [00:14<00:00, 223.16 examples/s]
Map: 100%|██████████| 800/800 [00:04<00:00, 181.77 examples/s]


In [33]:

# Load pre-trained BERT for classification
from transformers import DistilBertForSequenceClassification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Define evaluation metric
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    return {"accuracy": accuracy_score(labels, predictions)}


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="no",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=bert_train_ds,
    eval_dataset=bert_test_ds,
    compute_metrics=compute_metrics,
)

trainer.train()

                                                   
 16%|█▌        | 124/800 [52:24<1:28:46,  7.88s/it]

{'loss': 0.6949, 'learning_rate': 4.96875e-05, 'epoch': 0.01}


                                                   
 16%|█▌        | 124/800 [52:34<1:28:46,  7.88s/it]

{'loss': 0.7136, 'learning_rate': 4.937500000000001e-05, 'epoch': 0.03}


                                                   
 16%|█▌        | 124/800 [52:43<1:28:46,  7.88s/it]

{'loss': 0.6651, 'learning_rate': 4.90625e-05, 'epoch': 0.04}


                                                   
 16%|█▌        | 124/800 [52:53<1:28:46,  7.88s/it]

{'loss': 0.6833, 'learning_rate': 4.875e-05, 'epoch': 0.05}


                                                   
 16%|█▌        | 124/800 [53:02<1:28:46,  7.88s/it]

{'loss': 0.6821, 'learning_rate': 4.8437500000000005e-05, 'epoch': 0.06}


                                                   
 16%|█▌        | 124/800 [53:12<1:28:46,  7.88s/it]

{'loss': 0.7037, 'learning_rate': 4.8125000000000004e-05, 'epoch': 0.07}


                                                   
 16%|█▌        | 124/800 [53:23<1:28:46,  7.88s/it]

{'loss': 0.6878, 'learning_rate': 4.7812500000000003e-05, 'epoch': 0.09}


                                                   
 16%|█▌        | 124/800 [53:35<1:28:46,  7.88s/it]

{'loss': 0.6995, 'learning_rate': 4.75e-05, 'epoch': 0.1}


                                                   
 16%|█▌        | 124/800 [53:46<1:28:46,  7.88s/it]

{'loss': 0.7071, 'learning_rate': 4.71875e-05, 'epoch': 0.11}


                                                   
 16%|█▌        | 124/800 [53:58<1:28:46,  7.88s/it]

{'loss': 0.699, 'learning_rate': 4.6875e-05, 'epoch': 0.12}


                                                   
 16%|█▌        | 124/800 [54:10<1:28:46,  7.88s/it]

{'loss': 0.679, 'learning_rate': 4.65625e-05, 'epoch': 0.14}


                                                   
 16%|█▌        | 124/800 [54:21<1:28:46,  7.88s/it]

{'loss': 0.6936, 'learning_rate': 4.6250000000000006e-05, 'epoch': 0.15}


                                                   
 16%|█▌        | 124/800 [54:33<1:28:46,  7.88s/it]

{'loss': 0.6877, 'learning_rate': 4.59375e-05, 'epoch': 0.16}


                                                   
 16%|█▌        | 124/800 [54:44<1:28:46,  7.88s/it]

{'loss': 0.6793, 'learning_rate': 4.5625e-05, 'epoch': 0.17}


                                                   
 16%|█▌        | 124/800 [54:57<1:28:46,  7.88s/it]

{'loss': 0.709, 'learning_rate': 4.5312500000000004e-05, 'epoch': 0.19}


                                                   
 16%|█▌        | 124/800 [55:08<1:28:46,  7.88s/it]

{'loss': 0.7628, 'learning_rate': 4.5e-05, 'epoch': 0.2}


                                                   
 16%|█▌        | 124/800 [55:20<1:28:46,  7.88s/it]

{'loss': 0.674, 'learning_rate': 4.46875e-05, 'epoch': 0.21}


                                                   
 16%|█▌        | 124/800 [55:31<1:28:46,  7.88s/it]

{'loss': 0.6997, 'learning_rate': 4.4375e-05, 'epoch': 0.23}


                                                   
 16%|█▌        | 124/800 [55:41<1:28:46,  7.88s/it]

{'loss': 0.717, 'learning_rate': 4.40625e-05, 'epoch': 0.24}


                                                   
 16%|█▌        | 124/800 [55:52<1:28:46,  7.88s/it]

{'loss': 0.7824, 'learning_rate': 4.375e-05, 'epoch': 0.25}


                                                   
 16%|█▌        | 124/800 [56:04<1:28:46,  7.88s/it]

{'loss': 0.6403, 'learning_rate': 4.3437500000000006e-05, 'epoch': 0.26}


                                                   
 16%|█▌        | 124/800 [56:15<1:28:46,  7.88s/it]

{'loss': 0.7374, 'learning_rate': 4.3125000000000005e-05, 'epoch': 0.28}


                                                   
 16%|█▌        | 124/800 [56:27<1:28:46,  7.88s/it]

{'loss': 0.6881, 'learning_rate': 4.28125e-05, 'epoch': 0.29}


                                                   
 16%|█▌        | 124/800 [56:39<1:28:46,  7.88s/it]

{'loss': 0.6862, 'learning_rate': 4.25e-05, 'epoch': 0.3}


                                                   
 16%|█▌        | 124/800 [56:51<1:28:46,  7.88s/it]

{'loss': 0.5871, 'learning_rate': 4.21875e-05, 'epoch': 0.31}


                                                   
 16%|█▌        | 124/800 [57:02<1:28:46,  7.88s/it]

{'loss': 0.6859, 'learning_rate': 4.1875e-05, 'epoch': 0.33}


                                                   
 16%|█▌        | 124/800 [57:14<1:28:46,  7.88s/it]

{'loss': 0.7591, 'learning_rate': 4.156250000000001e-05, 'epoch': 0.34}


                                                   
 16%|█▌        | 124/800 [57:26<1:28:46,  7.88s/it]

{'loss': 0.7206, 'learning_rate': 4.125e-05, 'epoch': 0.35}


                                                   
 16%|█▌        | 124/800 [57:37<1:28:46,  7.88s/it]

{'loss': 0.7676, 'learning_rate': 4.09375e-05, 'epoch': 0.36}


                                                   
 16%|█▌        | 124/800 [57:47<1:28:46,  7.88s/it]

{'loss': 0.6201, 'learning_rate': 4.0625000000000005e-05, 'epoch': 0.38}


                                                   
 16%|█▌        | 124/800 [57:56<1:28:46,  7.88s/it]

{'loss': 0.6127, 'learning_rate': 4.0312500000000004e-05, 'epoch': 0.39}


                                                   
 16%|█▌        | 124/800 [58:06<1:28:46,  7.88s/it]

{'loss': 0.6794, 'learning_rate': 4e-05, 'epoch': 0.4}


                                                   
 16%|█▌        | 124/800 [58:16<1:28:46,  7.88s/it]

{'loss': 0.5358, 'learning_rate': 3.96875e-05, 'epoch': 0.41}


                                                   
 16%|█▌        | 124/800 [58:26<1:28:46,  7.88s/it]

{'loss': 0.5371, 'learning_rate': 3.9375e-05, 'epoch': 0.42}


                                                   
 16%|█▌        | 124/800 [58:38<1:28:46,  7.88s/it]

{'loss': 0.6883, 'learning_rate': 3.90625e-05, 'epoch': 0.44}


                                                   
 16%|█▌        | 124/800 [58:50<1:28:46,  7.88s/it]

{'loss': 0.5609, 'learning_rate': 3.875e-05, 'epoch': 0.45}


                                                   
 16%|█▌        | 124/800 [59:01<1:28:46,  7.88s/it]

{'loss': 0.6352, 'learning_rate': 3.8437500000000006e-05, 'epoch': 0.46}


                                                   
 16%|█▌        | 124/800 [59:10<1:28:46,  7.88s/it]

{'loss': 0.4498, 'learning_rate': 3.8125e-05, 'epoch': 0.47}


                                                   
 16%|█▌        | 124/800 [59:20<1:28:46,  7.88s/it]

{'loss': 0.7485, 'learning_rate': 3.78125e-05, 'epoch': 0.49}


                                                   
 16%|█▌        | 124/800 [59:30<1:28:46,  7.88s/it]

{'loss': 1.002, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.5}


                                                   
 16%|█▌        | 124/800 [59:39<1:28:46,  7.88s/it]

{'loss': 0.6762, 'learning_rate': 3.71875e-05, 'epoch': 0.51}


                                                   
 16%|█▌        | 124/800 [59:48<1:28:46,  7.88s/it]

{'loss': 0.6513, 'learning_rate': 3.6875e-05, 'epoch': 0.53}


                                                   
 16%|█▌        | 124/800 [59:57<1:28:46,  7.88s/it]

{'loss': 0.6114, 'learning_rate': 3.65625e-05, 'epoch': 0.54}


                                                   
 16%|█▌        | 124/800 [1:00:07<1:28:46,  7.88s/it]

{'loss': 0.6087, 'learning_rate': 3.625e-05, 'epoch': 0.55}


                                                     
 16%|█▌        | 124/800 [1:00:16<1:28:46,  7.88s/it]

{'loss': 0.6329, 'learning_rate': 3.59375e-05, 'epoch': 0.56}


                                                     
 16%|█▌        | 124/800 [1:00:25<1:28:46,  7.88s/it]

{'loss': 0.6015, 'learning_rate': 3.5625000000000005e-05, 'epoch': 0.57}


                                                     
 16%|█▌        | 124/800 [1:00:35<1:28:46,  7.88s/it]

{'loss': 0.5338, 'learning_rate': 3.5312500000000005e-05, 'epoch': 0.59}


                                                     
 16%|█▌        | 124/800 [1:00:46<1:28:46,  7.88s/it]

{'loss': 0.6918, 'learning_rate': 3.5e-05, 'epoch': 0.6}


                                                     
 16%|█▌        | 124/800 [1:00:55<1:28:46,  7.88s/it]

{'loss': 0.5852, 'learning_rate': 3.46875e-05, 'epoch': 0.61}


                                                     
 16%|█▌        | 124/800 [1:01:05<1:28:46,  7.88s/it]

{'loss': 0.6365, 'learning_rate': 3.4375e-05, 'epoch': 0.62}


                                                     
 16%|█▌        | 124/800 [1:01:15<1:28:46,  7.88s/it]

{'loss': 0.6172, 'learning_rate': 3.40625e-05, 'epoch': 0.64}


                                                     
 16%|█▌        | 124/800 [1:01:25<1:28:46,  7.88s/it]

{'loss': 0.5225, 'learning_rate': 3.375000000000001e-05, 'epoch': 0.65}


                                                     
 16%|█▌        | 124/800 [1:01:35<1:28:46,  7.88s/it]

{'loss': 0.4876, 'learning_rate': 3.34375e-05, 'epoch': 0.66}


                                                     
 16%|█▌        | 124/800 [1:01:45<1:28:46,  7.88s/it]

{'loss': 0.5132, 'learning_rate': 3.3125e-05, 'epoch': 0.68}


                                                     
 16%|█▌        | 124/800 [1:01:55<1:28:46,  7.88s/it]

{'loss': 0.5585, 'learning_rate': 3.2812500000000005e-05, 'epoch': 0.69}


                                                     
 16%|█▌        | 124/800 [1:02:04<1:28:46,  7.88s/it]

{'loss': 1.0942, 'learning_rate': 3.2500000000000004e-05, 'epoch': 0.7}


                                                     
 16%|█▌        | 124/800 [1:02:13<1:28:46,  7.88s/it]

{'loss': 0.5168, 'learning_rate': 3.21875e-05, 'epoch': 0.71}


                                                     
 16%|█▌        | 124/800 [1:02:22<1:28:46,  7.88s/it]

{'loss': 0.6261, 'learning_rate': 3.1875e-05, 'epoch': 0.72}


                                                     
 16%|█▌        | 124/800 [1:02:31<1:28:46,  7.88s/it]

{'loss': 0.5373, 'learning_rate': 3.15625e-05, 'epoch': 0.74}


                                                     
 16%|█▌        | 124/800 [1:02:41<1:28:46,  7.88s/it]

{'loss': 0.734, 'learning_rate': 3.125e-05, 'epoch': 0.75}


                                                     
 16%|█▌        | 124/800 [1:02:51<1:28:46,  7.88s/it]

{'loss': 0.9185, 'learning_rate': 3.09375e-05, 'epoch': 0.76}


                                                     
 16%|█▌        | 124/800 [1:03:03<1:28:46,  7.88s/it]

{'loss': 0.8238, 'learning_rate': 3.0625000000000006e-05, 'epoch': 0.78}


                                                     
 16%|█▌        | 124/800 [1:03:15<1:28:46,  7.88s/it]

{'loss': 0.5832, 'learning_rate': 3.0312499999999998e-05, 'epoch': 0.79}


                                                     
 16%|█▌        | 124/800 [1:03:26<1:28:46,  7.88s/it]

{'loss': 0.676, 'learning_rate': 3e-05, 'epoch': 0.8}


                                                     
 16%|█▌        | 124/800 [1:03:37<1:28:46,  7.88s/it]

{'loss': 0.4524, 'learning_rate': 2.96875e-05, 'epoch': 0.81}


                                                     
 16%|█▌        | 124/800 [1:03:49<1:28:46,  7.88s/it]

{'loss': 0.5444, 'learning_rate': 2.9375000000000003e-05, 'epoch': 0.82}


                                                     
 16%|█▌        | 124/800 [1:04:00<1:28:46,  7.88s/it]

{'loss': 0.4986, 'learning_rate': 2.9062500000000005e-05, 'epoch': 0.84}


                                                     
 16%|█▌        | 124/800 [1:04:12<1:28:46,  7.88s/it]

{'loss': 0.6724, 'learning_rate': 2.8749999999999997e-05, 'epoch': 0.85}


                                                     
 16%|█▌        | 124/800 [1:04:24<1:28:46,  7.88s/it]

{'loss': 0.264, 'learning_rate': 2.84375e-05, 'epoch': 0.86}


                                                     
 16%|█▌        | 124/800 [1:04:35<1:28:46,  7.88s/it]

{'loss': 0.4163, 'learning_rate': 2.8125000000000003e-05, 'epoch': 0.88}


                                                     
 16%|█▌        | 124/800 [1:04:47<1:28:46,  7.88s/it]

{'loss': 0.8597, 'learning_rate': 2.7812500000000002e-05, 'epoch': 0.89}


                                                     
 16%|█▌        | 124/800 [1:04:59<1:28:46,  7.88s/it]

{'loss': 0.5544, 'learning_rate': 2.7500000000000004e-05, 'epoch': 0.9}


                                                     
 16%|█▌        | 124/800 [1:05:10<1:28:46,  7.88s/it]

{'loss': 0.5256, 'learning_rate': 2.71875e-05, 'epoch': 0.91}


                                                     
 16%|█▌        | 124/800 [1:05:22<1:28:46,  7.88s/it]

{'loss': 0.719, 'learning_rate': 2.6875e-05, 'epoch': 0.93}


                                                     
 16%|█▌        | 124/800 [1:05:33<1:28:46,  7.88s/it]

{'loss': 0.3417, 'learning_rate': 2.6562500000000002e-05, 'epoch': 0.94}


                                                     
 16%|█▌        | 124/800 [1:05:44<1:28:46,  7.88s/it]

{'loss': 0.5647, 'learning_rate': 2.625e-05, 'epoch': 0.95}


                                                     
 16%|█▌        | 124/800 [1:05:56<1:28:46,  7.88s/it]

{'loss': 0.6001, 'learning_rate': 2.5937500000000004e-05, 'epoch': 0.96}


                                                     
 16%|█▌        | 124/800 [1:06:07<1:28:46,  7.88s/it]

{'loss': 0.5421, 'learning_rate': 2.5625e-05, 'epoch': 0.97}


                                                     
 16%|█▌        | 124/800 [1:06:19<1:28:46,  7.88s/it]

{'loss': 0.4268, 'learning_rate': 2.53125e-05, 'epoch': 0.99}


                                                     
 16%|█▌        | 124/800 [1:06:29<1:28:46,  7.88s/it]

{'loss': 0.9622, 'learning_rate': 2.5e-05, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                     

[A[A                                           
 16%|█▌        | 124/800 [1:07:19<1:28:46,  7.88s/it]
[A
[A

{'eval_loss': 0.6925792098045349, 'eval_accuracy': 0.77875, 'eval_runtime': 50.1062, 'eval_samples_per_second': 15.966, 'eval_steps_per_second': 3.992, 'epoch': 1.0}


                                                     
 16%|█▌        | 124/800 [1:07:28<1:28:46,  7.88s/it]

{'loss': 0.1866, 'learning_rate': 2.4687500000000004e-05, 'epoch': 1.01}


                                                     
 16%|█▌        | 124/800 [1:07:37<1:28:46,  7.88s/it]

{'loss': 0.2208, 'learning_rate': 2.4375e-05, 'epoch': 1.02}


                                                     
 16%|█▌        | 124/800 [1:07:46<1:28:46,  7.88s/it]

{'loss': 0.6386, 'learning_rate': 2.4062500000000002e-05, 'epoch': 1.04}


                                                     
 16%|█▌        | 124/800 [1:07:56<1:28:46,  7.88s/it]

{'loss': 0.5651, 'learning_rate': 2.375e-05, 'epoch': 1.05}


                                                     
 16%|█▌        | 124/800 [1:08:05<1:28:46,  7.88s/it]

{'loss': 0.6691, 'learning_rate': 2.34375e-05, 'epoch': 1.06}


                                                     
 16%|█▌        | 124/800 [1:08:14<1:28:46,  7.88s/it]

{'loss': 0.3993, 'learning_rate': 2.3125000000000003e-05, 'epoch': 1.07}


                                                     
 16%|█▌        | 124/800 [1:08:23<1:28:46,  7.88s/it]

{'loss': 0.4672, 'learning_rate': 2.28125e-05, 'epoch': 1.09}


                                                     
 16%|█▌        | 124/800 [1:08:32<1:28:46,  7.88s/it]

{'loss': 0.605, 'learning_rate': 2.25e-05, 'epoch': 1.1}


                                                     
 16%|█▌        | 124/800 [1:08:41<1:28:46,  7.88s/it]

{'loss': 0.3332, 'learning_rate': 2.21875e-05, 'epoch': 1.11}


                                                     
 16%|█▌        | 124/800 [1:08:51<1:28:46,  7.88s/it]

{'loss': 0.3247, 'learning_rate': 2.1875e-05, 'epoch': 1.12}


                                                     
 16%|█▌        | 124/800 [1:09:00<1:28:46,  7.88s/it]

{'loss': 0.153, 'learning_rate': 2.1562500000000002e-05, 'epoch': 1.14}


                                                     
 16%|█▌        | 124/800 [1:09:08<1:28:46,  7.88s/it]

{'loss': 0.5182, 'learning_rate': 2.125e-05, 'epoch': 1.15}


                                                     
 16%|█▌        | 124/800 [1:09:18<1:28:46,  7.88s/it]

{'loss': 0.8589, 'learning_rate': 2.09375e-05, 'epoch': 1.16}


                                                     
 16%|█▌        | 124/800 [1:09:30<1:28:46,  7.88s/it]

{'loss': 0.405, 'learning_rate': 2.0625e-05, 'epoch': 1.18}


                                                     
 16%|█▌        | 124/800 [1:09:42<1:28:46,  7.88s/it]

{'loss': 0.6322, 'learning_rate': 2.0312500000000002e-05, 'epoch': 1.19}


                                                     
 16%|█▌        | 124/800 [1:09:54<1:28:46,  7.88s/it]

{'loss': 0.7311, 'learning_rate': 2e-05, 'epoch': 1.2}


                                                     
 16%|█▌        | 124/800 [1:10:06<1:28:46,  7.88s/it]

{'loss': 0.5217, 'learning_rate': 1.96875e-05, 'epoch': 1.21}


                                                     
 16%|█▌        | 124/800 [1:10:17<1:28:46,  7.88s/it]

{'loss': 0.2812, 'learning_rate': 1.9375e-05, 'epoch': 1.23}


                                                     
 16%|█▌        | 124/800 [1:10:29<1:28:46,  7.88s/it]

{'loss': 0.3504, 'learning_rate': 1.90625e-05, 'epoch': 1.24}


                                                     
 16%|█▌        | 124/800 [1:10:41<1:28:46,  7.88s/it]

{'loss': 0.2746, 'learning_rate': 1.8750000000000002e-05, 'epoch': 1.25}


                                                     
 16%|█▌        | 124/800 [1:10:53<1:28:46,  7.88s/it]

{'loss': 0.1044, 'learning_rate': 1.84375e-05, 'epoch': 1.26}


                                                     
 16%|█▌        | 124/800 [1:11:05<1:28:46,  7.88s/it]

{'loss': 0.4267, 'learning_rate': 1.8125e-05, 'epoch': 1.27}


                                                     
 16%|█▌        | 124/800 [1:11:17<1:28:46,  7.88s/it]

{'loss': 0.3247, 'learning_rate': 1.7812500000000003e-05, 'epoch': 1.29}


                                                     
 16%|█▌        | 124/800 [1:11:29<1:28:46,  7.88s/it]

{'loss': 0.475, 'learning_rate': 1.75e-05, 'epoch': 1.3}


                                                     
 16%|█▌        | 124/800 [1:11:41<1:28:46,  7.88s/it]

{'loss': 0.6243, 'learning_rate': 1.71875e-05, 'epoch': 1.31}


                                                     
 16%|█▌        | 124/800 [1:11:53<1:28:46,  7.88s/it]

{'loss': 0.327, 'learning_rate': 1.6875000000000004e-05, 'epoch': 1.32}


                                                     
 16%|█▌        | 124/800 [1:12:05<1:28:46,  7.88s/it]

{'loss': 0.235, 'learning_rate': 1.65625e-05, 'epoch': 1.34}


                                                     
 16%|█▌        | 124/800 [1:12:17<1:28:46,  7.88s/it]

{'loss': 0.3182, 'learning_rate': 1.6250000000000002e-05, 'epoch': 1.35}


                                                     
 16%|█▌        | 124/800 [1:12:29<1:28:46,  7.88s/it]

{'loss': 0.3631, 'learning_rate': 1.59375e-05, 'epoch': 1.36}


                                                     
 16%|█▌        | 124/800 [1:12:40<1:28:46,  7.88s/it]

{'loss': 0.3104, 'learning_rate': 1.5625e-05, 'epoch': 1.38}


                                                     
 16%|█▌        | 124/800 [1:12:52<1:28:46,  7.88s/it]

{'loss': 0.5867, 'learning_rate': 1.5312500000000003e-05, 'epoch': 1.39}


                                                     
 16%|█▌        | 124/800 [1:13:04<1:28:46,  7.88s/it]

{'loss': 0.4737, 'learning_rate': 1.5e-05, 'epoch': 1.4}


                                                     
 16%|█▌        | 124/800 [1:13:16<1:28:46,  7.88s/it]

{'loss': 0.257, 'learning_rate': 1.4687500000000001e-05, 'epoch': 1.41}


                                                     
 16%|█▌        | 124/800 [1:13:28<1:28:46,  7.88s/it]

{'loss': 0.3026, 'learning_rate': 1.4374999999999999e-05, 'epoch': 1.43}


                                                     
 16%|█▌        | 124/800 [1:13:40<1:28:46,  7.88s/it]

{'loss': 0.0586, 'learning_rate': 1.4062500000000001e-05, 'epoch': 1.44}


                                                     
 16%|█▌        | 124/800 [1:13:52<1:28:46,  7.88s/it]

{'loss': 0.242, 'learning_rate': 1.3750000000000002e-05, 'epoch': 1.45}


                                                     
 16%|█▌        | 124/800 [1:14:03<1:28:46,  7.88s/it]

{'loss': 0.5873, 'learning_rate': 1.34375e-05, 'epoch': 1.46}


                                                     
 16%|█▌        | 124/800 [1:14:16<1:28:46,  7.88s/it]

{'loss': 0.4923, 'learning_rate': 1.3125e-05, 'epoch': 1.48}


                                                     
 16%|█▌        | 124/800 [1:14:27<1:28:46,  7.88s/it]

{'loss': 0.3872, 'learning_rate': 1.28125e-05, 'epoch': 1.49}


                                                     
 16%|█▌        | 124/800 [1:14:39<1:28:46,  7.88s/it]

{'loss': 0.6532, 'learning_rate': 1.25e-05, 'epoch': 1.5}


                                                     
 16%|█▌        | 124/800 [1:14:51<1:28:46,  7.88s/it]

{'loss': 0.2661, 'learning_rate': 1.21875e-05, 'epoch': 1.51}


                                                     
 16%|█▌        | 124/800 [1:15:03<1:28:46,  7.88s/it]

{'loss': 0.4156, 'learning_rate': 1.1875e-05, 'epoch': 1.52}


                                                     
 16%|█▌        | 124/800 [1:15:15<1:28:46,  7.88s/it]

{'loss': 0.3807, 'learning_rate': 1.1562500000000002e-05, 'epoch': 1.54}


                                                     
 16%|█▌        | 124/800 [1:15:27<1:28:46,  7.88s/it]

{'loss': 0.1825, 'learning_rate': 1.125e-05, 'epoch': 1.55}


                                                     
 16%|█▌        | 124/800 [1:15:38<1:28:46,  7.88s/it]

{'loss': 0.2172, 'learning_rate': 1.09375e-05, 'epoch': 1.56}


                                                     
 16%|█▌        | 124/800 [1:15:50<1:28:46,  7.88s/it]

{'loss': 0.209, 'learning_rate': 1.0625e-05, 'epoch': 1.57}


                                                     
 16%|█▌        | 124/800 [1:16:02<1:28:46,  7.88s/it]

{'loss': 0.4066, 'learning_rate': 1.03125e-05, 'epoch': 1.59}


                                                     
 16%|█▌        | 124/800 [1:16:14<1:28:46,  7.88s/it]

{'loss': 0.6688, 'learning_rate': 1e-05, 'epoch': 1.6}


                                                     
 16%|█▌        | 124/800 [1:16:26<1:28:46,  7.88s/it]

{'loss': 0.3604, 'learning_rate': 9.6875e-06, 'epoch': 1.61}


                                                     
 16%|█▌        | 124/800 [1:16:37<1:28:46,  7.88s/it]

{'loss': 0.1918, 'learning_rate': 9.375000000000001e-06, 'epoch': 1.62}


                                                     
 16%|█▌        | 124/800 [1:16:49<1:28:46,  7.88s/it]

{'loss': 0.2957, 'learning_rate': 9.0625e-06, 'epoch': 1.64}


                                                     
 16%|█▌        | 124/800 [1:17:01<1:28:46,  7.88s/it]

{'loss': 0.1856, 'learning_rate': 8.75e-06, 'epoch': 1.65}


                                                     
 16%|█▌        | 124/800 [1:17:13<1:28:46,  7.88s/it]

{'loss': 0.4526, 'learning_rate': 8.437500000000002e-06, 'epoch': 1.66}


                                                     
 16%|█▌        | 124/800 [1:17:25<1:28:46,  7.88s/it]

{'loss': 0.4037, 'learning_rate': 8.125000000000001e-06, 'epoch': 1.68}


                                                     
 16%|█▌        | 124/800 [1:17:37<1:28:46,  7.88s/it]

{'loss': 0.44, 'learning_rate': 7.8125e-06, 'epoch': 1.69}


                                                     
 16%|█▌        | 124/800 [1:17:49<1:28:46,  7.88s/it]

{'loss': 0.5121, 'learning_rate': 7.5e-06, 'epoch': 1.7}


                                                     
 16%|█▌        | 124/800 [1:18:00<1:28:46,  7.88s/it]

{'loss': 0.1664, 'learning_rate': 7.187499999999999e-06, 'epoch': 1.71}


                                                     
 16%|█▌        | 124/800 [1:18:12<1:28:46,  7.88s/it]

{'loss': 0.2296, 'learning_rate': 6.875000000000001e-06, 'epoch': 1.73}


                                                     
 16%|█▌        | 124/800 [1:18:24<1:28:46,  7.88s/it]

{'loss': 0.2952, 'learning_rate': 6.5625e-06, 'epoch': 1.74}


                                                     
 16%|█▌        | 124/800 [1:18:36<1:28:46,  7.88s/it]

{'loss': 0.3449, 'learning_rate': 6.25e-06, 'epoch': 1.75}


                                                     
 16%|█▌        | 124/800 [1:18:47<1:28:46,  7.88s/it]

{'loss': 0.4293, 'learning_rate': 5.9375e-06, 'epoch': 1.76}


                                                     
 16%|█▌        | 124/800 [1:18:59<1:28:46,  7.88s/it]

{'loss': 0.1872, 'learning_rate': 5.625e-06, 'epoch': 1.77}


                                                     
 16%|█▌        | 124/800 [1:19:11<1:28:46,  7.88s/it]

{'loss': 0.4643, 'learning_rate': 5.3125e-06, 'epoch': 1.79}


                                                     
 16%|█▌        | 124/800 [1:19:22<1:28:46,  7.88s/it]

{'loss': 0.6485, 'learning_rate': 5e-06, 'epoch': 1.8}


                                                     
 16%|█▌        | 124/800 [1:19:34<1:28:46,  7.88s/it]

{'loss': 0.1159, 'learning_rate': 4.6875000000000004e-06, 'epoch': 1.81}


                                                     
 16%|█▌        | 124/800 [1:19:46<1:28:46,  7.88s/it]

{'loss': 0.3913, 'learning_rate': 4.375e-06, 'epoch': 1.82}


                                                     
 16%|█▌        | 124/800 [1:19:58<1:28:46,  7.88s/it]

{'loss': 0.1918, 'learning_rate': 4.0625000000000005e-06, 'epoch': 1.84}


                                                     
 16%|█▌        | 124/800 [1:20:07<1:28:46,  7.88s/it]

{'loss': 0.2998, 'learning_rate': 3.75e-06, 'epoch': 1.85}


                                                     
 16%|█▌        | 124/800 [1:20:16<1:28:46,  7.88s/it]

{'loss': 0.1857, 'learning_rate': 3.4375000000000005e-06, 'epoch': 1.86}


                                                     
 16%|█▌        | 124/800 [1:20:25<1:28:46,  7.88s/it]

{'loss': 0.2649, 'learning_rate': 3.125e-06, 'epoch': 1.88}


                                                     
 16%|█▌        | 124/800 [1:20:34<1:28:46,  7.88s/it]

{'loss': 0.6797, 'learning_rate': 2.8125e-06, 'epoch': 1.89}


                                                     
 16%|█▌        | 124/800 [1:20:46<1:28:46,  7.88s/it]

{'loss': 0.3305, 'learning_rate': 2.5e-06, 'epoch': 1.9}


                                                     
 16%|█▌        | 124/800 [1:20:58<1:28:46,  7.88s/it]

{'loss': 0.4049, 'learning_rate': 2.1875e-06, 'epoch': 1.91}


                                                     
 16%|█▌        | 124/800 [1:21:10<1:28:46,  7.88s/it]

{'loss': 0.3921, 'learning_rate': 1.875e-06, 'epoch': 1.93}


                                                     
 16%|█▌        | 124/800 [1:21:22<1:28:46,  7.88s/it]

{'loss': 0.2064, 'learning_rate': 1.5625e-06, 'epoch': 1.94}


                                                     
 16%|█▌        | 124/800 [1:21:34<1:28:46,  7.88s/it]

{'loss': 0.5078, 'learning_rate': 1.25e-06, 'epoch': 1.95}


                                                     
 16%|█▌        | 124/800 [1:21:45<1:28:46,  7.88s/it]

{'loss': 0.3842, 'learning_rate': 9.375e-07, 'epoch': 1.96}


                                                     
 16%|█▌        | 124/800 [1:21:57<1:28:46,  7.88s/it]

{'loss': 0.3559, 'learning_rate': 6.25e-07, 'epoch': 1.98}


                                                     
 16%|█▌        | 124/800 [1:22:09<1:28:46,  7.88s/it]

{'loss': 0.3508, 'learning_rate': 3.125e-07, 'epoch': 1.99}


                                                     
 16%|█▌        | 124/800 [1:22:21<1:28:46,  7.88s/it]

{'loss': 0.3122, 'learning_rate': 0.0, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                     

[A[A                                           
 16%|█▌        | 124/800 [1:23:22<1:28:46,  7.88s/it]
[A
                                        

{'eval_loss': 0.5407305359840393, 'eval_accuracy': 0.845, 'eval_runtime': 61.0636, 'eval_samples_per_second': 13.101, 'eval_steps_per_second': 3.275, 'epoch': 2.0}
{'train_runtime': 1866.4234, 'train_samples_per_second': 3.429, 'train_steps_per_second': 0.857, 'train_loss': 0.5134868958964944, 'epoch': 2.0}





TrainOutput(global_step=1600, training_loss=0.5134868958964944, metrics={'train_runtime': 1866.4234, 'train_samples_per_second': 3.429, 'train_steps_per_second': 0.857, 'train_loss': 0.5134868958964944, 'epoch': 2.0})

In [35]:
# Evaluate BERT model on test set
metrics = trainer.evaluate()
print(f"BERT Classification Accuracy: {metrics['eval_accuracy']:.4f}")


100%|██████████| 200/200 [00:48<00:00,  4.16it/s]

BERT Classification Accuracy: 0.8450



