### Importing Libraries
This block imports the necessary libraries and modules for handling datasets, tokenization, and model training using Hugging Face's Transformers library.

In [None]:
import pandas as pd  # For data manipulation
import torch  # For tensor operations
from datasets import Dataset  # For managing datasets in Hugging Face format
from transformers import (  # Hugging Face library for NLP
    BartTokenizerFast,  # Tokenizer for BART
    BartForConditionalGeneration,  # Pretrained BART model for sequence generation
    TrainingArguments,  # Configuration for training
    Trainer,  # Training wrapper
    DataCollatorForSeq2Seq,  # Prepares data batches for sequence-to-sequence tasks
)


  from .autonotebook import tqdm as notebook_tqdm


### Data Loading and Preprocessing
This block loads the dataset, prints a preview and the per-column missing-value counts, casts the review columns to strings, and adds a new input column that prepends a task-specific prefix to each original review.

In [None]:
# Load the dataset
data = pd.read_csv('finaldataset.csv')  # Read dataset from CSV file

# Display the first few rows to verify the data
print(data.head())  # Check the structure of the dataset

# Check for any missing or malformed entries
print("Checking for missing values:")
print(data.isnull().sum())  # Count missing values in each column

# Drop rows with a missing review BEFORE casting to str: astype(str) would
# otherwise turn NaN into the literal text 'nan' and train on it as a real example
review_columns = ['original_review', 'inverted_review']
data = data.dropna(subset=review_columns).reset_index(drop=True)

# Ensure all reviews are strings
data['original_review'] = data['original_review'].astype(str)  # Convert to string type
data['inverted_review'] = data['inverted_review'].astype(str)  # Convert to string type

# Add a task-specific prefix to the original reviews
data['input_text'] = 'invert polarity: ' + data['original_review']  # Add task-specific prompt

# Convert the DataFrame to a Hugging Face Dataset
# preserve_index=False keeps the pandas index from being stored as an extra column
hf_dataset = Dataset.from_pandas(
    data[['input_text', 'inverted_review']],  # Keep only relevant columns
    preserve_index=False,
)


                                     original_review  original_label  \
0  Great CD After buying this CD for its hype, I ...               1   
1       Blends in seamlessly with my car’s interior.               1   
2  Buy the soundtrack instead. Joseph lovers; you...               0   
3  So cute! This hat is honestly one of the very ...               1   
4  wish i had gotten this sooner! ive got a lamp ...               1   

                                     inverted_review  inverted_label  
0  Terrible CD After buying this CD due to its hy...               0  
1    Stands out awkwardly against my car’s interior.               0  
2  This film is a delightful experience that brin...               1  
3  So ugly! This hat is honestly one of the very ...               0  
4  I regret getting this at all! The lamp in the ...               0  
Checking for missing values:
original_review    0
original_label     0
inverted_review    0
inverted_label     0
dtype: int64


### Splitting the Dataset
This block splits the dataset into training and validation sets for model training and evaluation.

In [None]:
# Split the dataset into training and validation sets
# A fixed seed makes the shuffle reproducible, so every run validates on the same examples
# NOTE: despite its name, `tokenized_dataset` still holds raw text at this point
tokenized_dataset = hf_dataset.train_test_split(test_size=0.1, seed=42)  # Allocate 10% of the data for validation
train_dataset = tokenized_dataset['train']  # Training subset
eval_dataset = tokenized_dataset['test']  # Validation subset

### Loading Pretrained Model and Tokenizer
This block loads the pretrained BART model and its tokenizer for sequence-to-sequence tasks.

In [None]:
# The tokenizer and the model are both loaded from the same checkpoint id
bart_checkpoint = 'facebook/bart-base'

# Tokenizer for the checkpoint
tokenizer = BartTokenizerFast.from_pretrained(bart_checkpoint)

# Pretrained BART weights with the conditional-generation head
model = BartForConditionalGeneration.from_pretrained(bart_checkpoint)

### Preprocessing Function
Defines a function to tokenize input and output texts and prepare them for the model, handling padding and special tokens appropriately.

In [None]:
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of source/target review pairs for seq2seq training.

    Relies on the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with ``'input_text'`` (prefixed source reviews) and
            ``'inverted_review'`` (target reviews) entries, as passed by
            ``Dataset.map(..., batched=True)``.
        max_length: Length, in tokens, that every source and target sequence is
            padded or truncated to. Defaults to 128.

    Returns:
        The same ``examples`` object with three entries added: ``'input_ids'`` and
        ``'attention_mask'`` for the source text, and ``'labels'`` for the target
        text with padding positions set to -100.
    """
    # Source and target are tokenized with identical settings
    tokenize_kwargs = dict(padding='max_length', truncation=True, max_length=max_length)
    inputs = tokenizer(examples['input_text'], **tokenize_kwargs)  # Tokenize input
    outputs = tokenizer(examples['inverted_review'], **tokenize_kwargs)  # Tokenize output

    examples['input_ids'] = inputs.input_ids  # Save input token IDs
    examples['attention_mask'] = inputs.attention_mask  # Save attention mask

    # Replace padding token IDs in labels with -100 to ignore them during loss computation
    pad_id = tokenizer.pad_token_id
    examples['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in outputs.input_ids
    ]
    return examples

### Applying Preprocessing
Maps the preprocessing function to the datasets and removes unnecessary columns.

In [None]:
# Raw-text columns; dropped once their tokenized form has been added
text_columns = ['input_text', 'inverted_review']

# Map from the untouched splits in `tokenized_dataset` instead of overwriting the
# inputs in place, so re-running this cell does not fail on columns that a previous
# run already removed
train_dataset = tokenized_dataset['train'].map(preprocess_function, batched=True, remove_columns=text_columns)  # Preprocess training data
eval_dataset = tokenized_dataset['test'].map(preprocess_function, batched=True, remove_columns=text_columns)  # Preprocess validation data

Map: 100%|██████████| 6507/6507 [00:00<00:00, 10688.16 examples/s]
Map: 100%|██████████| 724/724 [00:00<00:00, 10220.99 examples/s]


### Formatting Datasets
This block ensures the datasets are formatted as PyTorch tensors with only relevant columns for training.

In [None]:
# Only these columns are exposed, as torch tensors, when the datasets are indexed
columns = ['input_ids', 'attention_mask', 'labels']

# Apply the same output format to the training split, then the evaluation split
for split_ds in (train_dataset, eval_dataset):
    split_ds.set_format(type='torch', columns=columns)

### Training Arguments
Defines training hyperparameters and configurations for the Hugging Face Trainer.

In [None]:
training_args = TrainingArguments(
    output_dir='./polarity_inversion_model',  # Directory to save model outputs
    num_train_epochs=10,  # Number of training epochs
    per_device_train_batch_size=4,  # Batch size for training
    per_device_eval_batch_size=4,  # Batch size for evaluation
    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
    save_strategy='epoch',  # Save model checkpoints at the end of each epoch
    learning_rate=3e-5,  # Learning rate
    warmup_steps=100,  # Warmup steps for learning rate scheduler
    logging_dir='./logs',  # Directory for logging
    logging_steps=50,  # Log every 50 steps
    save_total_limit=2,  # Maximum number of checkpoints to save
    # Keep the best epoch rather than the last one: in the recorded run the validation
    # loss rose from epoch 3 to epoch 4 while the training loss kept falling.
    # This requires the save and evaluation strategies to match (both 'epoch' here).
    load_best_model_at_end=True,  # Reload the best checkpoint when training finishes
    metric_for_best_model='eval_loss',  # Rank checkpoints by validation loss
    greater_is_better=False,  # Lower validation loss is better
    fp16=torch.cuda.is_available(),  # Use mixed precision when a CUDA GPU is available
    report_to="none",  # Disable reporting to external systems
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Data Collator
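Creates a data collator that assembles batches during training and evaluation. The collator can pad each batch dynamically, but because the preprocessing function already pads every sequence to `max_length=128`, all examples reach it at the same length.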

In [None]:
# Seq2seq batch collator, given the tokenizer and the model being fine-tuned
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

### Trainer Initialization
Initializes the Hugging Face Trainer for managing the training and evaluation loops.

In [None]:
# Collect the objects built in the earlier cells, then pass them to the Trainer together
trainer_components = {
    'model': model,                  # network being fine-tuned
    'args': training_args,           # hyperparameters and output locations
    'train_dataset': train_dataset,  # tokenized training split
    'eval_dataset': eval_dataset,    # tokenized validation split
    'data_collator': data_collator,  # batch assembly
}
trainer = Trainer(**trainer_components)

### Model Training and Saving
This block trains the model and saves the fine-tuned model and tokenizer to the specified directory.

In [None]:
# Run the fine-tuning loop
trainer.train()

# Write the model weights and the tokenizer files into the same directory
save_dir = './polarity_inversion_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  0%|          | 51/16270 [00:04<25:56, 10.42it/s]

{'loss': 2.7809, 'grad_norm': 11.615219116210938, 'learning_rate': 1.4099999999999999e-05, 'epoch': 0.03}


  1%|          | 101/16270 [00:09<26:08, 10.31it/s]

{'loss': 2.0501, 'grad_norm': 8.195572853088379, 'learning_rate': 2.91e-05, 'epoch': 0.06}


  1%|          | 151/16270 [00:14<25:57, 10.35it/s]

{'loss': 1.8674, 'grad_norm': 8.171292304992676, 'learning_rate': 2.991280148423006e-05, 'epoch': 0.09}


  1%|          | 201/16270 [00:19<25:52, 10.35it/s]

{'loss': 1.7499, 'grad_norm': 6.840893745422363, 'learning_rate': 2.9820037105751393e-05, 'epoch': 0.12}


  2%|▏         | 251/16270 [00:24<25:47, 10.35it/s]

{'loss': 1.6526, 'grad_norm': 6.9245924949646, 'learning_rate': 2.972727272727273e-05, 'epoch': 0.15}


  2%|▏         | 301/16270 [00:29<25:45, 10.33it/s]

{'loss': 1.6726, 'grad_norm': 7.6575493812561035, 'learning_rate': 2.9634508348794064e-05, 'epoch': 0.18}


  2%|▏         | 351/16270 [00:33<25:37, 10.35it/s]

{'loss': 1.6755, 'grad_norm': 6.9001054763793945, 'learning_rate': 2.9541743970315397e-05, 'epoch': 0.22}


  2%|▏         | 401/16270 [00:38<25:32, 10.35it/s]

{'loss': 1.6279, 'grad_norm': 9.884418487548828, 'learning_rate': 2.9448979591836735e-05, 'epoch': 0.25}


  3%|▎         | 451/16270 [00:43<25:30, 10.34it/s]

{'loss': 1.6095, 'grad_norm': 6.422761917114258, 'learning_rate': 2.9356215213358068e-05, 'epoch': 0.28}


  3%|▎         | 501/16270 [00:48<25:31, 10.29it/s]

{'loss': 1.6444, 'grad_norm': 6.205721855163574, 'learning_rate': 2.926345083487941e-05, 'epoch': 0.31}


  3%|▎         | 551/16270 [00:53<25:18, 10.35it/s]

{'loss': 1.5335, 'grad_norm': 7.011818885803223, 'learning_rate': 2.9170686456400742e-05, 'epoch': 0.34}


  4%|▎         | 601/16270 [00:57<25:27, 10.26it/s]

{'loss': 1.5394, 'grad_norm': 5.844351768493652, 'learning_rate': 2.907792207792208e-05, 'epoch': 0.37}


  4%|▍         | 651/16270 [01:02<25:28, 10.22it/s]

{'loss': 1.6021, 'grad_norm': 6.489218711853027, 'learning_rate': 2.8985157699443413e-05, 'epoch': 0.4}


  4%|▍         | 701/16270 [01:07<25:44, 10.08it/s]

{'loss': 1.5866, 'grad_norm': 5.96297550201416, 'learning_rate': 2.889239332096475e-05, 'epoch': 0.43}


  5%|▍         | 751/16270 [01:12<25:11, 10.26it/s]

{'loss': 1.4517, 'grad_norm': 6.542981147766113, 'learning_rate': 2.8799628942486084e-05, 'epoch': 0.46}


  5%|▍         | 801/16270 [01:17<25:31, 10.10it/s]

{'loss': 1.5599, 'grad_norm': 10.244729995727539, 'learning_rate': 2.870686456400742e-05, 'epoch': 0.49}


  5%|▌         | 851/16270 [01:22<25:25, 10.11it/s]

{'loss': 1.4909, 'grad_norm': 8.010496139526367, 'learning_rate': 2.8614100185528758e-05, 'epoch': 0.52}


  6%|▌         | 901/16270 [01:27<24:55, 10.28it/s]

{'loss': 1.574, 'grad_norm': 7.918569564819336, 'learning_rate': 2.8521335807050095e-05, 'epoch': 0.55}


  6%|▌         | 951/16270 [01:32<25:05, 10.18it/s]

{'loss': 1.5256, 'grad_norm': 8.340553283691406, 'learning_rate': 2.842857142857143e-05, 'epoch': 0.58}


  6%|▌         | 1001/16270 [01:37<24:31, 10.38it/s]

{'loss': 1.5253, 'grad_norm': 7.360565185546875, 'learning_rate': 2.8337662337662336e-05, 'epoch': 0.61}


  6%|▋         | 1051/16270 [01:41<24:20, 10.42it/s]

{'loss': 1.4601, 'grad_norm': 5.696600437164307, 'learning_rate': 2.8244897959183673e-05, 'epoch': 0.65}


  7%|▋         | 1101/16270 [01:46<24:24, 10.36it/s]

{'loss': 1.4681, 'grad_norm': 8.046162605285645, 'learning_rate': 2.8152133580705007e-05, 'epoch': 0.68}


  7%|▋         | 1151/16270 [01:51<24:49, 10.15it/s]

{'loss': 1.5279, 'grad_norm': 9.427770614624023, 'learning_rate': 2.8059369202226344e-05, 'epoch': 0.71}


  7%|▋         | 1201/16270 [01:56<24:39, 10.18it/s]

{'loss': 1.481, 'grad_norm': 5.57356595993042, 'learning_rate': 2.796660482374768e-05, 'epoch': 0.74}


  8%|▊         | 1251/16270 [02:01<24:39, 10.15it/s]

{'loss': 1.5445, 'grad_norm': 5.955831050872803, 'learning_rate': 2.787384044526902e-05, 'epoch': 0.77}


  8%|▊         | 1301/16270 [02:06<24:33, 10.16it/s]

{'loss': 1.5873, 'grad_norm': 6.073838233947754, 'learning_rate': 2.7781076066790352e-05, 'epoch': 0.8}


  8%|▊         | 1351/16270 [02:11<23:56, 10.39it/s]

{'loss': 1.454, 'grad_norm': 7.869855880737305, 'learning_rate': 2.768831168831169e-05, 'epoch': 0.83}


  9%|▊         | 1401/16270 [02:15<23:53, 10.37it/s]

{'loss': 1.4138, 'grad_norm': 5.210875034332275, 'learning_rate': 2.7595547309833023e-05, 'epoch': 0.86}


  9%|▉         | 1451/16270 [02:20<23:47, 10.38it/s]

{'loss': 1.4117, 'grad_norm': 5.534940719604492, 'learning_rate': 2.750278293135436e-05, 'epoch': 0.89}


  9%|▉         | 1501/16270 [02:25<24:04, 10.22it/s]

{'loss': 1.4147, 'grad_norm': 6.453709602355957, 'learning_rate': 2.7410018552875694e-05, 'epoch': 0.92}


 10%|▉         | 1551/16270 [02:30<24:22, 10.07it/s]

{'loss': 1.4896, 'grad_norm': 8.314969062805176, 'learning_rate': 2.7317254174397034e-05, 'epoch': 0.95}


 10%|▉         | 1601/16270 [02:35<23:46, 10.29it/s]

{'loss': 1.4491, 'grad_norm': 4.738516330718994, 'learning_rate': 2.7224489795918368e-05, 'epoch': 0.98}


                                                    


{'eval_loss': 1.2732186317443848, 'eval_runtime': 2.553, 'eval_samples_per_second': 283.585, 'eval_steps_per_second': 70.896, 'epoch': 1.0}


 10%|█         | 1651/16270 [02:43<26:29,  9.20it/s]  

{'loss': 1.4132, 'grad_norm': 6.794857501983643, 'learning_rate': 2.7131725417439705e-05, 'epoch': 1.01}


 10%|█         | 1701/16270 [02:48<23:51, 10.18it/s]

{'loss': 1.2214, 'grad_norm': 4.573091506958008, 'learning_rate': 2.703896103896104e-05, 'epoch': 1.04}


 11%|█         | 1751/16270 [02:53<23:32, 10.28it/s]

{'loss': 1.3419, 'grad_norm': 5.140799045562744, 'learning_rate': 2.6946196660482376e-05, 'epoch': 1.08}


 11%|█         | 1801/16270 [02:58<23:11, 10.40it/s]

{'loss': 1.2118, 'grad_norm': 5.951702117919922, 'learning_rate': 2.685343228200371e-05, 'epoch': 1.11}


 11%|█▏        | 1851/16270 [03:03<23:20, 10.30it/s]

{'loss': 1.2615, 'grad_norm': 7.991751194000244, 'learning_rate': 2.6760667903525047e-05, 'epoch': 1.14}


 12%|█▏        | 1901/16270 [03:08<23:38, 10.13it/s]

{'loss': 1.295, 'grad_norm': 5.421994209289551, 'learning_rate': 2.6667903525046384e-05, 'epoch': 1.17}


 12%|█▏        | 1951/16270 [03:13<23:28, 10.16it/s]

{'loss': 1.3594, 'grad_norm': 5.780598163604736, 'learning_rate': 2.657513914656772e-05, 'epoch': 1.2}


 12%|█▏        | 2001/16270 [03:18<23:07, 10.28it/s]

{'loss': 1.2993, 'grad_norm': 4.890115737915039, 'learning_rate': 2.6482374768089054e-05, 'epoch': 1.23}


 13%|█▎        | 2051/16270 [03:22<23:16, 10.18it/s]

{'loss': 1.1947, 'grad_norm': 5.501354694366455, 'learning_rate': 2.638961038961039e-05, 'epoch': 1.26}


 13%|█▎        | 2101/16270 [03:27<22:59, 10.27it/s]

{'loss': 1.2733, 'grad_norm': 6.371389389038086, 'learning_rate': 2.6296846011131725e-05, 'epoch': 1.29}


 13%|█▎        | 2151/16270 [03:32<22:56, 10.26it/s]

{'loss': 1.2603, 'grad_norm': 8.058211326599121, 'learning_rate': 2.6204081632653062e-05, 'epoch': 1.32}


 14%|█▎        | 2201/16270 [03:37<23:14, 10.09it/s]

{'loss': 1.269, 'grad_norm': 8.441730499267578, 'learning_rate': 2.6111317254174396e-05, 'epoch': 1.35}


 14%|█▍        | 2251/16270 [03:42<22:44, 10.28it/s]

{'loss': 1.3488, 'grad_norm': 4.978793621063232, 'learning_rate': 2.6018552875695733e-05, 'epoch': 1.38}


 14%|█▍        | 2301/16270 [03:47<22:24, 10.39it/s]

{'loss': 1.3277, 'grad_norm': 6.332705974578857, 'learning_rate': 2.592578849721707e-05, 'epoch': 1.41}


 14%|█▍        | 2351/16270 [03:52<22:16, 10.41it/s]

{'loss': 1.2112, 'grad_norm': 4.723349094390869, 'learning_rate': 2.5833024118738407e-05, 'epoch': 1.44}


 15%|█▍        | 2401/16270 [03:56<22:04, 10.47it/s]

{'loss': 1.2645, 'grad_norm': 5.552082061767578, 'learning_rate': 2.574025974025974e-05, 'epoch': 1.48}


 15%|█▌        | 2451/16270 [04:01<21:58, 10.48it/s]

{'loss': 1.1713, 'grad_norm': 8.382494926452637, 'learning_rate': 2.5647495361781078e-05, 'epoch': 1.51}


 15%|█▌        | 2501/16270 [04:06<21:53, 10.49it/s]

{'loss': 1.2768, 'grad_norm': 8.636134147644043, 'learning_rate': 2.555473098330241e-05, 'epoch': 1.54}


 16%|█▌        | 2551/16270 [04:11<21:50, 10.47it/s]

{'loss': 1.2685, 'grad_norm': 10.258561134338379, 'learning_rate': 2.546196660482375e-05, 'epoch': 1.57}


 16%|█▌        | 2601/16270 [04:16<21:56, 10.38it/s]

{'loss': 1.2489, 'grad_norm': 7.039050102233887, 'learning_rate': 2.5369202226345082e-05, 'epoch': 1.6}


 16%|█▋        | 2651/16270 [04:20<21:41, 10.46it/s]

{'loss': 1.2403, 'grad_norm': 5.070517539978027, 'learning_rate': 2.5276437847866423e-05, 'epoch': 1.63}


 17%|█▋        | 2701/16270 [04:25<21:34, 10.48it/s]

{'loss': 1.3219, 'grad_norm': 6.071409225463867, 'learning_rate': 2.5183673469387757e-05, 'epoch': 1.66}


 17%|█▋        | 2751/16270 [04:30<21:28, 10.49it/s]

{'loss': 1.3265, 'grad_norm': 5.79597282409668, 'learning_rate': 2.509090909090909e-05, 'epoch': 1.69}


 17%|█▋        | 2801/16270 [04:35<21:34, 10.41it/s]

{'loss': 1.2744, 'grad_norm': 5.118382930755615, 'learning_rate': 2.4998144712430427e-05, 'epoch': 1.72}


 18%|█▊        | 2851/16270 [04:39<21:31, 10.39it/s]

{'loss': 1.2613, 'grad_norm': 6.523242950439453, 'learning_rate': 2.490538033395176e-05, 'epoch': 1.75}


 18%|█▊        | 2901/16270 [04:44<21:23, 10.42it/s]

{'loss': 1.2509, 'grad_norm': 4.6089768409729, 'learning_rate': 2.4812615955473098e-05, 'epoch': 1.78}


 18%|█▊        | 2951/16270 [04:49<21:16, 10.43it/s]

{'loss': 1.266, 'grad_norm': 5.071998596191406, 'learning_rate': 2.4719851576994432e-05, 'epoch': 1.81}


 18%|█▊        | 3001/16270 [04:54<21:06, 10.48it/s]

{'loss': 1.2738, 'grad_norm': 4.188558578491211, 'learning_rate': 2.4627087198515772e-05, 'epoch': 1.84}


 19%|█▉        | 3051/16270 [04:59<21:11, 10.40it/s]

{'loss': 1.2261, 'grad_norm': 4.084261417388916, 'learning_rate': 2.4534322820037106e-05, 'epoch': 1.87}


 19%|█▉        | 3101/16270 [05:03<21:01, 10.44it/s]

{'loss': 1.198, 'grad_norm': 4.7925262451171875, 'learning_rate': 2.4441558441558443e-05, 'epoch': 1.91}


 19%|█▉        | 3151/16270 [05:08<20:44, 10.54it/s]

{'loss': 1.2195, 'grad_norm': 4.639428615570068, 'learning_rate': 2.4348794063079777e-05, 'epoch': 1.94}


 20%|█▉        | 3201/16270 [05:13<20:47, 10.48it/s]

{'loss': 1.2164, 'grad_norm': 7.443498611450195, 'learning_rate': 2.4256029684601114e-05, 'epoch': 1.97}


 20%|█▉        | 3251/16270 [05:18<20:47, 10.44it/s]

{'loss': 1.2698, 'grad_norm': 6.0704522132873535, 'learning_rate': 2.4163265306122448e-05, 'epoch': 2.0}


                                                    
 20%|██        | 3254/16270 [05:20<20:46, 10.44it/s]

{'eval_loss': 1.2114999294281006, 'eval_runtime': 2.5021, 'eval_samples_per_second': 289.356, 'eval_steps_per_second': 72.339, 'epoch': 2.0}


 20%|██        | 3301/16270 [05:26<20:43, 10.43it/s]  

{'loss': 1.1621, 'grad_norm': 4.975903034210205, 'learning_rate': 2.4070500927643785e-05, 'epoch': 2.03}


 21%|██        | 3351/16270 [05:31<20:31, 10.49it/s]

{'loss': 1.0407, 'grad_norm': 5.034065246582031, 'learning_rate': 2.3977736549165122e-05, 'epoch': 2.06}


 21%|██        | 3401/16270 [05:36<20:30, 10.46it/s]

{'loss': 1.0745, 'grad_norm': 6.230437278747559, 'learning_rate': 2.388497217068646e-05, 'epoch': 2.09}


 21%|██        | 3451/16270 [05:40<20:22, 10.49it/s]

{'loss': 1.0461, 'grad_norm': 5.830694675445557, 'learning_rate': 2.3792207792207793e-05, 'epoch': 2.12}


 22%|██▏       | 3501/16270 [05:45<20:22, 10.44it/s]

{'loss': 1.1199, 'grad_norm': 4.017455577850342, 'learning_rate': 2.369944341372913e-05, 'epoch': 2.15}


 22%|██▏       | 3551/16270 [05:50<20:19, 10.43it/s]

{'loss': 1.0852, 'grad_norm': 4.310374736785889, 'learning_rate': 2.3606679035250463e-05, 'epoch': 2.18}


 22%|██▏       | 3601/16270 [05:55<20:12, 10.45it/s]

{'loss': 1.1135, 'grad_norm': 4.689793109893799, 'learning_rate': 2.35139146567718e-05, 'epoch': 2.21}


 22%|██▏       | 3651/16270 [05:59<20:08, 10.44it/s]

{'loss': 1.119, 'grad_norm': 4.094457626342773, 'learning_rate': 2.3421150278293134e-05, 'epoch': 2.24}


 23%|██▎       | 3701/16270 [06:04<20:13, 10.36it/s]

{'loss': 1.0361, 'grad_norm': 4.974456787109375, 'learning_rate': 2.332838589981447e-05, 'epoch': 2.27}


 23%|██▎       | 3751/16270 [06:09<20:03, 10.40it/s]

{'loss': 1.06, 'grad_norm': 5.954516410827637, 'learning_rate': 2.3235621521335808e-05, 'epoch': 2.3}


 23%|██▎       | 3801/16270 [06:14<19:58, 10.40it/s]

{'loss': 1.0861, 'grad_norm': 4.902896881103516, 'learning_rate': 2.3142857142857145e-05, 'epoch': 2.34}


 24%|██▎       | 3851/16270 [06:19<19:56, 10.38it/s]

{'loss': 1.1231, 'grad_norm': 4.864041328430176, 'learning_rate': 2.305009276437848e-05, 'epoch': 2.37}


 24%|██▍       | 3901/16270 [06:24<19:51, 10.38it/s]

{'loss': 1.1383, 'grad_norm': 6.26948881149292, 'learning_rate': 2.2957328385899816e-05, 'epoch': 2.4}


 24%|██▍       | 3951/16270 [06:28<19:46, 10.38it/s]

{'loss': 1.1148, 'grad_norm': 4.342827796936035, 'learning_rate': 2.286456400742115e-05, 'epoch': 2.43}


 25%|██▍       | 4001/16270 [06:33<19:38, 10.41it/s]

{'loss': 1.0278, 'grad_norm': 4.631886959075928, 'learning_rate': 2.2773654916512057e-05, 'epoch': 2.46}


 25%|██▍       | 4051/16270 [06:38<19:32, 10.42it/s]

{'loss': 1.1328, 'grad_norm': 5.232298851013184, 'learning_rate': 2.2680890538033398e-05, 'epoch': 2.49}


 25%|██▌       | 4101/16270 [06:43<19:25, 10.44it/s]

{'loss': 1.1304, 'grad_norm': 4.430607795715332, 'learning_rate': 2.258812615955473e-05, 'epoch': 2.52}


 26%|██▌       | 4151/16270 [06:47<19:26, 10.39it/s]

{'loss': 1.028, 'grad_norm': 4.235769271850586, 'learning_rate': 2.249536178107607e-05, 'epoch': 2.55}


 26%|██▌       | 4201/16270 [06:52<19:24, 10.37it/s]

{'loss': 1.0985, 'grad_norm': 5.463257789611816, 'learning_rate': 2.2402597402597402e-05, 'epoch': 2.58}


 26%|██▌       | 4251/16270 [06:57<19:11, 10.44it/s]

{'loss': 1.13, 'grad_norm': 5.163358688354492, 'learning_rate': 2.230983302411874e-05, 'epoch': 2.61}


 26%|██▋       | 4301/16270 [07:02<19:11, 10.39it/s]

{'loss': 1.1568, 'grad_norm': 6.605260848999023, 'learning_rate': 2.2217068645640073e-05, 'epoch': 2.64}


 27%|██▋       | 4351/16270 [07:07<19:08, 10.37it/s]

{'loss': 1.0879, 'grad_norm': 8.210248947143555, 'learning_rate': 2.212430426716141e-05, 'epoch': 2.67}


 27%|██▋       | 4401/16270 [07:12<19:04, 10.37it/s]

{'loss': 1.1103, 'grad_norm': 3.543281316757202, 'learning_rate': 2.2031539888682747e-05, 'epoch': 2.7}


 27%|██▋       | 4451/16270 [07:16<19:05, 10.32it/s]

{'loss': 1.0681, 'grad_norm': 6.303553581237793, 'learning_rate': 2.1938775510204084e-05, 'epoch': 2.74}


 28%|██▊       | 4501/16270 [07:21<18:48, 10.43it/s]

{'loss': 1.0766, 'grad_norm': 8.303438186645508, 'learning_rate': 2.1846011131725418e-05, 'epoch': 2.77}


 28%|██▊       | 4551/16270 [07:26<18:40, 10.45it/s]

{'loss': 1.0211, 'grad_norm': 5.263467311859131, 'learning_rate': 2.1753246753246755e-05, 'epoch': 2.8}


 28%|██▊       | 4601/16270 [07:31<18:36, 10.45it/s]

{'loss': 1.1355, 'grad_norm': 5.45673942565918, 'learning_rate': 2.166048237476809e-05, 'epoch': 2.83}


 29%|██▊       | 4651/16270 [07:35<18:37, 10.40it/s]

{'loss': 1.0836, 'grad_norm': 5.675716876983643, 'learning_rate': 2.1567717996289426e-05, 'epoch': 2.86}


 29%|██▉       | 4701/16270 [07:40<18:30, 10.41it/s]

{'loss': 1.0998, 'grad_norm': 8.799732208251953, 'learning_rate': 2.147495361781076e-05, 'epoch': 2.89}


 29%|██▉       | 4751/16270 [07:45<18:25, 10.42it/s]

{'loss': 1.0873, 'grad_norm': 5.326507568359375, 'learning_rate': 2.1382189239332097e-05, 'epoch': 2.92}


 30%|██▉       | 4801/16270 [07:50<18:18, 10.45it/s]

{'loss': 1.1033, 'grad_norm': 5.710530757904053, 'learning_rate': 2.1289424860853434e-05, 'epoch': 2.95}


 30%|██▉       | 4851/16270 [07:55<18:23, 10.35it/s]

{'loss': 1.1627, 'grad_norm': 4.405362606048584, 'learning_rate': 2.119666048237477e-05, 'epoch': 2.98}


                                                    
 30%|███       | 4881/16270 [08:00<18:06, 10.48it/s]

{'eval_loss': 1.2067826986312866, 'eval_runtime': 2.5186, 'eval_samples_per_second': 287.464, 'eval_steps_per_second': 71.866, 'epoch': 3.0}


 30%|███       | 4901/16270 [08:03<22:34,  8.39it/s]  

{'loss': 1.0269, 'grad_norm': 4.966373920440674, 'learning_rate': 2.1103896103896105e-05, 'epoch': 3.01}


 30%|███       | 4951/16270 [08:08<18:12, 10.36it/s]

{'loss': 0.9612, 'grad_norm': 6.160760402679443, 'learning_rate': 2.101113172541744e-05, 'epoch': 3.04}


 31%|███       | 5001/16270 [08:13<18:07, 10.36it/s]

{'loss': 0.9172, 'grad_norm': 4.509362697601318, 'learning_rate': 2.0918367346938775e-05, 'epoch': 3.07}


 31%|███       | 5051/16270 [08:18<17:53, 10.45it/s]

{'loss': 0.982, 'grad_norm': 6.082810878753662, 'learning_rate': 2.0825602968460112e-05, 'epoch': 3.1}


 31%|███▏      | 5101/16270 [08:22<17:45, 10.48it/s]

{'loss': 0.9392, 'grad_norm': 6.152050018310547, 'learning_rate': 2.0732838589981446e-05, 'epoch': 3.13}


 32%|███▏      | 5151/16270 [08:27<17:50, 10.39it/s]

{'loss': 0.9597, 'grad_norm': 4.451178073883057, 'learning_rate': 2.0640074211502783e-05, 'epoch': 3.17}


 32%|███▏      | 5201/16270 [08:32<17:38, 10.46it/s]

{'loss': 0.8795, 'grad_norm': 4.7636260986328125, 'learning_rate': 2.054730983302412e-05, 'epoch': 3.2}


 32%|███▏      | 5251/16270 [08:37<17:33, 10.46it/s]

{'loss': 0.9345, 'grad_norm': 5.152946949005127, 'learning_rate': 2.0454545454545454e-05, 'epoch': 3.23}


 33%|███▎      | 5301/16270 [08:42<17:35, 10.39it/s]

{'loss': 0.9439, 'grad_norm': 4.171358108520508, 'learning_rate': 2.036178107606679e-05, 'epoch': 3.26}


 33%|███▎      | 5351/16270 [08:46<17:31, 10.38it/s]

{'loss': 1.0363, 'grad_norm': 5.131450176239014, 'learning_rate': 2.0269016697588125e-05, 'epoch': 3.29}


 33%|███▎      | 5401/16270 [08:51<17:23, 10.42it/s]

{'loss': 0.9409, 'grad_norm': 9.143906593322754, 'learning_rate': 2.0176252319109462e-05, 'epoch': 3.32}


 34%|███▎      | 5451/16270 [08:56<17:21, 10.39it/s]

{'loss': 0.8865, 'grad_norm': 5.4638447761535645, 'learning_rate': 2.0083487940630796e-05, 'epoch': 3.35}


 34%|███▍      | 5501/16270 [09:01<17:14, 10.41it/s]

{'loss': 0.9338, 'grad_norm': 5.968716621398926, 'learning_rate': 1.9990723562152136e-05, 'epoch': 3.38}


 34%|███▍      | 5551/16270 [09:06<17:11, 10.40it/s]

{'loss': 0.8883, 'grad_norm': 4.650004863739014, 'learning_rate': 1.989795918367347e-05, 'epoch': 3.41}


 34%|███▍      | 5601/16270 [09:10<17:06, 10.39it/s]

{'loss': 0.9887, 'grad_norm': 5.747798442840576, 'learning_rate': 1.9805194805194807e-05, 'epoch': 3.44}


 35%|███▍      | 5651/16270 [09:15<17:02, 10.39it/s]

{'loss': 0.9195, 'grad_norm': 6.0018534660339355, 'learning_rate': 1.971243042671614e-05, 'epoch': 3.47}


 35%|███▌      | 5701/16270 [09:20<16:57, 10.39it/s]

{'loss': 1.0277, 'grad_norm': 5.281290531158447, 'learning_rate': 1.9619666048237478e-05, 'epoch': 3.5}


 35%|███▌      | 5751/16270 [09:25<16:49, 10.42it/s]

{'loss': 0.9656, 'grad_norm': 9.502103805541992, 'learning_rate': 1.952690166975881e-05, 'epoch': 3.53}


 36%|███▌      | 5801/16270 [09:30<16:45, 10.41it/s]

{'loss': 0.9545, 'grad_norm': 4.876263618469238, 'learning_rate': 1.943413729128015e-05, 'epoch': 3.56}


 36%|███▌      | 5851/16270 [09:34<16:33, 10.49it/s]

{'loss': 1.0187, 'grad_norm': 7.214212894439697, 'learning_rate': 1.9341372912801485e-05, 'epoch': 3.6}


 36%|███▋      | 5901/16270 [09:39<16:28, 10.49it/s]

{'loss': 0.9843, 'grad_norm': 5.915482521057129, 'learning_rate': 1.9248608534322823e-05, 'epoch': 3.63}


 37%|███▋      | 5951/16270 [09:44<16:23, 10.50it/s]

{'loss': 0.9734, 'grad_norm': 5.765779495239258, 'learning_rate': 1.9155844155844156e-05, 'epoch': 3.66}


 37%|███▋      | 6001/16270 [09:49<16:21, 10.46it/s]

{'loss': 0.9706, 'grad_norm': 5.481149196624756, 'learning_rate': 1.9063079777365493e-05, 'epoch': 3.69}


 37%|███▋      | 6051/16270 [09:53<16:21, 10.41it/s]

{'loss': 0.9385, 'grad_norm': 5.117895126342773, 'learning_rate': 1.8970315398886827e-05, 'epoch': 3.72}


 37%|███▋      | 6101/16270 [09:58<16:10, 10.48it/s]

{'loss': 1.0255, 'grad_norm': 5.8244242668151855, 'learning_rate': 1.8877551020408164e-05, 'epoch': 3.75}


 38%|███▊      | 6151/16270 [10:03<16:03, 10.50it/s]

{'loss': 0.9804, 'grad_norm': 6.2468438148498535, 'learning_rate': 1.8784786641929498e-05, 'epoch': 3.78}


 38%|███▊      | 6201/16270 [10:08<16:09, 10.39it/s]

{'loss': 1.0135, 'grad_norm': 5.513153553009033, 'learning_rate': 1.8692022263450835e-05, 'epoch': 3.81}


 38%|███▊      | 6251/16270 [10:13<16:04, 10.39it/s]

{'loss': 1.0, 'grad_norm': 5.216897964477539, 'learning_rate': 1.8599257884972172e-05, 'epoch': 3.84}


 39%|███▊      | 6301/16270 [10:17<15:59, 10.39it/s]

{'loss': 0.9244, 'grad_norm': 5.384924411773682, 'learning_rate': 1.850649350649351e-05, 'epoch': 3.87}


 39%|███▉      | 6351/16270 [10:22<15:51, 10.42it/s]

{'loss': 0.9664, 'grad_norm': 5.2002105712890625, 'learning_rate': 1.8413729128014843e-05, 'epoch': 3.9}


 39%|███▉      | 6401/16270 [10:27<15:50, 10.39it/s]

{'loss': 0.9499, 'grad_norm': 5.469965934753418, 'learning_rate': 1.832096474953618e-05, 'epoch': 3.93}


 40%|███▉      | 6451/16270 [10:32<15:41, 10.42it/s]

{'loss': 0.9895, 'grad_norm': 4.626544952392578, 'learning_rate': 1.8228200371057514e-05, 'epoch': 3.96}


 40%|███▉      | 6501/16270 [10:36<15:35, 10.44it/s]

{'loss': 1.0122, 'grad_norm': 3.681521415710449, 'learning_rate': 1.813543599257885e-05, 'epoch': 4.0}


                                                    
 40%|████      | 6508/16270 [10:40<15:36, 10.42it/s]

{'eval_loss': 1.2116026878356934, 'eval_runtime': 2.5258, 'eval_samples_per_second': 286.645, 'eval_steps_per_second': 71.661, 'epoch': 4.0}


 40%|████      | 6551/16270 [10:45<15:33, 10.41it/s]  

{'loss': 0.9099, 'grad_norm': 5.979097366333008, 'learning_rate': 1.8042671614100184e-05, 'epoch': 4.03}


 41%|████      | 6601/16270 [10:50<15:30, 10.39it/s]

{'loss': 0.9574, 'grad_norm': 7.027499675750732, 'learning_rate': 1.7949907235621525e-05, 'epoch': 4.06}


 41%|████      | 6651/16270 [10:55<15:26, 10.39it/s]

{'loss': 0.8415, 'grad_norm': 6.052661895751953, 'learning_rate': 1.785714285714286e-05, 'epoch': 4.09}


 41%|████      | 6701/16270 [10:59<15:23, 10.36it/s]

{'loss': 0.9089, 'grad_norm': 5.2338151931762695, 'learning_rate': 1.7764378478664196e-05, 'epoch': 4.12}


 41%|████▏     | 6751/16270 [11:04<15:17, 10.38it/s]

{'loss': 0.8155, 'grad_norm': 5.332066059112549, 'learning_rate': 1.767161410018553e-05, 'epoch': 4.15}


 42%|████▏     | 6801/16270 [11:09<15:13, 10.37it/s]

{'loss': 0.8516, 'grad_norm': 5.146416187286377, 'learning_rate': 1.7578849721706866e-05, 'epoch': 4.18}


 42%|████▏     | 6851/16270 [11:14<15:00, 10.46it/s]

{'loss': 0.8449, 'grad_norm': 5.533736705780029, 'learning_rate': 1.74860853432282e-05, 'epoch': 4.21}


 42%|████▏     | 6901/16270 [11:19<15:00, 10.40it/s]

{'loss': 0.7925, 'grad_norm': 5.092403888702393, 'learning_rate': 1.7393320964749534e-05, 'epoch': 4.24}


 43%|████▎     | 6951/16270 [11:23<15:05, 10.29it/s]

{'loss': 0.8249, 'grad_norm': 4.333446979522705, 'learning_rate': 1.7300556586270874e-05, 'epoch': 4.27}


 43%|████▎     | 7001/16270 [11:28<14:55, 10.35it/s]

{'loss': 0.9109, 'grad_norm': 5.115786552429199, 'learning_rate': 1.7207792207792208e-05, 'epoch': 4.3}


 43%|████▎     | 7051/16270 [11:33<14:49, 10.36it/s]

{'loss': 0.8612, 'grad_norm': 7.103363037109375, 'learning_rate': 1.7115027829313545e-05, 'epoch': 4.33}


 44%|████▎     | 7101/16270 [11:38<14:41, 10.41it/s]

{'loss': 0.8875, 'grad_norm': 5.455411911010742, 'learning_rate': 1.702226345083488e-05, 'epoch': 4.36}


 44%|████▍     | 7151/16270 [11:43<14:42, 10.33it/s]

{'loss': 0.8515, 'grad_norm': 5.289163589477539, 'learning_rate': 1.6929499072356216e-05, 'epoch': 4.39}


 44%|████▍     | 7201/16270 [11:47<14:29, 10.43it/s]

{'loss': 0.8366, 'grad_norm': 3.7006499767303467, 'learning_rate': 1.683673469387755e-05, 'epoch': 4.43}


 45%|████▍     | 7251/16270 [11:52<14:26, 10.41it/s]

{'loss': 0.8306, 'grad_norm': 4.461051940917969, 'learning_rate': 1.6743970315398887e-05, 'epoch': 4.46}


 45%|████▍     | 7301/16270 [11:57<14:22, 10.40it/s]

{'loss': 0.8983, 'grad_norm': 6.118541240692139, 'learning_rate': 1.665120593692022e-05, 'epoch': 4.49}


 45%|████▌     | 7351/16270 [12:02<14:15, 10.42it/s]

{'loss': 0.9081, 'grad_norm': 5.990533828735352, 'learning_rate': 1.655844155844156e-05, 'epoch': 4.52}


 45%|████▌     | 7401/16270 [12:07<14:09, 10.44it/s]

{'loss': 0.8464, 'grad_norm': 6.084194183349609, 'learning_rate': 1.6465677179962894e-05, 'epoch': 4.55}


 46%|████▌     | 7451/16270 [12:11<14:01, 10.48it/s]

{'loss': 0.8108, 'grad_norm': 5.374527454376221, 'learning_rate': 1.637291280148423e-05, 'epoch': 4.58}


 46%|████▌     | 7501/16270 [12:16<14:00, 10.43it/s]

{'loss': 0.8374, 'grad_norm': 4.064593315124512, 'learning_rate': 1.6280148423005565e-05, 'epoch': 4.61}


 46%|████▋     | 7551/16270 [12:21<14:00, 10.37it/s]

{'loss': 0.8856, 'grad_norm': 3.9163427352905273, 'learning_rate': 1.6187384044526902e-05, 'epoch': 4.64}


 47%|████▋     | 7601/16270 [12:26<13:56, 10.36it/s]

{'loss': 0.9377, 'grad_norm': 8.47920036315918, 'learning_rate': 1.6094619666048236e-05, 'epoch': 4.67}


 47%|████▋     | 7651/16270 [12:31<13:44, 10.46it/s]

{'loss': 0.852, 'grad_norm': 5.478064060211182, 'learning_rate': 1.6001855287569573e-05, 'epoch': 4.7}


 47%|████▋     | 7701/16270 [12:35<13:41, 10.43it/s]

{'loss': 0.916, 'grad_norm': 4.839848041534424, 'learning_rate': 1.590909090909091e-05, 'epoch': 4.73}


 48%|████▊     | 7751/16270 [12:40<13:33, 10.48it/s]

{'loss': 0.843, 'grad_norm': 5.982937335968018, 'learning_rate': 1.5816326530612247e-05, 'epoch': 4.76}


 48%|████▊     | 7801/16270 [12:45<13:31, 10.44it/s]

{'loss': 0.8709, 'grad_norm': 4.796107769012451, 'learning_rate': 1.572356215213358e-05, 'epoch': 4.79}


 48%|████▊     | 7851/16270 [12:50<13:26, 10.44it/s]

{'loss': 0.8523, 'grad_norm': 5.508295059204102, 'learning_rate': 1.5630797773654918e-05, 'epoch': 4.82}


 49%|████▊     | 7901/16270 [12:55<13:22, 10.43it/s]

{'loss': 0.9187, 'grad_norm': 5.150069713592529, 'learning_rate': 1.5538033395176252e-05, 'epoch': 4.86}


 49%|████▉     | 7951/16270 [12:59<13:17, 10.43it/s]

{'loss': 0.8667, 'grad_norm': 6.068060398101807, 'learning_rate': 1.544526901669759e-05, 'epoch': 4.89}


 49%|████▉     | 8001/16270 [13:04<13:14, 10.41it/s]

{'loss': 0.9089, 'grad_norm': 9.155350685119629, 'learning_rate': 1.5352504638218923e-05, 'epoch': 4.92}


 49%|████▉     | 8051/16270 [13:09<13:09, 10.41it/s]

{'loss': 0.8377, 'grad_norm': 5.468790054321289, 'learning_rate': 1.5259740259740263e-05, 'epoch': 4.95}


 50%|████▉     | 8101/16270 [13:14<13:00, 10.46it/s]

{'loss': 0.9068, 'grad_norm': 4.284388065338135, 'learning_rate': 1.5166975881261595e-05, 'epoch': 4.98}


                                                    
 50%|█████     | 8135/16270 [13:19<12:53, 10.52it/s]

{'eval_loss': 1.221666932106018, 'eval_runtime': 2.5168, 'eval_samples_per_second': 287.669, 'eval_steps_per_second': 71.917, 'epoch': 5.0}


 50%|█████     | 8151/16270 [13:22<19:06,  7.08it/s]  

{'loss': 0.8142, 'grad_norm': 5.712055206298828, 'learning_rate': 1.5074211502782934e-05, 'epoch': 5.01}


 50%|█████     | 8201/16270 [13:27<12:56, 10.40it/s]

{'loss': 0.7918, 'grad_norm': 4.088535785675049, 'learning_rate': 1.4981447124304267e-05, 'epoch': 5.04}


 51%|█████     | 8251/16270 [13:32<12:52, 10.39it/s]

{'loss': 0.785, 'grad_norm': 6.175699234008789, 'learning_rate': 1.4888682745825603e-05, 'epoch': 5.07}


 51%|█████     | 8301/16270 [13:37<12:46, 10.39it/s]

{'loss': 0.7539, 'grad_norm': 3.855654239654541, 'learning_rate': 1.4795918367346938e-05, 'epoch': 5.1}


 51%|█████▏    | 8351/16270 [13:41<12:43, 10.37it/s]

{'loss': 0.8055, 'grad_norm': 4.640708923339844, 'learning_rate': 1.4703153988868275e-05, 'epoch': 5.13}


 52%|█████▏    | 8401/16270 [13:46<12:35, 10.42it/s]

{'loss': 0.74, 'grad_norm': 6.308600902557373, 'learning_rate': 1.461038961038961e-05, 'epoch': 5.16}


 52%|█████▏    | 8451/16270 [13:51<12:34, 10.37it/s]

{'loss': 0.8436, 'grad_norm': 5.488581657409668, 'learning_rate': 1.4517625231910946e-05, 'epoch': 5.19}


 52%|█████▏    | 8501/16270 [13:56<12:26, 10.41it/s]

{'loss': 0.7358, 'grad_norm': 4.576596736907959, 'learning_rate': 1.4424860853432281e-05, 'epoch': 5.22}


 53%|█████▎    | 8551/16270 [14:01<12:20, 10.43it/s]

{'loss': 0.7919, 'grad_norm': 5.057453155517578, 'learning_rate': 1.4332096474953619e-05, 'epoch': 5.26}


 53%|█████▎    | 8601/16270 [14:05<12:17, 10.40it/s]

{'loss': 0.7378, 'grad_norm': 3.9984447956085205, 'learning_rate': 1.4239332096474954e-05, 'epoch': 5.29}


 53%|█████▎    | 8651/16270 [14:10<12:10, 10.43it/s]

{'loss': 0.7938, 'grad_norm': 6.173676490783691, 'learning_rate': 1.414656771799629e-05, 'epoch': 5.32}


 53%|█████▎    | 8701/16270 [14:15<12:09, 10.38it/s]

{'loss': 0.7217, 'grad_norm': 5.591365337371826, 'learning_rate': 1.4053803339517626e-05, 'epoch': 5.35}


 54%|█████▍    | 8751/16270 [14:20<12:01, 10.42it/s]

{'loss': 0.7362, 'grad_norm': 4.600505828857422, 'learning_rate': 1.3961038961038962e-05, 'epoch': 5.38}


 54%|█████▍    | 8801/16270 [14:25<11:59, 10.39it/s]

{'loss': 0.821, 'grad_norm': 5.7982001304626465, 'learning_rate': 1.3868274582560297e-05, 'epoch': 5.41}


 54%|█████▍    | 8851/16270 [14:29<11:56, 10.36it/s]

{'loss': 0.7803, 'grad_norm': 5.168809413909912, 'learning_rate': 1.3775510204081633e-05, 'epoch': 5.44}


 55%|█████▍    | 8901/16270 [14:34<11:50, 10.38it/s]

{'loss': 0.7978, 'grad_norm': 4.396413803100586, 'learning_rate': 1.368274582560297e-05, 'epoch': 5.47}


 55%|█████▌    | 8951/16270 [14:39<11:46, 10.37it/s]

{'loss': 0.8269, 'grad_norm': 5.0808634757995605, 'learning_rate': 1.3589981447124305e-05, 'epoch': 5.5}


 55%|█████▌    | 9001/16270 [14:44<11:37, 10.42it/s]

{'loss': 0.7968, 'grad_norm': 4.258156776428223, 'learning_rate': 1.349721706864564e-05, 'epoch': 5.53}


 56%|█████▌    | 9051/16270 [14:49<11:33, 10.40it/s]

{'loss': 0.8228, 'grad_norm': 4.949164390563965, 'learning_rate': 1.3404452690166976e-05, 'epoch': 5.56}


 56%|█████▌    | 9101/16270 [14:53<11:32, 10.35it/s]

{'loss': 0.7876, 'grad_norm': 6.1845831871032715, 'learning_rate': 1.3311688311688313e-05, 'epoch': 5.59}


 56%|█████▌    | 9151/16270 [14:58<11:23, 10.42it/s]

{'loss': 0.793, 'grad_norm': 5.294000625610352, 'learning_rate': 1.3218923933209648e-05, 'epoch': 5.62}


 57%|█████▋    | 9201/16270 [15:03<11:22, 10.35it/s]

{'loss': 0.7932, 'grad_norm': 3.3792002201080322, 'learning_rate': 1.3126159554730984e-05, 'epoch': 5.65}


 57%|█████▋    | 9251/16270 [15:08<11:18, 10.34it/s]

{'loss': 0.755, 'grad_norm': 5.380223274230957, 'learning_rate': 1.303339517625232e-05, 'epoch': 5.69}


 57%|█████▋    | 9301/16270 [15:13<11:09, 10.41it/s]

{'loss': 0.7704, 'grad_norm': 6.528711318969727, 'learning_rate': 1.2940630797773656e-05, 'epoch': 5.72}


 57%|█████▋    | 9351/16270 [15:17<11:07, 10.36it/s]

{'loss': 0.8265, 'grad_norm': 4.635975360870361, 'learning_rate': 1.2847866419294992e-05, 'epoch': 5.75}


 58%|█████▊    | 9401/16270 [15:22<10:58, 10.44it/s]

{'loss': 0.6974, 'grad_norm': 5.122495651245117, 'learning_rate': 1.2755102040816325e-05, 'epoch': 5.78}


 58%|█████▊    | 9451/16270 [15:27<10:59, 10.34it/s]

{'loss': 0.7417, 'grad_norm': 5.392216205596924, 'learning_rate': 1.2662337662337662e-05, 'epoch': 5.81}


 58%|█████▊    | 9501/16270 [15:32<10:48, 10.44it/s]

{'loss': 0.821, 'grad_norm': 4.257260322570801, 'learning_rate': 1.2569573283858998e-05, 'epoch': 5.84}


 59%|█████▊    | 9551/16270 [15:37<10:49, 10.34it/s]

{'loss': 0.8291, 'grad_norm': 4.8820390701293945, 'learning_rate': 1.2476808905380333e-05, 'epoch': 5.87}


 59%|█████▉    | 9601/16270 [15:41<10:39, 10.43it/s]

{'loss': 0.7333, 'grad_norm': 5.585369110107422, 'learning_rate': 1.238404452690167e-05, 'epoch': 5.9}


 59%|█████▉    | 9651/16270 [15:46<10:37, 10.38it/s]

{'loss': 0.7846, 'grad_norm': 6.046340465545654, 'learning_rate': 1.2291280148423006e-05, 'epoch': 5.93}


 60%|█████▉    | 9701/16270 [15:51<10:34, 10.36it/s]

{'loss': 0.8238, 'grad_norm': 4.86208438873291, 'learning_rate': 1.2198515769944341e-05, 'epoch': 5.96}


 60%|█████▉    | 9751/16270 [15:56<10:27, 10.39it/s]

{'loss': 0.8311, 'grad_norm': 4.630396366119385, 'learning_rate': 1.2105751391465676e-05, 'epoch': 5.99}


                                                    
 60%|██████    | 9762/16270 [15:59<10:24, 10.43it/s]

{'eval_loss': 1.2286611795425415, 'eval_runtime': 2.5132, 'eval_samples_per_second': 288.073, 'eval_steps_per_second': 72.018, 'epoch': 6.0}


 60%|██████    | 9801/16270 [16:04<10:28, 10.29it/s]  

{'loss': 0.7533, 'grad_norm': 6.0495500564575195, 'learning_rate': 1.2012987012987014e-05, 'epoch': 6.02}


 61%|██████    | 9851/16270 [16:09<10:14, 10.45it/s]

{'loss': 0.7499, 'grad_norm': 5.924208641052246, 'learning_rate': 1.1920222634508349e-05, 'epoch': 6.05}


 61%|██████    | 9901/16270 [16:14<10:12, 10.40it/s]

{'loss': 0.7141, 'grad_norm': 4.458508014678955, 'learning_rate': 1.1827458256029684e-05, 'epoch': 6.08}


 61%|██████    | 9951/16270 [16:19<10:07, 10.40it/s]

{'loss': 0.7062, 'grad_norm': 4.852433681488037, 'learning_rate': 1.173469387755102e-05, 'epoch': 6.12}


 61%|██████▏   | 10001/16270 [16:23<10:03, 10.39it/s]

{'loss': 0.7382, 'grad_norm': 6.080296039581299, 'learning_rate': 1.1641929499072357e-05, 'epoch': 6.15}


 62%|██████▏   | 10051/16270 [16:28<09:59, 10.38it/s]

{'loss': 0.6687, 'grad_norm': 4.737798690795898, 'learning_rate': 1.1549165120593692e-05, 'epoch': 6.18}


 62%|██████▏   | 10101/16270 [16:33<09:56, 10.35it/s]

{'loss': 0.6722, 'grad_norm': 4.7755818367004395, 'learning_rate': 1.1456400742115028e-05, 'epoch': 6.21}


 62%|██████▏   | 10151/16270 [16:38<09:47, 10.41it/s]

{'loss': 0.7213, 'grad_norm': 7.466798305511475, 'learning_rate': 1.1363636363636365e-05, 'epoch': 6.24}


 63%|██████▎   | 10201/16270 [16:43<09:43, 10.41it/s]

{'loss': 0.7743, 'grad_norm': 5.721119403839111, 'learning_rate': 1.12708719851577e-05, 'epoch': 6.27}


 63%|██████▎   | 10251/16270 [16:47<09:38, 10.40it/s]

{'loss': 0.7486, 'grad_norm': 5.222634315490723, 'learning_rate': 1.1178107606679035e-05, 'epoch': 6.3}


 63%|██████▎   | 10301/16270 [16:52<09:31, 10.45it/s]

{'loss': 0.7227, 'grad_norm': 6.031162261962891, 'learning_rate': 1.108534322820037e-05, 'epoch': 6.33}


 64%|██████▎   | 10351/16270 [16:57<09:30, 10.38it/s]

{'loss': 0.734, 'grad_norm': 6.614114761352539, 'learning_rate': 1.0992578849721708e-05, 'epoch': 6.36}


 64%|██████▍   | 10401/16270 [17:02<09:20, 10.47it/s]

{'loss': 0.6737, 'grad_norm': 5.742454528808594, 'learning_rate': 1.0899814471243043e-05, 'epoch': 6.39}


 64%|██████▍   | 10451/16270 [17:07<09:16, 10.46it/s]

{'loss': 0.7338, 'grad_norm': 4.60693883895874, 'learning_rate': 1.0807050092764379e-05, 'epoch': 6.42}


 65%|██████▍   | 10501/16270 [17:11<09:12, 10.45it/s]

{'loss': 0.7192, 'grad_norm': 7.384335517883301, 'learning_rate': 1.0714285714285714e-05, 'epoch': 6.45}


 65%|██████▍   | 10551/16270 [17:16<09:08, 10.43it/s]

{'loss': 0.6833, 'grad_norm': 6.384156703948975, 'learning_rate': 1.0621521335807051e-05, 'epoch': 6.48}


 65%|██████▌   | 10601/16270 [17:21<09:06, 10.37it/s]

{'loss': 0.7305, 'grad_norm': 4.068350315093994, 'learning_rate': 1.0528756957328387e-05, 'epoch': 6.52}


 65%|██████▌   | 10651/16270 [17:26<08:57, 10.46it/s]

{'loss': 0.715, 'grad_norm': 6.228622913360596, 'learning_rate': 1.0435992578849722e-05, 'epoch': 6.55}


 66%|██████▌   | 10701/16270 [17:31<08:57, 10.36it/s]

{'loss': 0.7611, 'grad_norm': 5.997067451477051, 'learning_rate': 1.0343228200371059e-05, 'epoch': 6.58}


 66%|██████▌   | 10751/16270 [17:35<08:47, 10.46it/s]

{'loss': 0.7436, 'grad_norm': 5.4619855880737305, 'learning_rate': 1.0250463821892394e-05, 'epoch': 6.61}


 66%|██████▋   | 10801/16270 [17:40<08:43, 10.44it/s]

{'loss': 0.7141, 'grad_norm': 7.876067161560059, 'learning_rate': 1.015769944341373e-05, 'epoch': 6.64}


 67%|██████▋   | 10851/16270 [17:45<08:40, 10.41it/s]

{'loss': 0.7051, 'grad_norm': 5.613064289093018, 'learning_rate': 1.0064935064935065e-05, 'epoch': 6.67}


 67%|██████▋   | 10901/16270 [17:50<08:32, 10.47it/s]

{'loss': 0.6862, 'grad_norm': 5.672866344451904, 'learning_rate': 9.972170686456402e-06, 'epoch': 6.7}


 67%|██████▋   | 10951/16270 [17:54<08:32, 10.39it/s]

{'loss': 0.7161, 'grad_norm': 5.0871381759643555, 'learning_rate': 9.879406307977738e-06, 'epoch': 6.73}


 68%|██████▊   | 11001/16270 [17:59<08:26, 10.40it/s]

{'loss': 0.7756, 'grad_norm': 4.468475818634033, 'learning_rate': 9.786641929499073e-06, 'epoch': 6.76}


 68%|██████▊   | 11051/16270 [18:04<08:19, 10.45it/s]

{'loss': 0.6993, 'grad_norm': 5.818033695220947, 'learning_rate': 9.693877551020408e-06, 'epoch': 6.79}


 68%|██████▊   | 11101/16270 [18:09<08:13, 10.47it/s]

{'loss': 0.7415, 'grad_norm': 4.6216535568237305, 'learning_rate': 9.601113172541746e-06, 'epoch': 6.82}


 69%|██████▊   | 11151/16270 [18:14<08:13, 10.37it/s]

{'loss': 0.747, 'grad_norm': 7.512811183929443, 'learning_rate': 9.50834879406308e-06, 'epoch': 6.85}


 69%|██████▉   | 11201/16270 [18:18<08:12, 10.29it/s]

{'loss': 0.7759, 'grad_norm': 5.606546401977539, 'learning_rate': 9.415584415584415e-06, 'epoch': 6.88}


 69%|██████▉   | 11252/16270 [18:23<08:02, 10.41it/s]

{'loss': 0.7063, 'grad_norm': 7.2589545249938965, 'learning_rate': 9.322820037105752e-06, 'epoch': 6.91}


 69%|██████▉   | 11302/16270 [18:28<07:56, 10.42it/s]

{'loss': 0.7369, 'grad_norm': 4.423707485198975, 'learning_rate': 9.230055658627087e-06, 'epoch': 6.95}


 70%|██████▉   | 11352/16270 [18:33<07:54, 10.37it/s]

{'loss': 0.744, 'grad_norm': 4.993800163269043, 'learning_rate': 9.137291280148422e-06, 'epoch': 6.98}


                                                     
 70%|███████   | 11389/16270 [18:39<07:46, 10.46it/s]

{'eval_loss': 1.2407673597335815, 'eval_runtime': 2.5141, 'eval_samples_per_second': 287.971, 'eval_steps_per_second': 71.993, 'epoch': 7.0}


 70%|███████   | 11402/16270 [18:42<13:04,  6.21it/s]

{'loss': 0.6935, 'grad_norm': 5.056506633758545, 'learning_rate': 9.044526901669758e-06, 'epoch': 7.01}


 70%|███████   | 11452/16270 [18:46<07:42, 10.41it/s]

{'loss': 0.6289, 'grad_norm': 6.222658634185791, 'learning_rate': 8.951762523191095e-06, 'epoch': 7.04}


 71%|███████   | 11502/16270 [18:51<07:36, 10.44it/s]

{'loss': 0.6528, 'grad_norm': 4.143347263336182, 'learning_rate': 8.85899814471243e-06, 'epoch': 7.07}


 71%|███████   | 11552/16270 [18:56<07:31, 10.45it/s]

{'loss': 0.7173, 'grad_norm': 4.835549831390381, 'learning_rate': 8.766233766233766e-06, 'epoch': 7.1}


 71%|███████▏  | 11602/16270 [19:01<07:28, 10.42it/s]

{'loss': 0.6644, 'grad_norm': 5.052938938140869, 'learning_rate': 8.673469387755101e-06, 'epoch': 7.13}


 72%|███████▏  | 11652/16270 [19:06<07:24, 10.40it/s]

{'loss': 0.6317, 'grad_norm': 3.9148471355438232, 'learning_rate': 8.580705009276438e-06, 'epoch': 7.16}


 72%|███████▏  | 11702/16270 [19:10<07:18, 10.41it/s]

{'loss': 0.611, 'grad_norm': 5.213162899017334, 'learning_rate': 8.487940630797774e-06, 'epoch': 7.19}


 72%|███████▏  | 11752/16270 [19:15<07:13, 10.43it/s]

{'loss': 0.6705, 'grad_norm': 6.393876075744629, 'learning_rate': 8.395176252319109e-06, 'epoch': 7.22}


 73%|███████▎  | 11802/16270 [19:20<07:07, 10.44it/s]

{'loss': 0.6337, 'grad_norm': 6.580448150634766, 'learning_rate': 8.302411873840446e-06, 'epoch': 7.25}


 73%|███████▎  | 11852/16270 [19:25<07:01, 10.48it/s]

{'loss': 0.6691, 'grad_norm': 4.937806606292725, 'learning_rate': 8.209647495361781e-06, 'epoch': 7.28}


 73%|███████▎  | 11902/16270 [19:29<06:56, 10.48it/s]

{'loss': 0.6693, 'grad_norm': 5.098639965057373, 'learning_rate': 8.116883116883117e-06, 'epoch': 7.31}


 73%|███████▎  | 11952/16270 [19:34<06:55, 10.38it/s]

{'loss': 0.7126, 'grad_norm': 3.481826066970825, 'learning_rate': 8.024118738404452e-06, 'epoch': 7.34}


 74%|███████▍  | 12002/16270 [19:39<06:48, 10.46it/s]

{'loss': 0.6851, 'grad_norm': 5.399585247039795, 'learning_rate': 7.93135435992579e-06, 'epoch': 7.38}


 74%|███████▍  | 12052/16270 [19:44<06:42, 10.47it/s]

{'loss': 0.6655, 'grad_norm': 4.436145305633545, 'learning_rate': 7.840445269016697e-06, 'epoch': 7.41}


 74%|███████▍  | 12102/16270 [19:48<06:39, 10.44it/s]

{'loss': 0.7029, 'grad_norm': 4.675042152404785, 'learning_rate': 7.747680890538032e-06, 'epoch': 7.44}


 75%|███████▍  | 12152/16270 [19:53<06:35, 10.41it/s]

{'loss': 0.6553, 'grad_norm': 4.151533126831055, 'learning_rate': 7.65491651205937e-06, 'epoch': 7.47}


 75%|███████▍  | 12202/16270 [19:58<06:29, 10.45it/s]

{'loss': 0.6204, 'grad_norm': 4.8491716384887695, 'learning_rate': 7.562152133580705e-06, 'epoch': 7.5}


 75%|███████▌  | 12252/16270 [20:03<06:23, 10.47it/s]

{'loss': 0.6598, 'grad_norm': 5.450109004974365, 'learning_rate': 7.469387755102041e-06, 'epoch': 7.53}


 76%|███████▌  | 12302/16270 [20:08<06:20, 10.43it/s]

{'loss': 0.6554, 'grad_norm': 4.402227401733398, 'learning_rate': 7.376623376623376e-06, 'epoch': 7.56}


 76%|███████▌  | 12352/16270 [20:12<06:16, 10.42it/s]

{'loss': 0.6842, 'grad_norm': 7.477687358856201, 'learning_rate': 7.283858998144713e-06, 'epoch': 7.59}


 76%|███████▌  | 12402/16270 [20:17<06:10, 10.45it/s]

{'loss': 0.6644, 'grad_norm': 6.338891983032227, 'learning_rate': 7.191094619666048e-06, 'epoch': 7.62}


 77%|███████▋  | 12452/16270 [20:22<06:06, 10.42it/s]

{'loss': 0.7508, 'grad_norm': 5.466665744781494, 'learning_rate': 7.098330241187384e-06, 'epoch': 7.65}


 77%|███████▋  | 12502/16270 [20:27<06:01, 10.42it/s]

{'loss': 0.6614, 'grad_norm': 4.686848163604736, 'learning_rate': 7.00556586270872e-06, 'epoch': 7.68}


 77%|███████▋  | 12552/16270 [20:32<05:56, 10.44it/s]

{'loss': 0.671, 'grad_norm': 5.129816055297852, 'learning_rate': 6.912801484230056e-06, 'epoch': 7.71}


 77%|███████▋  | 12602/16270 [20:36<05:52, 10.41it/s]

{'loss': 0.6958, 'grad_norm': 5.5803751945495605, 'learning_rate': 6.820037105751391e-06, 'epoch': 7.74}


 78%|███████▊  | 12652/16270 [20:41<05:47, 10.41it/s]

{'loss': 0.6754, 'grad_norm': 6.622759819030762, 'learning_rate': 6.7272727272727275e-06, 'epoch': 7.78}


 78%|███████▊  | 12702/16270 [20:46<05:42, 10.42it/s]

{'loss': 0.6726, 'grad_norm': 4.601315498352051, 'learning_rate': 6.634508348794064e-06, 'epoch': 7.81}


 78%|███████▊  | 12752/16270 [20:51<05:36, 10.45it/s]

{'loss': 0.6559, 'grad_norm': 2.9322926998138428, 'learning_rate': 6.541743970315399e-06, 'epoch': 7.84}


 79%|███████▊  | 12802/16270 [20:55<05:32, 10.42it/s]

{'loss': 0.6754, 'grad_norm': 4.638143539428711, 'learning_rate': 6.448979591836735e-06, 'epoch': 7.87}


 79%|███████▉  | 12852/16270 [21:00<05:28, 10.39it/s]

{'loss': 0.6792, 'grad_norm': 5.298969745635986, 'learning_rate': 6.356215213358071e-06, 'epoch': 7.9}


 79%|███████▉  | 12902/16270 [21:05<05:25, 10.35it/s]

{'loss': 0.5846, 'grad_norm': 4.560816287994385, 'learning_rate': 6.263450834879407e-06, 'epoch': 7.93}


 80%|███████▉  | 12952/16270 [21:10<05:19, 10.38it/s]

{'loss': 0.7, 'grad_norm': 5.677850246429443, 'learning_rate': 6.170686456400742e-06, 'epoch': 7.96}


 80%|███████▉  | 13002/16270 [21:15<05:15, 10.37it/s]

{'loss': 0.6817, 'grad_norm': 5.4256439208984375, 'learning_rate': 6.077922077922079e-06, 'epoch': 7.99}


                                                     
 80%|████████  | 13016/16270 [21:19<05:10, 10.48it/s]

{'eval_loss': 1.260547399520874, 'eval_runtime': 2.5229, 'eval_samples_per_second': 286.974, 'eval_steps_per_second': 71.743, 'epoch': 8.0}


 80%|████████  | 13052/16270 [21:23<05:14, 10.25it/s]

{'loss': 0.6019, 'grad_norm': 4.906295299530029, 'learning_rate': 5.985157699443414e-06, 'epoch': 8.02}


 81%|████████  | 13102/16270 [21:28<05:05, 10.37it/s]

{'loss': 0.6513, 'grad_norm': 8.606898307800293, 'learning_rate': 5.892393320964749e-06, 'epoch': 8.05}


 81%|████████  | 13152/16270 [21:33<05:00, 10.39it/s]

{'loss': 0.6325, 'grad_norm': 5.613394260406494, 'learning_rate': 5.799628942486085e-06, 'epoch': 8.08}


 81%|████████  | 13202/16270 [21:38<04:55, 10.40it/s]

{'loss': 0.6595, 'grad_norm': 6.077009201049805, 'learning_rate': 5.706864564007421e-06, 'epoch': 8.11}


 81%|████████▏ | 13252/16270 [21:42<04:50, 10.38it/s]

{'loss': 0.6024, 'grad_norm': 4.341039657592773, 'learning_rate': 5.614100185528757e-06, 'epoch': 8.14}


 82%|████████▏ | 13302/16270 [21:47<04:46, 10.37it/s]

{'loss': 0.6199, 'grad_norm': 4.173720359802246, 'learning_rate': 5.521335807050093e-06, 'epoch': 8.17}


 82%|████████▏ | 13352/16270 [21:52<04:40, 10.40it/s]

{'loss': 0.667, 'grad_norm': 3.3980650901794434, 'learning_rate': 5.428571428571429e-06, 'epoch': 8.21}


 82%|████████▏ | 13402/16270 [21:57<04:36, 10.37it/s]

{'loss': 0.6613, 'grad_norm': 4.509989261627197, 'learning_rate': 5.335807050092764e-06, 'epoch': 8.24}


 83%|████████▎ | 13452/16270 [22:02<04:31, 10.39it/s]

{'loss': 0.676, 'grad_norm': 5.035059452056885, 'learning_rate': 5.2430426716141005e-06, 'epoch': 8.27}


 83%|████████▎ | 13502/16270 [22:06<04:26, 10.38it/s]

{'loss': 0.6563, 'grad_norm': 4.686277389526367, 'learning_rate': 5.150278293135436e-06, 'epoch': 8.3}


 83%|████████▎ | 13552/16270 [22:11<04:21, 10.39it/s]

{'loss': 0.6709, 'grad_norm': 5.907585144042969, 'learning_rate': 5.057513914656772e-06, 'epoch': 8.33}


 84%|████████▎ | 13602/16270 [22:16<04:16, 10.40it/s]

{'loss': 0.6524, 'grad_norm': 5.3694233894348145, 'learning_rate': 4.9647495361781075e-06, 'epoch': 8.36}


 84%|████████▍ | 13652/16270 [22:21<04:11, 10.41it/s]

{'loss': 0.646, 'grad_norm': 4.933590888977051, 'learning_rate': 4.871985157699444e-06, 'epoch': 8.39}


 84%|████████▍ | 13702/16270 [22:26<04:06, 10.41it/s]

{'loss': 0.6511, 'grad_norm': 5.065260887145996, 'learning_rate': 4.779220779220779e-06, 'epoch': 8.42}


 85%|████████▍ | 13752/16270 [22:30<04:00, 10.47it/s]

{'loss': 0.5494, 'grad_norm': 5.63603401184082, 'learning_rate': 4.686456400742115e-06, 'epoch': 8.45}


 85%|████████▍ | 13802/16270 [22:35<03:56, 10.46it/s]

{'loss': 0.6308, 'grad_norm': 4.361560344696045, 'learning_rate': 4.593692022263452e-06, 'epoch': 8.48}


 85%|████████▌ | 13852/16270 [22:40<03:51, 10.44it/s]

{'loss': 0.6269, 'grad_norm': 5.1691107749938965, 'learning_rate': 4.500927643784787e-06, 'epoch': 8.51}


 85%|████████▌ | 13902/16270 [22:45<03:46, 10.45it/s]

{'loss': 0.5974, 'grad_norm': 5.629035472869873, 'learning_rate': 4.408163265306123e-06, 'epoch': 8.54}


 86%|████████▌ | 13952/16270 [22:50<03:41, 10.49it/s]

{'loss': 0.6073, 'grad_norm': 7.016727447509766, 'learning_rate': 4.315398886827458e-06, 'epoch': 8.57}


 86%|████████▌ | 14002/16270 [22:54<03:37, 10.42it/s]

{'loss': 0.5963, 'grad_norm': 5.4588541984558105, 'learning_rate': 4.222634508348794e-06, 'epoch': 8.6}


 86%|████████▋ | 14052/16270 [22:59<03:31, 10.47it/s]

{'loss': 0.6, 'grad_norm': 4.897109031677246, 'learning_rate': 4.1298701298701294e-06, 'epoch': 8.64}


 87%|████████▋ | 14102/16270 [23:04<03:26, 10.52it/s]

{'loss': 0.6134, 'grad_norm': 4.79812479019165, 'learning_rate': 4.037105751391466e-06, 'epoch': 8.67}


 87%|████████▋ | 14152/16270 [23:09<03:22, 10.46it/s]

{'loss': 0.6224, 'grad_norm': 5.747678756713867, 'learning_rate': 3.944341372912801e-06, 'epoch': 8.7}


 87%|████████▋ | 14202/16270 [23:13<03:09, 10.93it/s]

{'loss': 0.5932, 'grad_norm': 5.031090259552002, 'learning_rate': 3.853432282003711e-06, 'epoch': 8.73}


 88%|████████▊ | 14252/16270 [23:18<03:12, 10.50it/s]

{'loss': 0.6357, 'grad_norm': 7.549820423126221, 'learning_rate': 3.7606679035250465e-06, 'epoch': 8.76}


 88%|████████▊ | 14302/16270 [23:23<03:06, 10.53it/s]

{'loss': 0.6243, 'grad_norm': 4.934276580810547, 'learning_rate': 3.6697588126159557e-06, 'epoch': 8.79}


 88%|████████▊ | 14352/16270 [23:28<03:03, 10.43it/s]

{'loss': 0.6861, 'grad_norm': 4.636702060699463, 'learning_rate': 3.5769944341372915e-06, 'epoch': 8.82}


 89%|████████▊ | 14402/16270 [23:32<02:58, 10.48it/s]

{'loss': 0.6044, 'grad_norm': 4.411250114440918, 'learning_rate': 3.4842300556586273e-06, 'epoch': 8.85}


 89%|████████▉ | 14452/16270 [23:37<02:54, 10.45it/s]

{'loss': 0.6519, 'grad_norm': 8.0920991897583, 'learning_rate': 3.391465677179963e-06, 'epoch': 8.88}


 89%|████████▉ | 14502/16270 [23:42<02:48, 10.46it/s]

{'loss': 0.6003, 'grad_norm': 6.293148040771484, 'learning_rate': 3.298701298701299e-06, 'epoch': 8.91}


 89%|████████▉ | 14552/16270 [23:47<02:43, 10.48it/s]

{'loss': 0.6182, 'grad_norm': 3.47755765914917, 'learning_rate': 3.2059369202226347e-06, 'epoch': 8.94}


 90%|████████▉ | 14602/16270 [23:52<02:39, 10.49it/s]

{'loss': 0.6049, 'grad_norm': 5.558206558227539, 'learning_rate': 3.11317254174397e-06, 'epoch': 8.97}


                                                     
 90%|█████████ | 14643/16270 [23:58<02:27, 11.00it/s]

{'eval_loss': 1.2751212120056152, 'eval_runtime': 2.5062, 'eval_samples_per_second': 288.889, 'eval_steps_per_second': 72.222, 'epoch': 9.0}


 90%|█████████ | 14652/16270 [24:00<06:00,  4.49it/s]

{'loss': 0.6267, 'grad_norm': 4.972342491149902, 'learning_rate': 3.0222634508348793e-06, 'epoch': 9.0}


 90%|█████████ | 14702/16270 [24:05<02:29, 10.46it/s]

{'loss': 0.604, 'grad_norm': 5.05445671081543, 'learning_rate': 2.9294990723562156e-06, 'epoch': 9.04}


 91%|█████████ | 14752/16270 [24:09<02:24, 10.48it/s]

{'loss': 0.5938, 'grad_norm': 4.13041353225708, 'learning_rate': 2.8367346938775514e-06, 'epoch': 9.07}


 91%|█████████ | 14802/16270 [24:14<02:20, 10.44it/s]

{'loss': 0.5991, 'grad_norm': 5.616197109222412, 'learning_rate': 2.743970315398887e-06, 'epoch': 9.1}


 91%|█████████▏| 14852/16270 [24:19<02:15, 10.46it/s]

{'loss': 0.6101, 'grad_norm': 4.926839828491211, 'learning_rate': 2.6512059369202226e-06, 'epoch': 9.13}


 92%|█████████▏| 14902/16270 [24:24<02:10, 10.47it/s]

{'loss': 0.5921, 'grad_norm': 6.152791500091553, 'learning_rate': 2.5584415584415584e-06, 'epoch': 9.16}


 92%|█████████▏| 14952/16270 [24:28<02:05, 10.50it/s]

{'loss': 0.6205, 'grad_norm': 4.715744495391846, 'learning_rate': 2.4656771799628942e-06, 'epoch': 9.19}


 92%|█████████▏| 15002/16270 [24:33<02:01, 10.47it/s]

{'loss': 0.5833, 'grad_norm': 4.068018436431885, 'learning_rate': 2.37291280148423e-06, 'epoch': 9.22}


 93%|█████████▎| 15052/16270 [24:38<01:56, 10.45it/s]

{'loss': 0.5629, 'grad_norm': 2.8739304542541504, 'learning_rate': 2.280148423005566e-06, 'epoch': 9.25}


 93%|█████████▎| 15102/16270 [24:43<01:51, 10.44it/s]

{'loss': 0.601, 'grad_norm': 4.58984899520874, 'learning_rate': 2.1873840445269017e-06, 'epoch': 9.28}


 93%|█████████▎| 15152/16270 [24:47<01:46, 10.47it/s]

{'loss': 0.5926, 'grad_norm': 5.247560977935791, 'learning_rate': 2.0946196660482375e-06, 'epoch': 9.31}


 93%|█████████▎| 15202/16270 [24:52<01:42, 10.47it/s]

{'loss': 0.6129, 'grad_norm': 5.055922508239746, 'learning_rate': 2.0018552875695737e-06, 'epoch': 9.34}


 94%|█████████▎| 15252/16270 [24:57<01:37, 10.48it/s]

{'loss': 0.61, 'grad_norm': 5.149010181427002, 'learning_rate': 1.909090909090909e-06, 'epoch': 9.37}


 94%|█████████▍| 15302/16270 [25:02<01:32, 10.44it/s]

{'loss': 0.6166, 'grad_norm': 5.056394100189209, 'learning_rate': 1.816326530612245e-06, 'epoch': 9.4}


 94%|█████████▍| 15352/16270 [25:07<01:27, 10.45it/s]

{'loss': 0.5985, 'grad_norm': 4.749068737030029, 'learning_rate': 1.7235621521335807e-06, 'epoch': 9.43}


 95%|█████████▍| 15402/16270 [25:11<01:23, 10.35it/s]

{'loss': 0.5963, 'grad_norm': 4.074717998504639, 'learning_rate': 1.6307977736549165e-06, 'epoch': 9.47}


 95%|█████████▍| 15452/16270 [25:16<01:18, 10.36it/s]

{'loss': 0.6144, 'grad_norm': 4.208230018615723, 'learning_rate': 1.5380333951762524e-06, 'epoch': 9.5}


 95%|█████████▌| 15502/16270 [25:21<01:14, 10.35it/s]

{'loss': 0.5947, 'grad_norm': 3.816561222076416, 'learning_rate': 1.4452690166975882e-06, 'epoch': 9.53}


 96%|█████████▌| 15552/16270 [25:26<01:09, 10.37it/s]

{'loss': 0.6378, 'grad_norm': 5.977999687194824, 'learning_rate': 1.352504638218924e-06, 'epoch': 9.56}


 96%|█████████▌| 15602/16270 [25:31<01:04, 10.34it/s]

{'loss': 0.5942, 'grad_norm': 3.8486745357513428, 'learning_rate': 1.2597402597402598e-06, 'epoch': 9.59}


 96%|█████████▌| 15652/16270 [25:36<00:59, 10.34it/s]

{'loss': 0.619, 'grad_norm': 5.609817028045654, 'learning_rate': 1.1669758812615956e-06, 'epoch': 9.62}


 97%|█████████▋| 15702/16270 [25:40<00:54, 10.36it/s]

{'loss': 0.604, 'grad_norm': 8.024988174438477, 'learning_rate': 1.0742115027829314e-06, 'epoch': 9.65}


 97%|█████████▋| 15752/16270 [25:45<00:50, 10.35it/s]

{'loss': 0.6093, 'grad_norm': 5.572363376617432, 'learning_rate': 9.814471243042672e-07, 'epoch': 9.68}


 97%|█████████▋| 15802/16270 [25:50<00:45, 10.33it/s]

{'loss': 0.5938, 'grad_norm': 5.655058860778809, 'learning_rate': 8.886827458256031e-07, 'epoch': 9.71}


 97%|█████████▋| 15852/16270 [25:55<00:40, 10.35it/s]

{'loss': 0.6019, 'grad_norm': 4.858628749847412, 'learning_rate': 7.959183673469388e-07, 'epoch': 9.74}


 98%|█████████▊| 15902/16270 [26:00<00:35, 10.33it/s]

{'loss': 0.628, 'grad_norm': 4.380483150482178, 'learning_rate': 7.031539888682746e-07, 'epoch': 9.77}


 98%|█████████▊| 15952/16270 [26:04<00:30, 10.31it/s]

{'loss': 0.6047, 'grad_norm': 4.522301197052002, 'learning_rate': 6.103896103896104e-07, 'epoch': 9.8}


 98%|█████████▊| 16002/16270 [26:09<00:25, 10.34it/s]

{'loss': 0.542, 'grad_norm': 4.903183460235596, 'learning_rate': 5.176252319109462e-07, 'epoch': 9.83}


 99%|█████████▊| 16052/16270 [26:14<00:21, 10.35it/s]

{'loss': 0.6651, 'grad_norm': 4.590135097503662, 'learning_rate': 4.24860853432282e-07, 'epoch': 9.86}


 99%|█████████▉| 16102/16270 [26:19<00:16, 10.30it/s]

{'loss': 0.569, 'grad_norm': 9.10566520690918, 'learning_rate': 3.3209647495361784e-07, 'epoch': 9.9}


 99%|█████████▉| 16152/16270 [26:24<00:11, 10.36it/s]

{'loss': 0.5734, 'grad_norm': 3.7469115257263184, 'learning_rate': 2.393320964749536e-07, 'epoch': 9.93}


100%|█████████▉| 16202/16270 [26:29<00:06, 10.34it/s]

{'loss': 0.6095, 'grad_norm': 6.199322700500488, 'learning_rate': 1.4656771799628942e-07, 'epoch': 9.96}


100%|█████████▉| 16252/16270 [26:33<00:01, 10.45it/s]

{'loss': 0.6058, 'grad_norm': 5.392483711242676, 'learning_rate': 5.380333951762523e-08, 'epoch': 9.99}


                                                     
100%|██████████| 16270/16270 [26:39<00:00, 10.55it/s]

{'eval_loss': 1.2823090553283691, 'eval_runtime': 2.4658, 'eval_samples_per_second': 293.614, 'eval_steps_per_second': 73.404, 'epoch': 10.0}


100%|██████████| 16270/16270 [26:40<00:00, 10.16it/s]


{'train_runtime': 1600.713, 'train_samples_per_second': 40.651, 'train_steps_per_second': 10.164, 'train_loss': 0.9200849690170616, 'epoch': 10.0}


('./polarity_inversion_model/tokenizer_config.json',
 './polarity_inversion_model/special_tokens_map.json',
 './polarity_inversion_model/vocab.json',
 './polarity_inversion_model/merges.txt',
 './polarity_inversion_model/added_tokens.json',
 './polarity_inversion_model/tokenizer.json')

### Polarity Inversion Function
Moves the fine-tuned model to the GPU when one is available (otherwise the CPU) and defines a function that inverts the polarity of a given review using that model.

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # Check device availability
model.to(device)  # Move model to appropriate device
# Put the model in evaluation mode explicitly so dropout layers are disabled
# during generation, instead of relying on the mode the training run left it in
model.eval()

def invert_polarity(review, max_length=128, num_beams=5, no_repeat_ngram_size=2):
    """Generate the opposite-sentiment version of a review with the fine-tuned model.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        review: Review text to invert; surrounding whitespace is stripped.
        max_length: Token limit applied both when truncating the prompt and
            when generating the output.
        num_beams: Beam width used for beam search.
        no_repeat_ngram_size: Size of the n-grams that may not repeat in the
            generated text.

    Returns:
        The decoded generated text with special tokens removed.
    """
    # Same task prefix that is prepended to the model inputs in the dataset cells
    input_text = "invert polarity: " + review.strip()

    # Tokenize through __call__ so the attention mask is returned alongside the
    # token IDs and can be handed to generate() explicitly.
    encoded = tokenizer(
        input_text, return_tensors='pt', truncation=True, max_length=max_length
    ).to(device)

    # Beam-search decoding of the inverted review
    outputs = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
        no_repeat_ngram_size=no_repeat_ngram_size,
    )

    # Only one sequence is generated per call; decode it back to plain text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

### Testing Polarity Inversion
Tests the fine-tuned model on a list of sample reviews and displays the inverted polarity results.

In [None]:
# Sample reviews to test (a mix of positive and negative statements)
test_reviews = [
    "The product was bad",
    "This was the worst experience I've ever had.",
    "The service at this restaurant was fantastic and the food was delicious.",
    "I wouldn't recommend this to anyone. Complete waste of money.",
    "An amazing performance by the lead actor. Truly captivating!",
    "The software crashes frequently and is full of bugs.",
    "I am extremely satisfied with my purchase. Great value for the price.",
    "Terrible customer service. They were rude and unhelpful.",
    "Best vacation ever! The hotel staff was friendly and the amenities were top-notch.",
    "The book was bad.",
]

# Invert the polarity of each review and print the results
for review in test_reviews:
    inverted = invert_polarity(review)  # Get the inverted polarity
    print(f"Original Review: {review}")  # Print the original review
    print(f"Inverted Review: {inverted}")  # Print the inverted review
    print("-" * 80)  # Separator line between reviews, matching the cell's saved output

Original Review: The product was bad
Inverted Review: The product was fantastic!
--------------------------------------------------------------------------------
Original Review: This was the worst experience I've ever had.
Inverted Review: This was one of the best experiences I've ever had.
--------------------------------------------------------------------------------
Original Review: The service at this restaurant was fantastic and the food was delicious.
Inverted Review: The service at this restaurant was terrible and the food was awful.
--------------------------------------------------------------------------------
Original Review: I wouldn't recommend this to anyone. Complete waste of money.
Inverted Review: I highly recommend this to everyone. A fantastic investment of money!
--------------------------------------------------------------------------------
Original Review: An amazing performance by the lead actor. Truly captivating!
Inverted Review: A terrible performance by th

### Perplexity Computation
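Reloads the saved model and computes token-level perplexity (the exponential of the average per-token cross-entropy loss) on a 10% split of the dataset. The split is drawn afresh in this cell, so it is a true held-out set only if it matches the split used during training.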

In [None]:
# Compute Perplexity on Test Data

import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    BartTokenizerFast,
    BartForConditionalGeneration,
)
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm

# Load the fine-tuned model and tokenizer saved by the training cell
tokenizer = BartTokenizerFast.from_pretrained('./polarity_inversion_model')
model = BartForConditionalGeneration.from_pretrained('./polarity_inversion_model')

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Load the dataset
data = pd.read_csv('finaldataset.csv')

# Ensure all reviews are strings
data['original_review'] = data['original_review'].astype(str)
data['inverted_review'] = data['inverted_review'].astype(str)

# Add a task-specific prefix to the original reviews
data['input_text'] = 'invert polarity: ' + data['original_review']

# Convert the DataFrame to a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(data[['input_text', 'inverted_review']])

# Split the dataset into training and test sets.
# The seed is fixed so the same 10% subset (and therefore the same perplexity)
# is produced on every run.
# NOTE(review): this split is drawn independently of the one used for training.
# Unless the training cell used the same seed, these rows overlap the training
# data and the perplexity is optimistic (recorded run: ~0.49 average loss here
# vs. an eval_loss of ~1.28 at the end of training) — confirm and align the seeds.
SPLIT_SEED = 42
tokenized_dataset = hf_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
test_dataset = tokenized_dataset['test']

def preprocess_function(examples):
    """Tokenize a batch of prompts and targets into model inputs and labels.

    Adds ``input_ids`` and ``attention_mask`` (from ``input_text``) and
    ``labels`` (from ``inverted_review``) to ``examples`` and returns it.
    Both texts are padded/truncated to 128 tokens using the notebook-level
    ``tokenizer``.
    """
    tokenize_kwargs = dict(padding='max_length', truncation=True, max_length=128)
    source_enc = tokenizer(examples['input_text'], **tokenize_kwargs)
    target_enc = tokenizer(examples['inverted_review'], **tokenize_kwargs)

    examples['input_ids'] = source_enc.input_ids
    examples['attention_mask'] = source_enc.attention_mask

    # Padding positions in the targets become -100 so the loss computation
    # ignores them.
    pad_id = tokenizer.pad_token_id
    examples['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in target_enc.input_ids
    ]
    return examples

# Tokenize the test split and drop the raw text columns
test_dataset = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['input_text', 'inverted_review'],
)

# Expose only the model inputs, as torch tensors
columns = ['input_ids', 'attention_mask', 'labels']
test_dataset.set_format(type='torch', columns=columns)

# Batches of four examples, in dataset order
test_loader = DataLoader(test_dataset, batch_size=4)

# Evaluation mode for the forward passes below
model.eval()

# Running sums over the whole test split
total_loss = 0.0
total_tokens = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Computing Perplexity"):
        # Move the three tensors of this batch onto the model's device
        input_ids, attention_mask, labels = (batch[name].to(device) for name in columns)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Count the label positions that are scored (everything except -100)
        num_tokens = int(labels.ne(-100).sum())

        # outputs.loss is an average, so weight it by the token count to
        # accumulate a per-token total across batches
        batch_loss = outputs.loss.item()
        total_tokens += num_tokens
        total_loss += batch_loss * num_tokens

# Perplexity is the exponential of the mean per-token loss
average_loss = total_loss / total_tokens
perplexity = np.exp(average_loss)

print(f"Total Loss: {total_loss:.4f}")
print(f"Total Tokens: {total_tokens}")
print(f"Average Loss per token: {average_loss:.4f}")
print(f"Perplexity: {perplexity:.4f}")


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 724/724 [00:00<00:00, 9564.33 examples/s]
Computing Perplexity: 100%|██████████| 181/181 [00:04<00:00, 43.62it/s]

Total Loss: 27789.5926
Total Tokens: 56738
Average Loss per token: 0.4898
Perplexity: 1.6320



