# PyTorch training script
This notebook demonstrates how I trained TinyMistral.

# Installing libraries
We'll install any dependency that we need for this notebook.

In [1]:
# I'll also be installing one of my other packages for a model that I have private.
# This package contains nice utilities that I'd rather not code again.
!pip install --upgrade sentia datasets transformers evaluate rouge_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from evaluate import load as load_metric
import sacrebleu
from tqdm import tqdm
import math
from sentia import SENTIAForCausalLM
import wandb
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from typing import Optional
from dataclasses import dataclass, field
import torch.nn.functional as F

Collecting sentia
  Obtaining dependency information for sentia from https://files.pythonhosted.org/packages/41/fa/123cb81e3daf5259bd3e4f4849ef8e69e1dac0673dc0d0b408fe04557fb6/sentia-1.15-py3-none-any.whl.metadata
  Downloading sentia-1.15-py3-none-any.whl.metadata (2.0 kB)
Collecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/e2/cf/db41e572d7ed958e8679018f8190438ef700aeb501b62da9e1eed9e4d69a/datasets-2.15.0-py3-none-any.whl.metadata
  Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/12/dd/f17b11a93a9ca27728e12512d167eb1281c151c4c6881d3ab59eb58f4127/transformers-4.35.2-py3-none-any.whl.metadata
  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.5/123.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollectin



# String to dtype mapping

We'll define a constant that maps dtype names to `torch` dtypes, which makes it easy to choose a dtype from the command line. (This would be more relevant if this were a script rather than a notebook.)

In [2]:
# Lookup table from the dtype names accepted in the training config to the
# corresponding torch dtype objects.
STRING_TO_DTYPE_MAPPING = dict(
    bfloat16=torch.bfloat16,
    float32=torch.float32,
    float16=torch.float16,
    float64=torch.float64,
)


# Dataset classes

We'll define the dataset classes that will be used for data preprocessing.

In [3]:
class ConversationDataset(Dataset):
    """Turns instruction/response rows into fixed-length causal-LM examples.

    Each row is rendered as
    ``<|USER|> {user} <|ASSISTANT|> {assistant} <|endoftext|>``, tokenized,
    truncated to ``max_length`` and right-padded to exactly ``max_length``.

    Args:
        tokenizer: Object exposing ``encode`` and ``pad_token_id`` (and
            ``eos_token_id`` as a padding fallback).
        max_length: Length every returned sequence is truncated/padded to.
        data: Indexable collection of rows. A row has either ``"Input"`` and
            ``"Output"`` keys, or ``"question"``, ``"choices"`` and ``"answer"``
            keys (``"answer"`` being an index into ``"choices"``).
        device: Device the returned tensors are created on.
    """

    def __init__(self, tokenizer, max_length=512, data=None, device="cuda"):
        self.data = data
        self.device = device
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _pad_token_id(self):
        """Return the id used for padding, falling back to EOS when no pad token is set."""
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = self.tokenizer.eos_token_id
        if pad_id is None:
            raise ValueError("tokenizer defines neither pad_token_id nor eos_token_id; cannot pad")
        return pad_id

    def __getitem__(self, idx):
        """Return ``{"input_ids", "labels"}`` int64 tensors of length ``max_length``."""
        # Fetch the row once instead of re-indexing the dataset for every field.
        row = self.data[idx]
        try:
            # Instruction-tuning layout (e.g. InstructMix).
            user = row["Input"]
            assistant = row["Output"]
        except KeyError:
            # Multiple-choice layout (e.g. MMLU): the answer is an index into the choices.
            user = row["question"]
            assistant = row["choices"][row["answer"]]

        text = f"<|USER|> {user} <|ASSISTANT|> {assistant} <|endoftext|>"
        # Inputs and labels are the same token sequence, so tokenize only once.
        token_ids = self.tokenizer.encode(
            text, add_special_tokens=True, max_length=self.max_length, truncation=True
        )
        token_ids = token_ids + [self._pad_token_id()] * (self.max_length - len(token_ids))

        # NOTE(review): padding positions are left unmasked in the labels, so the
        # loss presumably also covers padding - confirm this is intended.
        # NOTE(review): creating tensors on a CUDA device here likely prevents
        # using DataLoader worker processes - confirm before setting num_workers.
        input_ids = torch.tensor(token_ids, dtype=torch.int64, device=self.device)
        return {
            "input_ids": input_ids,
            # Independent copy so in-place edits to one tensor never affect the other.
            "labels": input_ids.clone(),
        }
class CompletionDataset(Dataset):
    """Turns plain-text rows into fixed-length causal-LM examples.

    Each row's ``"text"`` field is suffixed with a space and the tokenizer's
    EOS token string, tokenized, truncated to ``max_length`` and right-padded
    to exactly ``max_length``.

    Args:
        tokenizer: Object exposing ``encode``, ``eos_token`` and
            ``pad_token_id`` (and ``eos_token_id`` as a padding fallback).
        data: Indexable collection of rows with a ``"text"`` key.
        max_length: Length every returned sequence is truncated/padded to.
        device: Device the returned tensors are created on.
    """

    def __init__(self, tokenizer, data, max_length=256, device="cuda"):
        self.data = data
        self.device = device
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _pad_token_id(self):
        """Return the id used for padding, falling back to EOS when no pad token is set."""
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = self.tokenizer.eos_token_id
        if pad_id is None:
            raise ValueError("tokenizer defines neither pad_token_id nor eos_token_id; cannot pad")
        return pad_id

    def __getitem__(self, idx):
        """Return ``{"input_ids", "labels"}`` int64 tensors of length ``max_length``."""
        text = self.data[idx]["text"]
        sample = f"{text} {self.tokenizer.eos_token}"
        # Inputs and labels are the same token sequence, so tokenize only once.
        token_ids = self.tokenizer.encode(
            sample, add_special_tokens=True, max_length=self.max_length, truncation=True
        )
        token_ids = token_ids + [self._pad_token_id()] * (self.max_length - len(token_ids))

        # NOTE(review): padding positions are left unmasked in the labels, so the
        # loss presumably also covers padding - confirm this is intended.
        input_ids = torch.tensor(token_ids, dtype=torch.int64, device=self.device)
        return {
            "input_ids": input_ids,
            # Independent copy so in-place edits to one tensor never affect the other.
            "labels": input_ids.clone(),
        }


# Training loop

We'll define the training loop, with a few metrics to keep track of the model's learning.

In [4]:
def train(model, dataloader, optimizer, tokenizer, device="cuda"):
    """Run one training epoch and return the mean batch loss.

    Every batch gets a forward pass, a backward pass and an optimizer step.
    Loss, perplexity, sentence-level BLEU, accuracy and the SENTIA reward are
    printed per batch and, when a Weights & Biases run is active, logged there.

    Args:
        model: Model whose forward accepts ``input_ids`` and ``labels`` and
            whose output yields ``(loss, logits)`` as its first two items.
        dataloader: Iterable of dicts with ``"input_ids"`` and ``"labels"`` tensors.
        optimizer: Optimizer stepping the model's parameters.
        tokenizer: Tokenizer used to decode ids into text for BLEU.
        device: Device the model and batches are moved to.

    Returns:
        Mean loss over the processed batches (``0.0`` if there were none).
    """
    model.train()
    model.to(device=device)
    total_loss = 0.0
    num_batches = 0

    # Wrap the dataloader itself (not the enumerate object) so tqdm knows the total.
    for i, batch in enumerate(tqdm(dataloader)):
        input_ids = batch["input_ids"].to(device)
        target_ids = batch["labels"].to(device)

        # Forward pass: the model computes the loss from the labels.
        outputs = model(input_ids=input_ids, labels=target_ids)
        loss, logits = outputs[:2]

        # Greedy predictions. argmax over logits equals argmax over softmax(logits),
        # so the softmax is skipped; detach keeps metrics out of the autograd graph.
        predictions = logits.detach().argmax(dim=-1)
        predictions_str = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions.tolist()]
        target_ids_str = [tokenizer.decode(tgt, skip_special_tokens=True) for tgt in target_ids.tolist()]
        print(predictions_str[0])

        bleu_scores = [
            sacrebleu.sentence_bleu(pred_str, [target_str]).score
            for pred_str, target_str in zip(predictions_str, target_ids_str)
        ]
        # NOTE(review): predictions are compared with the targets position by
        # position; whether calculate_accuracy accounts for the next-token shift
        # is not visible here - confirm.
        accuracy_scores = [
            SENTIAForCausalLM.calculate_accuracy(pred_id, target_id)
            for pred_id, target_id in zip(predictions, target_ids)
        ]
        accuracy = sum(accuracy_scores) / len(accuracy_scores)
        bleu = sum(bleu_scores) / len(bleu_scores)

        # Reward for the first sequence of the batch only; used purely as a metric.
        reward, penalty = SENTIAForCausalLM.get_reward(predictions.tolist()[0], target_ids.tolist()[0], bleu)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Update the metrics
        loss_value = loss.item()
        perplexity = torch.exp(loss.detach()).item()
        total_loss += loss_value
        num_batches += 1

        # Log only when a wandb run exists. Previously an undefined name (`ol`)
        # raised NameError inside a bare `except`, so nothing was ever logged.
        if wandb.run is not None:
            wandb.log({"loss": loss_value, "bleu": bleu, "perplexity": perplexity, "accuracy": accuracy})
        print(
            f"Batch {i + 1}/{len(dataloader)}: Loss - {loss_value:.4f}, NetReward - {reward - penalty:.4f}, BLEU - {bleu:.4f}, Perplexity - {perplexity}, Accuracy - {accuracy}")

    # max(..., 1) avoids a ZeroDivisionError when the dataloader is empty.
    return total_loss / max(num_batches, 1)

# Evaluation loop

We'll define an evaluation loop with multiple metrics to track the model's performance: loss, perplexity, BLEU, and ROUGE.

In [5]:
def evaluate(model, val_loader, tokenizer, use_cuda=True):
    """Evaluate the model on a validation loader.

    Runs the model without gradients over ``val_loader``, accumulates the loss,
    and scores the greedily decoded (argmax) predictions against the decoded
    labels with BLEU and ROUGE.

    Args:
        model: Model whose forward accepts ``input_ids`` and ``labels`` and
            returns an object with ``.loss`` and ``.logits``.
        val_loader: Sized iterable of dicts with ``"input_ids"`` and ``"labels"``.
        tokenizer: Tokenizer used to decode ids into text.
        use_cuda: Evaluate on CUDA when it is available; otherwise on CPU.

    Returns:
        Dict with ``val_loss``, ``val_perplexity``, ``val_bleu`` and ``val_rouge``.

    Raises:
        ValueError: If ``val_loader`` contains no batches.
    """
    if len(val_loader) == 0:
        raise ValueError("val_loader is empty; nothing to evaluate")

    model.eval()
    device = torch.device('cuda' if use_cuda and torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Load metrics
    bleu_metric = load_metric('bleu')
    rouge_metric = load_metric('rouge')

    total_loss = 0.0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            # Move batch to the correct device
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}

            # Forward pass
            outputs = model(input_ids=batch["input_ids"], labels=batch["labels"])
            total_loss += outputs.loss.item()

            # Greedy (argmax) token predictions, decoded to text for BLEU/ROUGE.
            predictions = tokenizer.batch_decode(outputs.logits.argmax(dim=-1), skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)
            # Each prediction gets a list of references (here a single one).
            references = [[ref] for ref in decoded_labels]

            # Accumulate examples inside the metric objects.
            bleu_metric.add_batch(predictions=predictions, references=references)
            rouge_metric.add_batch(predictions=predictions, references=references)

    # Score the accumulated batches. Passing the predictions to compute() again
    # would add them a second time, so every example would be counted twice.
    bleu_score = bleu_metric.compute()
    rouge_score = rouge_metric.compute()

    # Perplexity is exp(mean loss), assuming the loss is a mean negative log likelihood.
    mean_loss = total_loss / len(val_loader)
    perplexity = torch.exp(torch.tensor(mean_loss))

    metrics = {
        'val_loss': mean_loss,
        'val_perplexity': perplexity.item(),
        'val_bleu': bleu_score['bleu'],
        'val_rouge': rouge_score,
    }
    # wandb.log takes the metrics dict as one positional argument; the previous
    # `wandb.log(**metrics)` raised a TypeError that a bare `except` swallowed.
    if wandb.run is not None:
        wandb.log(metrics)

    return metrics

# TrainArgs

Here we'll define a dataclass that holds the training configuration.

In [6]:
@dataclass
class TrainArgs:
    """Configuration for a training run: model, data sources, output and dtype."""

    # --- Model / optimisation ---
    # Pretrained model name or path
    model: str = "Locutusque/TinyMistral-248M"
    batch_size: int = 8
    num_epochs: int = 3
    learning_rate: float = 5e-5
    # Evaluated once, when the class is defined: CUDA if available, else CPU.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"

    # --- Training data ---
    dataset: str = "Skylion007/openwebtext"
    # Configuration for the dataset if required
    datasetconfig: Optional[str] = None
    split: str = "train"

    # --- Validation data ---
    val_dataset: str = "Skylion007/openwebtext"
    val_datasetconfig: Optional[str] = None
    val_split: str = "validation"

    # --- Output ---
    save_dir: str = "./saved_models"

    # Name of the dtype to load the model in (optional, depends on your use case)
    dtype: str = "float32"

# Define the main function

This is where the loop will go, and the model will be loaded and trained.

In [7]:
def main(args: TrainArgs):
    """Load tokenizer, data and model, then train and evaluate for ``args.num_epochs``.

    After every epoch the model and tokenizer are saved to ``args.save_dir``.
    On ``KeyboardInterrupt`` both are saved once more before exiting.

    Args:
        args: Training configuration.

    Raises:
        ValueError: If ``args.dtype`` is not a key of ``STRING_TO_DTYPE_MAPPING``.
    """
    # Fail early on a misspelt dtype instead of silently passing dtype=None
    # to model.to() further down.
    if args.dtype not in STRING_TO_DTYPE_MAPPING:
        raise ValueError(
            f"Unknown dtype {args.dtype!r}; expected one of {sorted(STRING_TO_DTYPE_MAPPING)}"
        )
    dtype = STRING_TO_DTYPE_MAPPING[args.dtype]

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    tokenizer.add_special_tokens({"additional_special_tokens": ["<|USER|>", "<|ASSISTANT|>"]})
    train_rows = load_dataset(args.dataset, args.datasetconfig, split=args.split)
    val_rows = load_dataset(args.val_dataset, args.val_datasetconfig, split=args.val_split)
    # Uncomment this if you want to use wandb.
    #wandb.init(dir="", project="")

    # Build tensors on the configured device; the datasets default to "cuda",
    # which fails on CPU-only machines.
    train_data = ConversationDataset(tokenizer, data=train_rows, max_length=256, device=args.device)
    val_data = ConversationDataset(tokenizer, data=val_rows, max_length=256, device=args.device)
    train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=args.batch_size)

    # Initialize the model; the embedding matrix must grow to cover the added tokens.
    model = AutoModelForCausalLM.from_pretrained(args.model)
    model.resize_token_embeddings(len(tokenizer))
    model.to(args.device, dtype=dtype)

    # Define the optimizer
    optimizer = optim.Adamax(model.parameters(), lr=args.learning_rate)

    # Keep evaluation on the same kind of device as training.
    use_cuda = str(args.device).startswith("cuda")

    # Training and evaluation loops
    try:
        for epoch in range(args.num_epochs):
            print(f'Epoch: {epoch+1:02}')
            train_loss = train(model, train_loader, optimizer, tokenizer, args.device)
            val_metrics = evaluate(model, val_loader, tokenizer, use_cuda=use_cuda)

            print(f'Epoch: {epoch+1:02}')
            print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
            print(f'\tValidation metrics: {val_metrics}')

            model.save_pretrained(args.save_dir)
            # Save the tokenizer too: it carries the added special tokens that the
            # resized embedding matrix depends on.
            tokenizer.save_pretrained(args.save_dir)
    except KeyboardInterrupt:
        print("Saving and cleaning up the model...")
        print("Do NOT kill the terminal it WILL corrupt the model files")
        model.save_pretrained(args.save_dir)
        tokenizer.save_pretrained(args.save_dir)
        quit(0)

In [8]:
# Train on rows 100-50099 of InstructMix's train split and validate on its
# first 100 rows, so the two slices do not overlap.
args = TrainArgs(
    model="Locutusque/TinyMistral-248M",
    dataset="Locutusque/InstructMix",
    split="train[100:50100]",
    val_dataset="Locutusque/InstructMix",
    val_split="train[:100]",
)

main(args)

tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading builder script:   0%|          | 0.00/8.48k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.72M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/562 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/992M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Epoch: 01


0it [00:00, ?it/s]

 { =2
2


1it [00:02,  2.88s/it]

Batch 1/13: Loss - 0.4687, NetReward - 259.5000, BLEU - 2.2019, Perplexity - 1.5979504585266113, Accuracy - 0.91552734375
The The


2it [00:03,  1.79s/it]

Batch 2/13: Loss - 0.4652, NetReward - 511.2500, BLEU - 8.6004, Perplexity - 1.5922991037368774, Accuracy - 0.92138671875
The1 = = = = = = = = =1
 =


3it [00:04,  1.44s/it]

Batch 3/13: Loss - 0.8612, NetReward - 254.7500, BLEU - 3.5386, Perplexity - 2.3660149574279785, Accuracy - 0.8291015625
The



4it [00:05,  1.28s/it]

Batch 4/13: Loss - 1.5200, NetReward - 256.2500, BLEU - 2.8414, Perplexity - 4.572220325469971, Accuracy - 0.646484375
The



5it [00:06,  1.18s/it]

Batch 5/13: Loss - 1.2672, NetReward - 512.2500, BLEU - 5.6518, Perplexity - 3.5509769916534424, Accuracy - 0.7197265625
The1 = = = firstrusum
 = = =

 =


6it [00:07,  1.13s/it]

Batch 6/13: Loss - 0.6357, NetReward - 515.2500, BLEU - 10.8327, Perplexity - 1.8884326219558716, Accuracy - 0.8583984375
The_


7it [00:09,  1.10s/it]

Batch 7/13: Loss - 0.5141, NetReward - 512.2500, BLEU - 6.1759, Perplexity - 1.6721891164779663, Accuracy - 0.87646484375
The1emside,idays  .Cinger
 1999 ; 1910 ,1
1


8it [00:10,  1.08s/it]

Batch 8/13: Loss - 0.5292, NetReward - 259.7500, BLEU - 3.3201, Perplexity - 1.6975147724151611, Accuracy - 0.8759765625
The1's The'ity
 .S. W.kner , 1912


1


9it [00:11,  1.06s/it]

Batch 9/13: Loss - 0.9589, NetReward - 515.7500, BLEU - 10.4390, Perplexity - 2.6087839603424072, Accuracy - 0.7548828125
The
 1993, the firster familys - @- theen from and theothy waser was her store in  age of the building .  Woodtonum .
 was to workvise the school until and she the her her and and her a chance of needed. Sheothy was her mother wereated with the one of . The ling ofs  Love , The  Book @ ,ed book andathers the , The 1919 ,othy waser , of a heart attack , Sheer was a to attend her own , the of extent , her death's death . and she her booksg her family children wasoured into her. and she was not to find to for newint glass window . .  home's home . the. Marymund's birthday .b , , She




10it [00:12,  1.05s/it]

Batch 10/13: Loss - 1.1402, NetReward - 286.7500, BLEU - 1.1967, Perplexity - 3.1274824142456055, Accuracy - 0.72314453125
The
 Pmitized of the" by the Rock"ms" .S. . ,
 to the  same time as the as  19,1 to  1963.
lying to be A" , the  1863 , the dateathous " of which , the   of the  , "  the of the timelington Club been closed with the to theham , the thelivience to the of the of Staffdersance and and , Columbia , .
 is became the beginning of the uation of thenance . . the Rock , Arkansas the first of therendered to the Unitedancing forces government . the,arn .s  .peditionary  11, 1862 . The




11it [00:13,  1.05s/it]

Batch 11/13: Loss - 1.2610, NetReward - 280.7500, BLEU - 2.4079, Perplexity - 3.5288262367248535, Accuracy - 0.6826171875
The
er' in thecesterton in 11th 1979.  85
. She years arrangements were held at the in the.rington,, one in Ster,s homeon .' She funeralhes were found in theorrington Church and . She 1999, she wasbur ,   of theemin , , 1999 , was the bookower My , in He




12it [00:14,  1.05s/it]

Batch 12/13: Loss - 0.6822, NetReward - 523.0000, BLEU - 5.0314, Perplexity - 1.97823965549469, Accuracy - 0.79638671875
The
   19th0s, theer was to work the was a her to her children. the herself on on her subjects. She and friends were her be toular activities non works . and she later not The




13it [00:15,  1.16s/it]

Batch 13/13: Loss - 0.6531, NetReward - 260.2500, BLEU - 2.3486, Perplexity - 1.9214311838150024, Accuracy - 0.8544921875





Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Evaluating: 100%|██████████| 13/13 [00:02<00:00,  5.96it/s]


Epoch: 01
	Train Loss: 0.843 | Train PPL:   2.323
	Validation metrics: {'val_loss': 1.2361177939635057, 'val_perplexity': 3.4422242641448975, 'val_bleu': 0.037422328091953436, 'val_rouge': {'rouge1': 0.18577667792527455, 'rouge2': 0.035492463796036294, 'rougeL': 0.14399339390379115, 'rougeLsum': 0.142500923607717}}
Epoch: 02


0it [00:00, ?it/s]

The
 = =
 hetic  = = 
 


1it [00:01,  1.11s/it]

Batch 1/13: Loss - 0.8132, NetReward - 515.7500, BLEU - 8.6692, Perplexity - 2.2552084922790527, Accuracy - 0.79052734375
The


2it [00:02,  1.07s/it]

Batch 2/13: Loss - 0.5391, NetReward - 512.5000, BLEU - 11.1235, Perplexity - 1.7145174741744995, Accuracy - 0.84619140625
The
 = = = rusons = =  
 


3it [00:03,  1.05s/it]

Batch 3/13: Loss - 0.7212, NetReward - 514.7500, BLEU - 9.6425, Perplexity - 2.0568747520446777, Accuracy - 0.78271484375
The
er's ', in 1999
 but her  1960 , sheer was to the10rd Woodtonons to 10Ps , , ydon , She was her on to to .rington , andsex , and , andfriendathed her her husband andwin, , and her her 'or Maryss . She the the residence in St mother deter to deteriorate and She was the the out of hospital homes wasvalescent homes . and sheended to her . friends . She
 


4it [00:04,  1.04s/it]

Batch 4/13: Loss - 0.6285, NetReward - 528.7500, BLEU - 9.6918, Perplexity - 1.874811053276062, Accuracy - 0.81103515625
The
 Pmities of the" by the Rock"ms" .S. . ,  to the  same time as the as  19,1 to  1863.
lying to the A" , the  1863 , the dateathous " of which for the   , April  , "  the of the timelington Club been closed with sold to theham , the thelivience to the of the of Staffdersance . and of Columbia , The
 is became the beginning of the uation of thenance . . the Rock , Arkansas the first of therendered to the Unionancing forces government . the Douele .s  .pedition .  11 , 1863 . The
 


5it [00:05,  1.04s/it]

Batch 5/13: Loss - 1.2145, NetReward - 538.2500, BLEU - 7.2919, Perplexity - 3.3685615062713623, Accuracy - 0.6513671875
The


6it [00:06,  1.04s/it]

Batch 6/13: Loss - 0.8619, NetReward - 512.5000, BLEU - 7.9106, Perplexity - 2.367687940597534, Accuracy - 0.7451171875
The0
ols,1aired) for the) the)
 


7it [00:07,  1.05s/it]

Batch 7/13: Loss - 0.6135, NetReward - 258.5000, BLEU - 4.1453, Perplexity - 1.846954584121704, Accuracy - 0.8076171875



8it [00:08,  1.05s/it]

Batch 8/13: Loss - 0.6292, NetReward - 512.5000, BLEU - 8.4909, Perplexity - 1.8760173320770264, Accuracy - 0.78173828125



9it [00:09,  1.05s/it]

Batch 9/13: Loss - 0.3821, NetReward - 512.5000, BLEU - 7.5634, Perplexity - 1.4653677940368652, Accuracy - 0.8837890625



10it [00:10,  1.04s/it]

Batch 10/13: Loss - 0.4757, NetReward - 256.5000, BLEU - 3.5535, Perplexity - 1.609196424484253, Accuracy - 0.87255859375

form  
 


11it [00:11,  1.04s/it]

Batch 11/13: Loss - 0.5558, NetReward - 513.0000, BLEU - 7.4118, Perplexity - 1.7434208393096924, Accuracy - 0.802734375



12it [00:12,  1.04s/it]

Batch 12/13: Loss - 0.7422, NetReward - 256.5000, BLEU - 3.6220, Perplexity - 2.100522994995117, Accuracy - 0.755859375

 --@-ewaelite are a grouphold andelong friend on theer .
 was said that "I am not be extent a by the . I by the way sense, but in a way of style matter . the way of the of have have." .
 also that fewness for the "  of the Lett,igan , theThe early and she and the Gs , @@ , "
 


13it [00:13,  1.03s/it]

Batch 13/13: Loss - 0.4044, NetReward - 270.7500, BLEU - 1.8706, Perplexity - 1.4983537197113037, Accuracy - 0.9013671875



Evaluating: 100%|██████████| 13/13 [00:02<00:00,  6.04it/s]


Epoch: 02
	Train Loss: 0.660 | Train PPL:   1.935
	Validation metrics: {'val_loss': 1.202731552032324, 'val_perplexity': 3.3291985988616943, 'val_bleu': 0.037862437295002736, 'val_rouge': {'rouge1': 0.1924113636236676, 'rouge2': 0.0357157964800078, 'rougeL': 0.14648140597457493, 'rougeLsum': 0.14643715086711367}}
Epoch: 03


0it [00:00, ?it/s]


: a Updatedued  
 


1it [00:01,  1.06s/it]

Batch 1/13: Loss - 0.7341, NetReward - 257.2500, BLEU - 4.9300, Perplexity - 2.083533763885498, Accuracy - 0.78125

LOely Mary Anner
110 June 1893 – 29 February 1963) was an English poetator and known for her series of illustr novelsations depicting theies and fair in Sheer wass workworks was in hood and the from in workshops in the ageydon School of Art . She work work work was theeting cards and avenile drawings coversations , and her first book, Theower Fairies , the  , was published in 1911 . She works were published in   year . 
 


2it [00:02,  1.04s/it]

Batch 2/13: Loss - 0.7058, NetReward - 533.7500, BLEU - 8.2474, Perplexity - 2.0255537033081055, Accuracy - 0.73046875
 er was asally in the and and a and-@ and @-@ and. but she was also wellent at painting and-@ and @-@ . . and herils , and in herel . She also a largebook , her own theuring the and .  also wrote that "I' a been to paint aively , my way that I naturally to me . but any doubt effort or effort . detail expression . She  


3it [00:03,  1.03s/it]

Batch 3/13: Loss - 1.0852, NetReward - 528.0000, BLEU - 6.3949, Perplexity - 2.96004581451416, Accuracy - 0.65234375



4it [00:04,  1.03s/it]

Batch 4/13: Loss - 0.3484, NetReward - 512.5000, BLEU - 11.0016, Perplexity - 1.4168387651443481, Accuracy - 0.87646484375
 =er was in thething Hospital in 19 September 1963. aged 89 years . She years arrangements were held at the in the.rington and , one in Ster's Churchon . , She motherhes were found in theorrington Church , .  1919 , she wasne , a parish of theenguin Books , 1990 , was the bookower Fairy of .   


5it [00:05,  1.02s/it]

Batch 5/13: Loss - 0.5118, NetReward - 530.0000, BLEU - 5.4033, Perplexity - 1.6682847738265991, Accuracy - 0.81689453125
 = Maway  born  friend and a inspiration on the mother . Sheer wass worklike included aalgic clothing and wellaway's children . , and sheer 's children are often popularancholy and more like- appearance . and to to the in technology technology . Sheer ' the in her interestical eye and a a to her .s childrenator , who Warrant ,  with herwich , sheator and B. Williamsward , designed theer .s work .   


6it [00:06,  1.03s/it]

Batch 6/13: Loss - 0.6772, NetReward - 527.7500, BLEU - 5.6826, Perplexity - 1.9684313535690308, Accuracy - 0.75732421875
 =er wass etches and  , and illustr , her . taken to children and family friends children of children children in andated to theities causes , char organizations events , and toited in the artworks . The also the covers , booksyets , and other a of bookscards . theaphael anduck and the artistsers . as the of Bookss the Worldies ,11919 ) and andasonside Booksiday , 1916 ) , and The's The and Girl (acters ( 1918 ) 1910 ) .
 work works Fymes ( the the ( 1920 ) .  Great of the Rings ( ( ( 1929 ) were  collection of a woman who was in theans and a farmbank , and publishedically ac received by  in 1800 , thesw , herighlaceaces ,11910 ) were the a family who Mary who wasues a father from a and a use of the "y of  book of a illustr friendotsoge Mc-@-- who ". and , , , hisally illustr that youngensian version experiment .  and Dan was


7it [00:07,  1.03s/it]

Batch 7/13: Loss - 0.8725, NetReward - 295.0000, BLEU - 3.6722, Perplexity - 2.3928308486938477, Accuracy - 0.6962890625



8it [00:08,  1.03s/it]

Batch 8/13: Loss - 0.4458, NetReward - 512.5000, BLEU - 8.6532, Perplexity - 1.5617531538009644, Accuracy - 0.84326171875
 ='s The andacters  . W. Waulkner , 1912 
 


9it [00:09,  1.02s/it]

Batch 9/13: Loss - 0.3334, NetReward - 516.5000, BLEU - 8.4207, Perplexity - 1.3957371711730957, Accuracy - 0.8583984375
 =aired  painted by 
 


10it [00:10,  1.03s/it]

Batch 10/13: Loss - 0.3323, NetReward - 514.2500, BLEU - 9.5667, Perplexity - 1.3941354751586914, Accuracy - 0.88427734375



11it [00:11,  1.03s/it]

Batch 11/13: Loss - 0.4473, NetReward - 512.5000, BLEU - 7.2057, Perplexity - 1.5641283988952637, Accuracy - 0.83203125
 =er was a from in the and literature in  1930 . She 1919 , the19 , , she published the art class at the Universityydon School of Art . where began the school . the 1920s . She  , she became a teaching position at 
 


12it [00:12,  1.03s/it]

Batch 12/13: Loss - 0.3252, NetReward - 524.7500, BLEU - 10.5997, Perplexity - 1.3842436075210571, Accuracy - 0.88671875
 = = =  B-@ ed
 = =  
 


13it [00:13,  1.01s/it]

Batch 13/13: Loss - 0.8529, NetReward - 514.7500, BLEU - 15.6588, Perplexity - 2.346405267715454, Accuracy - 0.703125



Evaluating: 100%|██████████| 13/13 [00:02<00:00,  6.05it/s]


Epoch: 03
	Train Loss: 0.590 | Train PPL:   1.804
	Validation metrics: {'val_loss': 1.1854317142413213, 'val_perplexity': 3.272099256515503, 'val_bleu': 0.040601190480815126, 'val_rouge': {'rouge1': 0.1936679557912107, 'rouge2': 0.03818811964951879, 'rougeL': 0.1489430670434423, 'rougeLsum': 0.14944877531720746}}


# Main loop

This is where we'll run the main function, and start the training process.