# Training Notebook for the LM-Fluency model

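The training code is adapted from *The Summary Loop* ([CannyLab/summary_loop](https://github.com/CannyLab/summary_loop)). This notebook fine-tunes the `dbmdz/bert-base-german-cased` checkpoint with a masked-language-modelling objective on Wikipedia paragraphs loaded from `../data/all_wiki_paragraphs.csv`.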

### Load data

In [11]:
import pandas as pd

# Location of the paragraph dump, relative to this notebook.
WIKI_PARAGRAPHS_CSV = "../data/all_wiki_paragraphs.csv"

# Read the complete CSV into a DataFrame; later cells use its `paragraph` column.
wiki_paragraphs = pd.read_csv(filepath_or_buffer=WIKI_PARAGRAPHS_CSV)

In [12]:
# Shuffle all rows (frac=1 keeps every row, only the order changes) so that the
# positional train/eval split further down is not tied to the CSV's row order.
# The fixed random_state makes the shuffle -- and therefore the split --
# reproducible after a kernel restart; without it every run trains and
# evaluates on different rows.
# NOTE: this cell rebinds `wiki_paragraphs`, so running it twice in one kernel
# session shuffles the already-shuffled frame again.
wiki_paragraphs = wiki_paragraphs.sample(frac=1, random_state=42)
wiki_paragraphs.head()

Unnamed: 0,title,paragraph
687486,Johann Theodor von Scheffer,Großen politischen Einfluss gewann Scheffer un...
786382,Krankenhaus West (Stralsund),"Die Klinikumskirche, das Gebäude 20 der Anlage..."
77804,Washington Capitals,Im NHL Entry Draft 2004 durften die Capitals a...
1046720,Gerrit Willemsz Horst,"Im ""Lexikon der holländischen Stillebenmaler"" ..."
131753,Kloster Mariastein,"Eine Legende berichtet, dass ein kleiner Hirte..."


Split the shuffled paragraphs into a training set (first 90%) and an evaluation set (remaining 10%).

In [26]:
# Positional 90/10 split of the `paragraph` column: everything before
# `split_at` is used for training, the rest is held out for evaluation.
paragraphs = wiki_paragraphs['paragraph']
split_at = int(len(wiki_paragraphs) * 0.9)  # index of the first evaluation row

train_d, eval_d = (
    part.to_numpy()
    for part in (paragraphs[:split_at], paragraphs[split_at:])
)

### Training

Load model

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers.optimization import AdamW

# Loading Model
# Tokenizer and model must come from the same checkpoint, so the name is
# defined once and reused for both.
MODEL_NAME = "dbmdz/bert-base-german-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# AutoModelForMaskedLM replaces the deprecated AutoModelWithLMHead; the training
# cell reads `outputs.loss` from this model when `labels` are passed.
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME).to('cuda')
# Put the model in training mode. The trailing semicolon suppresses the
# module repr that would otherwise be printed as the cell output.
model.train();

Prepare optimizer

In [8]:
import torch
import numpy as np

# Parameters whose name contains one of these fragments are exempt from
# weight decay; all other parameters are decayed with 0.01.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
param_optimizer = list(model.named_parameters())

# Partition the parameters in a single pass, keeping their original order.
decayed, undecayed = [], []
for name, param in param_optimizer:
    exempt = any(fragment in name for fragment in no_decay)
    (undecayed if exempt else decayed).append(param)

optimizer_grouped_parameters = [
    {'params': decayed, 'weight_decay': 0.01},
    {'params': undecayed, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)
# Gradient scaler used together with torch.autocast in the training cell.
scaler = torch.cuda.amp.GradScaler()

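Train the model with masked-language modelling: in every batch roughly 15% of the non-special tokens are replaced by the mask token, the optimizer steps once every 5 batches, and every 20 batches the model is evaluated on held-out paragraphs and the weights are saved whenever the evaluation loss improves.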

In [None]:
import os

import wandb

# --- Run configuration ------------------------------------------------------
N_EPOCHS = 5
BATCH_SIZE = 8          # approximate number of paragraphs per training batch
ACCUM_STEPS = 5         # optimizer step once every ACCUM_STEPS batches
EVAL_EVERY = 20         # run an evaluation pass every EVAL_EVERY batches
EVAL_BATCHES = 5        # number of evaluation batches per pass
EVAL_BATCH_SIZE = 10    # paragraphs per evaluation batch
EVAL_MASK_SEED = 0      # fixed seed so every evaluation pass uses the same masks
MASK_PROB = 0.15        # probability of masking each non-special token
model_output_file = os.path.join("E:/models/", "distilbert_wiki_finetune.bin")

# Ids that must never be replaced by the mask token (padding, [CLS], [SEP], ...).
# Taken from the tokenizer instead of hard-coding vocabulary positions, so the
# cell keeps working if the checkpoint (and therefore the vocabulary) changes.
SPECIAL_IDS = torch.tensor(tokenizer.all_special_ids)


def mask_tokens(encoding, generator=None):
    """Turn a tokenizer output into a masked-language-modelling batch, in place.

    Args:
        encoding: tokenizer output holding an ``input_ids`` tensor (created
            with ``return_tensors="pt"``), still on the CPU.
        generator: optional ``torch.Generator`` used to draw the mask; pass a
            seeded generator to get reproducible masks.

    Returns:
        The same ``encoding`` object, with ``labels`` set to a copy of the
        unmasked ids and a random subset of the non-special ``input_ids``
        replaced by ``tokenizer.mask_token_id``.
    """
    input_ids = encoding["input_ids"]
    # NOTE(review): labels cover every position (unmasked and padding tokens
    # too), so the loss is not restricted to the masked tokens. Kept as in the
    # original notebook -- confirm this is intended.
    encoding["labels"] = input_ids.detach().clone()

    rand = torch.rand(input_ids.shape, generator=generator)
    maskable = ~torch.isin(input_ids, SPECIAL_IDS)
    # Vectorised replacement for the former per-token Python double loop.
    input_ids[(rand < MASK_PROB) & maskable] = tokenizer.mask_token_id
    return encoding


def evaluate():
    """Return the mean loss over the first EVAL_BATCHES batches of ``eval_d``.

    The model is switched to eval mode for the duration of the pass (so dropout
    is disabled) and restored to its previous mode afterwards. Masks are drawn
    from a freshly seeded generator, so successive passes score exactly the
    same masked inputs and their losses are directly comparable.

    Raises:
        ValueError: if ``eval_d`` contains no texts.
    """
    was_training = model.training
    model.eval()
    mask_rng = torch.Generator().manual_seed(EVAL_MASK_SEED)
    batch_losses = []
    try:
        with torch.no_grad():
            for k in range(EVAL_BATCHES):
                # Walk through consecutive slices; the original evaluated
                # eval_d[0:10] on every iteration.
                eval_texts = eval_d[k * EVAL_BATCH_SIZE:(k + 1) * EVAL_BATCH_SIZE].tolist()
                if not eval_texts:
                    break  # evaluation set is smaller than EVAL_BATCHES batches
                inputs = tokenizer(eval_texts, return_tensors="pt", truncation=True, padding="longest")
                inputs = mask_tokens(inputs, generator=mask_rng).to("cuda")
                with torch.autocast("cuda"):
                    batch_losses.append(model(**inputs).loss.item())
    finally:
        model.train(was_training)

    if not batch_losses:
        raise ValueError("eval_d is empty; cannot compute an evaluation loss")
    return sum(batch_losses) / len(batch_losses)


wandb.init(project="Fluency Finetune")
wandb.run.name = "test"
wandb.run.save()

# At least one section, so np.array_split does not raise on very small datasets.
n_batches = max(1, len(train_d) // BATCH_SIZE)

eval_loss_min = float("inf")

# Drop any gradients left over from a previous, interrupted run of this cell.
optimizer.zero_grad()

for n_epoch in range(N_EPOCHS):
    for b_idx, batch in enumerate(np.array_split(train_d, n_batches), start=1):
        if len(batch) == 0:
            continue
        print("Training Batch #", b_idx)

        # tokenize and mask
        inputs = tokenizer(batch.tolist(), return_tensors="pt", truncation=True, padding="longest")
        inputs = mask_tokens(inputs).to("cuda")

        with torch.autocast("cuda"):
            loss = model(**inputs).loss

        # Gradients are accumulated (summed, not averaged) across batches.
        scaler.scale(loss).backward()

        if b_idx % ACCUM_STEPS == 0:  # optimize every ACCUM_STEPS steps
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        # Log a plain float so the logger does not hold on to the autograd graph.
        wandb.log({'train_loss': loss.item()})

        torch.cuda.empty_cache()

        if b_idx % EVAL_EVERY == 0:
            eval_loss = evaluate()
            print("Eval Loss: %.3f" % eval_loss)

            # Keep only the weights with the best evaluation loss seen so far.
            if eval_loss < eval_loss_min:
                eval_loss_min = eval_loss
                torch.save(model.state_dict(), model_output_file)

            # Log the averaged evaluation loss (the original logged the loss of
            # the last evaluation batch only).
            wandb.log({'eval_loss': eval_loss})
            