## Training for distilbert transformer
Follow this tutorial: https://huggingface.co/transformers/custom_datasets.html

In [None]:
import pandas as pd 
from transformers import DistilBertTokenizerFast, Trainer, TrainingArguments, DistilBertForSequenceClassification 
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score,precision_recall_fscore_support
from torch.nn.functional import softmax
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader



In [None]:
# Input CSV paths.
# NOTE(review): both paths point at the same file, so the "test" evaluation
# below runs on the data the model is fine-tuned on — confirm this is intended.
train_file = "../data/label_data.csv"
test_file = "../data/label_data.csv"
# CSV column names: the sentiment label and the raw comment text.
LABEL_COL = "class"
TEXT_COL = "comment"

## Three utility functions for transformers

In [None]:
def read_data(fname: str, lower_case: bool = False,
              label_col: "str | None" = None,
              text_col: "str | None" = None) -> "pd.DataFrame | None":
    """
    Load a labelled comment CSV and encode its string labels as integers.

    Only the label and text columns are read from the file. The labels
    "negative", "neutral" and "positive" are replaced by 0, 1 and 2; any
    other value is left unchanged.

    Args:
        fname: path of the CSV file (read as UTF-8).
        lower_case: when True, lower-case the text column.
        label_col: name of the label column; defaults to the module-level
            LABEL_COL.
        text_col: name of the text column; defaults to the module-level
            TEXT_COL.

    Returns:
        The loaded DataFrame, or None when the file is missing or cannot be
        opened (a message is printed in that case).
    """
    # Resolve the column names lazily so the notebook constants stay the
    # single source of truth unless a caller overrides them.
    label_col = LABEL_COL if label_col is None else label_col
    text_col = TEXT_COL if text_col is None else text_col

    # Keep the try body minimal: only the file access can raise these errors.
    try:
        df = pd.read_csv(fname, encoding="UTF-8", usecols=[label_col, text_col])
    except (FileNotFoundError, PermissionError):
        print("No files found. Check the data directory for files.")
        return None

    df[label_col] = df[label_col].replace({"negative": 0, "neutral": 1, "positive": 2})
    if lower_case:
        df[text_col] = df[text_col].str.lower()
    return df

In [None]:
from transformer_utils import customDataset

In [None]:
def compute_metrics(pred):
    """
    Metric callback for the Trainer.

    Reads the true label ids from ``pred.label_ids``, takes the argmax over
    the last axis of ``pred.predictions`` as the predicted class, and returns
    accuracy plus macro-averaged f1, precision and recall in a dict.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    # precision_recall_fscore_support returns (precision, recall, f1, support)
    prf = precision_recall_fscore_support(y_true, y_hat, average='macro')
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [None]:
def split_train_eval(df: pd.DataFrame,
                     text_col: "str | None" = None,
                     label_col: "str | None" = None) -> dict:
    """
    Split a labelled DataFrame into training and validation lists (80/20).

    Args:
        df: DataFrame holding one text column and one label column.
        text_col: name of the text column; defaults to the module-level
            TEXT_COL.
        label_col: name of the label column; defaults to the module-level
            LABEL_COL.

    Returns:
        A dict with the keys "list of training examples",
        "list of val examples", "list of training labels" and
        "list of val labels " (this last key ends with a space), each mapped
        to a plain Python list.
    """
    # Imported locally: the notebook only imports train_test_split in a later
    # cell, so the function would otherwise fail if called before that cell.
    from sklearn.model_selection import train_test_split

    text_col = TEXT_COL if text_col is None else text_col
    label_col = LABEL_COL if label_col is None else label_col

    # Fixed seed so the split is reproducible; all four results are pd.Series.
    train_text, val_text, train_label, val_label = train_test_split(
        df[text_col],
        df[label_col],
        random_state=42,
        test_size=0.2,
    )

    # The tokenizer accepts `str` / `List[str]` / `List[List[str]]`, not
    # pandas objects, so everything is converted to lists.
    return {"list of training examples": train_text.tolist(),
            "list of val examples": val_text.tolist(),
            "list of training labels": train_label.tolist(),
            # NOTE: the trailing space in this key is kept for compatibility.
            "list of val labels ": val_label.tolist()}

## Load pretrained models and tokenizer

In [None]:
# Pretrained DistilBERT weights plus a sequence-classification head sized for
# 3 labels.
test_model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
)

In [None]:
# Tokenizer from the same checkpoint as the model above.
pretrained_name = "distilbert-base-uncased"
test_tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_name)

In [None]:
# Load the test CSV with lower-cased text (the checkpoint is "uncased").
test_df = read_data(test_file, lower_case = True)
# Integer-encoded labels as a pandas Series.
labels = test_df[LABEL_COL]

## Before fine-tuning 

When you load DistilBERT for sequence classification, an additional classification head is created on top of the DistilBERT model. 
If you don't do any fine-tuning, the weights of that additional head are randomly initialized. This means that it will perform terribly on our data. 
We will check the metrics of the DistilBERT model that is not fine-tuned and see how it performs. 

In [None]:
# Tokenize the test comments; `test_encodings` was previously never defined,
# so this cell raised a NameError.
test_encodings = test_tokenizer(test_df[TEXT_COL].tolist(), truncation=True, padding=True)
# Labels are read straight from test_df as a plain list, so this cell does not
# depend on the notebook-level `labels` name, which the evaluation loop rebinds.
test_dataset = customDataset(test_encodings, test_df[LABEL_COL].tolist())
# shuffle=False: predictions are later written back to test_df by position, so
# the batches must come out in the same order as the DataFrame rows.
dataloader = DataLoader(dataset=test_dataset, batch_size=4, shuffle=False)


In [None]:
# Use the first GPU when CUDA is usable, otherwise fall back to the CPU.
# is_available must be *called*: the bare function object is always truthy,
# which selected "cuda:0" even on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

In [None]:
# Move the model to the selected device; the batches in the evaluation loop
# are moved to the same device before the forward pass.
test_model.to(device)

## Evaluating pre-trained distilbert (not fine tuned)

In [None]:
# Run the classifier (not fine-tuned at this point) over the test set and
# store the predicted class index of every row in test_df["pred"].

test_model.eval()

# Predictions are written back to test_df by position, so the batches must
# come out in dataset order. Rebuild the loader without shuffling to
# guarantee that, whatever the original loader was configured with.
eval_loader = DataLoader(dataloader.dataset,
                         batch_size=dataloader.batch_size,
                         shuffle=False)

# flat list of predicted class indices, one per example
pred_list = []
with torch.no_grad():  # inference only: no gradients are tracked
    # tqdm for progress bar
    for data in tqdm(eval_loader, total=len(eval_loader), leave=True):
        input_ids = data["input_ids"].to(device)
        masks = data["attention_mask"].to(device)

        # Labels are deliberately not passed: only the logits are needed here,
        # and the notebook-level `labels` variable is no longer overwritten.
        outputs = test_model(input_ids, attention_mask=masks)
        # information about model outputs: https://huggingface.co/transformers/main_classes/output.html

        # softmax is monotonic, so argmax over the raw logits picks the same
        # class as argmax over the softmax scores.
        batch_pred = torch.argmax(outputs["logits"], dim=1).cpu().tolist()
        pred_list.extend(batch_pred)

# Assigning a plain list stores the values by position, independent of the
# DataFrame index, and raises if the lengths differ.
test_df["pred"] = pred_list


In [None]:
# Display the test DataFrame, now including the "pred" column.
test_df

In [None]:
# Accuracy and macro-averaged f1 / precision / recall of the predictions
# stored in test_df["pred"] against the true labels.
y_true = test_df["class"]
y_pred = test_df["pred"]
{
    "accuracy": accuracy_score(y_true, y_pred),
    "f1": f1_score(y_true, y_pred, average="macro"),
    "precision": precision_score(y_true, y_pred, average="macro"),
    "recall": recall_score(y_true, y_pred, average="macro"),
}

Without training the DistilBERT model on the downstream task, the default DistilBERT model + classification head performs extremely poorly. 

## Fine-tuning on custom data

In [None]:
# Load the training CSV with lower-cased text.
train_df = read_data(train_file, lower_case = True)
# Integer-encoded labels as a pandas Series.
labels = train_df[LABEL_COL]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. A fixed seed (the same one
# split_train_eval uses) makes the split, and so the reported metrics,
# reproducible across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df[TEXT_COL],
    train_df[LABEL_COL],
    test_size=.2,
    random_state=42,
)

In [None]:
# The four splits are pandas Series at this point; convert each to a plain
# Python list.
train_texts, val_texts, train_labels, val_labels = (
    split.tolist()
    for split in (train_texts, val_texts, train_labels, val_labels)
)

In [None]:
# Fresh DistilBERT checkpoint with a 3-label classification head; this is the
# model that gets fine-tuned below.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
)



In [None]:
# Tokenizer from the same checkpoint as the model being fine-tuned.
checkpoint_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint_name)

1) Tokenize the training and validation text 

2) Turn the tokenized encodings and labels to Dataset obj in pytorch



In [None]:
# Tokenize the training split first, then the validation split, with the same
# truncation and padding settings.
# NOTE: no test texts are tokenized in this cell.
train_encodings, val_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, val_texts)
)

In [None]:
# Wrap the encodings and labels in the project's Dataset class, which the
# Trainer below takes as its train and eval datasets.
train_dataset = customDataset(train_encodings, train_labels)
val_dataset = customDataset(val_encodings, val_labels)

In [None]:
# Hyper-parameters for the Trainer.
# TODO: decide whether these should be defined outside the notebook and
# supplied through argparse.
# NOTE(review): no_cuda=True keeps Trainer runs on the CPU, while the manual
# evaluation cells select CUDA when available — confirm this is intended.
training_args = TrainingArguments(
    output_dir='./results',            # where checkpoints are written
    num_train_epochs=10,               # total number of training epochs
    per_device_train_batch_size=16,    # training batch size per device
    per_device_eval_batch_size=64,     # evaluation batch size per device
    warmup_steps=50,                   # learning-rate scheduler warmup steps
    weight_decay=0.01,                 # strength of weight decay
    logging_dir='./logs',              # directory for storing logs
    logging_steps=20,                  # log every 20 steps
    no_cuda=True,
)

In [None]:
# Fine-tune `model` on the training split. The validation split and
# compute_metrics are used whenever the Trainer evaluates.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()


In [None]:
# Metrics on the Trainer's eval_dataset (the validation split).
trainer.evaluate()

In [None]:
# Metrics on the training split, for comparison with the validation metrics.
trainer.evaluate(train_dataset)

In [None]:
from torch.utils.tensorboard import SummaryWriter
# TensorBoard writer for the run directory 'runs/sentiment_exp_1'.
# NOTE(review): `writer` is not used anywhere else in the visible notebook —
# confirm it is still needed.
writer = SummaryWriter('runs/sentiment_exp_1')

In [None]:
# save model using trainer 
trainer.save_model("../models/distilbert/model_config")

In [None]:
# save the tokenizer (you did not expand the vocab)
tokenizer.save_pretrained('../models/distilbert/tokenizer_config')

In [None]:
# Reload the fine-tuned weights from the directory written by
# trainer.save_model. `new_model` was previously never defined, so this cell
# raised a NameError.
new_model = DistilBertForSequenceClassification.from_pretrained(
    "../models/distilbert/model_config"
)
# A Trainer around the reloaded model, used for evaluation in the next cell to
# check that the saved model round-trips.
trainer = Trainer(
    model=new_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Metrics on the validation split, using the Trainer built in the previous
# cell.
trainer.evaluate()

## Running evaluation on Test dataset 

After we load a pre-trained model from our own directory or from Hugging Face, the model itself can be treated as an `nn.Module`. 

This means that we can use it as-is in PyTorch.
See "Fine-tuning in native PyTorch" for help: 
https://huggingface.co/transformers/training.html

Models loaded with `from_pretrained` are in eval mode by default. We can then use PyTorch's Dataset and DataLoader classes to help us when we do evaluation.

In [None]:
import pandas as pd 
from transformers import DistilBertTokenizerFast, Trainer, TrainingArguments, DistilBertForSequenceClassification 
from sklearn.metrics import accuracy_score, f1_score,precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import softmax
import torch

In [None]:
# Reload the training CSV with lower-cased text.
train_df = read_data(train_file,lower_case= True)
# Integer-encoded labels as a plain list, ready to pass to customDataset.
labels = train_df[LABEL_COL].tolist()


In [None]:
# Load the tokenizer and the fine-tuned model from the directories they were
# saved to earlier in the notebook.
test_tokenizer = DistilBertTokenizerFast.from_pretrained('../models/distilbert/tokenizer_config')
test_model= DistilBertForSequenceClassification.from_pretrained("../models/distilbert/model_config")


In [None]:
# Tokenize every comment in train_df (the whole file, not just the 80% split)
# with the reloaded tokenizer.
train_encodings = test_tokenizer(train_df[TEXT_COL].tolist(), truncation=True, padding=True)


In [None]:
from transformer_utils import customDataset

In [None]:
# Labels are read straight from train_df so re-running this cell still works
# after the evaluation loop has rebound the notebook-level `labels` name.
train_dataset = customDataset(train_encodings, train_df[LABEL_COL].tolist())
# shuffle=False: predictions are later written back to train_df by position,
# so the batches must come out in the same order as the DataFrame rows.
dataloader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=False)


In [None]:
# Use the first GPU when CUDA is usable, otherwise fall back to the CPU.
# is_available must be *called*: the bare function object is always truthy,
# which selected "cuda:0" even on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

In [None]:
# Move the reloaded model to the selected device; the batches in the
# evaluation loop are moved to the same device before the forward pass.
test_model.to(device)

In [None]:
# Run the fine-tuned classifier reloaded from disk over every row of train_df
# and store the predicted class index in train_df["pred"].

test_model.eval()

# Predictions are written back to train_df by position, so the batches must
# come out in dataset order. Rebuild the loader without shuffling to
# guarantee that, whatever the original loader was configured with.
eval_loader = DataLoader(dataloader.dataset,
                         batch_size=dataloader.batch_size,
                         shuffle=False)

# flat list of predicted class indices, one per example
pred_list = []
with torch.no_grad():  # inference only: no gradients are tracked
    # tqdm for progress bar
    for data in tqdm(eval_loader, total=len(eval_loader), leave=True):
        input_ids = data["input_ids"].to(device)
        masks = data["attention_mask"].to(device)

        # Labels are deliberately not passed: only the logits are needed here,
        # and the notebook-level `labels` variable is no longer overwritten.
        outputs = test_model(input_ids, attention_mask=masks)
        # information about model outputs: https://huggingface.co/transformers/main_classes/output.html

        # softmax is monotonic, so argmax over the raw logits picks the same
        # class as argmax over the softmax scores.
        batch_pred = torch.argmax(outputs["logits"], dim=1).cpu().tolist()
        pred_list.extend(batch_pred)

# Assigning a plain list stores the values by position, independent of the
# DataFrame index, and raises if the lengths differ.
train_df["pred"] = pred_list


In [None]:
# Display the DataFrame, now including the "pred" column.
train_df

In [None]:
def compute_metrics(pred):
    """
    Metric callback for the Trainer.

    Reads the true label ids from ``pred.label_ids``, takes the argmax over
    the last axis of ``pred.predictions`` as the predicted class, and returns
    accuracy plus macro-averaged f1, precision and recall in a dict.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    # precision_recall_fscore_support returns (precision, recall, f1, support)
    prf = precision_recall_fscore_support(y_true, y_hat, average='macro')
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [None]:
# Accuracy and macro-averaged f1 / precision / recall of the predictions
# stored in train_df["pred"] against the true labels.
y_true = train_df["class"]
y_pred = train_df["pred"]
{
    "accuracy": accuracy_score(y_true, y_pred),
    "f1": f1_score(y_true, y_pred, average="macro"),
    "precision": precision_score(y_true, y_pred, average="macro"),
    "recall": recall_score(y_true, y_pred, average="macro"),
}