## Training for distilbert transformer
Follow this tutorial :https://huggingface.co/transformers/custom_datasets.html

In [15]:
import pandas as pd 
from transformers import DistilBertTokenizerFast, Trainer, TrainingArguments, DistilBertForSequenceClassification 
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score
from torch.nn.functional import softmax
import torch
from tqdm import tqdm


In [11]:
train_file = "../data/label_data.csv"
test_file = "../data/label_data.csv"
LABEL_COL = "class"
TEXT_COL = "comment"

In [3]:
# from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [4]:
# model_name = "distilbert-base-uncased-finetuned-sst-2-english"
# pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
# pt_model.save_pretrained("../models/distilbert/model_config")

In [6]:
# tokenizer.save_pretrained('../models/distilbert/tokenizer_config')

## Three utilities functions for transformers

In [6]:
def read_data(fname:str, lower_case: bool=False) ->pd.DataFrame:
        """
        This function will read the textfiles.

        fname will be out of new_train_data.csv, unlabeled_data.csv and test_data.txt

        """
        try:
            df = pd.read_csv(fname, encoding = "UTF-8", usecols = ["class","comment"])
            df[LABEL_COL]= df[LABEL_COL].replace({"negative":0, "neutral":1, "positive":2})
            if lower_case:
                df[TEXT_COL]= df[TEXT_COL].str.lower()

            return df
        except (FileNotFoundError,PermissionError):

            print("No files found. Check the data directory for files.")

In [4]:
from transformer_utils import customDataset

In [9]:
# class customDataset(Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels
        
#     # allows us to select examples through indexing 
#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         item['labels'] = torch.tensor(int(self.labels[idx]))
#         return item

#     def __len__(self):
#         return len(self.labels)

In [5]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall}

In [13]:
def split_train_eval(df:pd.DataFrame) -> dict:
        # splits to train, val text and labels
        
        # currently all are pd.Series
        train_df,val_df,train_label,val_label = train_test_split(df["Truth"],
                                                                 df["class"], 
                                                                 random_state = 42, 
                                                                 test_size = 0.2)
        
        # change all to lists, as inputs to tokenizer has to be 
        # text input must of type `str` (single example), 
        #`List[str]` (batch or single pretokenized example) or 
        #`List[List[str]]` (batch of pretokenized examples).
        
        train_list = train_df.tolist()
        val_list = val_df.tolist()
        label_list = train_label.tolist()
        val_label_list = val_label.tolist()
        
        return {"list of training examples":train_list,
                "list of val examples":val_list,
                "list of training labels":label_list,
                "list of val labels ":val_label_list}

## Load pretrained models and tokenizer

In [18]:
test_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels = 3)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [22]:
test_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

In [24]:
test_df = read_data(test_file, lower_case = True)
labels = test_df[LABEL_COL]

In [23]:
test_encodings = test_tokenizer(test_df[TEXT_COL].tolist(), truncation=True, padding=True)


When you call distilbert for sequence classification, you create an additional head on top of the distilbert model. 
If you don't do any finetuning, the weights of the additional head you create is randomize. THis means that it will perform terribly on our data. 
We will check the metrics of distilbert that is not fine-tuned and see how it performs. 

In [25]:
test_dataset = customDataset(test_encodings, labels)
dataloader = DataLoader(dataset = test_dataset, batch_size = 4, shuffle=True)


In [26]:
device = torch.device("cuda:0" if torch.cuda.is_available else "cpu")
device

device(type='cuda', index=0)

In [27]:
test_model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [29]:
# set model.to(device)
# model output is logits 
# return df["score"]
# softmax all the logits to class
# return df["pred"]

test_model.eval()

# empty list for prediction to be appended into  
pred_list = []
with torch.no_grad():  # so will not update 
    
    # tqdm for progress bar
    loop = tqdm(enumerate(dataloader), total = len(dataloader),
                        leave = True)
    for _, data in loop:     
        input_ids = data["input_ids"].to(device)
        masks = data["attention_mask"].to(device)
        labels = data["labels"].to(device)
        
        outputs = test_model(input_ids, masks, labels = labels)
        # information about model outputs: https://huggingface.co/transformers/main_classes/output.html

        # sample output: 
        # SequenceClassifierOutput(loss=tensor(1.6602, device='cuda:0', grad_fn=<NllLossBackward>), 
        # logits=tensor([[-0.0098,  0.0775,  0.0358,  0.0997, -0.0220],
        # [-0.0169,  0.1331,  0.0294,  0.1378,  0.0054]], device='cuda:0',
        # grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
   
        logits = outputs["logits"]
        scores = softmax(logits, dim =1)
        # numpy array of batch size 
        pred = torch.argmax(scores, dim =1).cpu().numpy().tolist()
        # appends the batch pred into an empty list 
        pred_list.append(pred)
    
    # flatten the nested list in pred_list
    flat_list = [item for sublist in pred_list for item in sublist]
    
    test_df["pred"]= pd.Series(flat_list)


100%|██████████| 150/150 [00:11<00:00, 13.15it/s]


In [30]:
test_df

Unnamed: 0,class,comment,pred
0,2,this course able tp educate people how to mana...,0
1,1,"can understand the course, by using tablet to ...",0
2,2,the real wealth is knowledge . thanks for prov...,0
3,2,this training was so good to me because i'm ju...,0
4,1,thank you,0
...,...,...,...
595,2,trainer of the month award,0
596,0,mr jacob coaching is not bad and he has demons...,2
597,2,instructor.hes good and guide us throughout.,0
598,2,cooperation n understanding btw assesser n lec...,0


In [31]:
y_true = test_df["class"]
y_pred = test_df["pred"]
{"accuracy": accuracy_score(y_true, y_pred), "f1":f1_score(y_true,y_pred, average = "macro"),"precision":precision_score(y_true,y_pred,average = "macro"), "recall":recall_score(y_true,y_pred,average = "macro")}

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.14833333333333334,
 'f1': 0.09819764349111808,
 'precision': 0.3066666666666667,
 'recall': 0.33371996193195336}

In [33]:
600 * 0.8

480.0

Without training the distilbert model on downstream task, the default distilbert model + classification head performs extremely poor. 

In [16]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(train_df[TEXT_COL], train_df[LABEL_COL], test_size=.2)

In [17]:
train_texts = train_texts.tolist()
val_texts = val_texts.tolist()
train_labels= train_labels.tolist()
val_labels= val_labels.tolist()

In [18]:
#test_df = read_data(test_file, lower_case = True)
# test_texts = test_df[TEXT_COL].tolist()
# test_labels = test_df[LABEL_COL].tolist()

1) Tokenize the training and test text 

2) Turn the tokenized encodings and labels to Dataset obj in pytorch



In [19]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
#test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [20]:
#test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [21]:
train_dataset = customDataset(train_encodings, train_labels)
val_dataset = customDataset(val_encodings, val_labels)

In [22]:
#test_dataset = customDataset(test_encodings, test_labels)

In [23]:
# training argument to be defined outside of class? 
# for together with arg_parse? 
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total # of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=50,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps = 20,
    no_cuda = True
    
)

In [24]:
trainer = Trainer(model = model,
                   args = training_args,
                   train_dataset = train_dataset, 
                   eval_dataset = val_dataset,
                    compute_metrics = compute_metrics)
trainer.train()


Step,Training Loss


TrainOutput(global_step=300, training_loss=0.22669504801432291)

In [25]:
trainer.evaluate()

{'eval_loss': 0.45277583599090576,
 'eval_accuracy': 0.9,
 'eval_f1': 0.785939998720655,
 'eval_precision': 0.777876984126984,
 'eval_recall': 0.7947660586835844,
 'epoch': 10.0}

In [71]:
trainer.evaluate(train_dataset)

{'eval_loss': 0.10230248421430588,
 'eval_accuracy': 0.9766666666666667,
 'eval_f1': 0.9565388268563169,
 'eval_precision': 0.9546871199978667,
 'eval_recall': 0.958421905248528}

In [1]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('runs/sentiment_exp_1')

In [34]:
# save model using trainer 
trainer.save_model("../models/distilbert/model_config")

In [35]:
# save the tokenizer (you did not expand the vocab)
tokenizer.save_pretrained('../models/distilbert/tokenizer_config')

('../models/distilbert/tokenizer_config\\tokenizer_config.json',
 '../models/distilbert/tokenizer_config\\special_tokens_map.json',
 '../models/distilbert/tokenizer_config\\vocab.txt',
 '../models/distilbert/tokenizer_config\\added_tokens.json')

In [68]:
trainer = Trainer(model = new_model,
                   args = training_args,
                   train_dataset = train_dataset, 
                   eval_dataset = val_dataset,
                    compute_metrics = compute_metrics)

In [69]:
trainer.evaluate()

{'eval_loss': 0.45277583599090576,
 'eval_accuracy': 0.9,
 'eval_f1': 0.785939998720655,
 'eval_precision': 0.777876984126984,
 'eval_recall': 0.7947660586835844}

## Running evaluation on Test dataset 

After we load a pre-trained model from our own directory or from huggingface, the model itself can be treated as a nn.module. 

This means that we can use it as it is in pytorch.
See fine-tuning in native pytorch to help you. 
https://huggingface.co/transformers/training.html

Models that are initialized are eval mode by default. We can then use pytorch's dataset and dataloader class to help us when we do evaluation.

In [None]:
import pandas as pd 
from transformers import DistilBertTokenizerFast, Trainer, TrainingArguments, DistilBertForSequenceClassification 
from sklearn.metrics import accuracy_score, f1_score,precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import softmax
import torch

In [46]:
train_df = read_data(train_file,lower_case= True)
labels = train_df[LABEL_COL].tolist()


In [39]:
test_tokenizer = DistilBertTokenizerFast.from_pretrained('../models/distilbert/tokenizer_config')
test_model= DistilBertForSequenceClassification.from_pretrained("../models/distilbert/model_config")


In [42]:
train_encodings = test_tokenizer(train_df[TEXT_COL].tolist(), truncation=True, padding=True)


In [43]:
from transformer_utils import customDataset

In [47]:
train_dataset = customDataset(train_encodings, labels)
dataloader = DataLoader(dataset = train_dataset, batch_size = 4, shuffle=True)


In [48]:
device = torch.device("cuda:0" if torch.cuda.is_available else "cpu")
device

device(type='cuda', index=0)

In [49]:
test_model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [51]:
# set model.to(device)
# model output is logits 
# return df["score"]
# softmax all the logits to class
# return df["pred"]

test_model.eval()

# empty list for prediction to be appended into  
pred_list = []
with torch.no_grad():  # so will not update 
    
    # tqdm for progress bar
    loop = tqdm(enumerate(dataloader), total = len(dataloader),
                        leave = True)
    for _, data in loop:     
        input_ids = data["input_ids"].to(device)
        masks = data["attention_mask"].to(device)
        labels = data["labels"].to(device)
        
        outputs = test_model(input_ids, masks, labels = labels)
        # information about model outputs: https://huggingface.co/transformers/main_classes/output.html

        # sample output: 
        # SequenceClassifierOutput(loss=tensor(1.6602, device='cuda:0', grad_fn=<NllLossBackward>), 
        # logits=tensor([[-0.0098,  0.0775,  0.0358,  0.0997, -0.0220],
        # [-0.0169,  0.1331,  0.0294,  0.1378,  0.0054]], device='cuda:0',
        # grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
   
        logits = outputs["logits"]
        scores = softmax(logits, dim =1)
        # numpy array of batch size 
        pred = torch.argmax(scores, dim =1).cpu().numpy().tolist()
        # appends the batch pred into an empty list 
        pred_list.append(pred)
    
    # flatten the nested list in pred_list
    flat_list = [item for sublist in pred_list for item in sublist]
    
    train_df["pred"]= pd.Series(flat_list)


100%|██████████| 150/150 [00:11<00:00, 13.19it/s]


In [60]:
train_df

Unnamed: 0,class,comment,pred
0,2,this course able tp educate people how to mana...,2
1,1,"can understand the course, by using tablet to ...",2
2,2,the real wealth is knowledge . thanks for prov...,2
3,2,this training was so good to me because i'm ju...,0
4,1,thank you,2
...,...,...,...
595,2,trainer of the month award,2
596,0,mr jacob coaching is not bad and he has demons...,2
597,2,instructor.hes good and guide us throughout.,1
598,2,cooperation n understanding btw assesser n lec...,0


In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall}

In [59]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [65]:
y_true = train_df["class"]
y_pred = train_df["pred"]
{"accuracy": accuracy_score(y_true, y_pred), "f1":f1_score(y_true,y_pred, average = "macro"),"precision":precision_score(y_true,y_pred,average = "macro"), "recall":recall_score(y_true,y_pred,average = "macro")}

{'accuracy': 0.6183333333333333,
 'f1': 0.308260177714281,
 'precision': 0.30837949179588947,
 'recall': 0.30814364316926884}