<a href="https://colab.research.google.com/github/MEHARKhaoula/transformers-text-coherence/blob/main/Fine_tuning_DistilBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!nvidia-smi

Wed Jan  4 09:51:58 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    28W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install sentencepiece transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m94.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m 

In [4]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

# Dataset

In [5]:
df = pd.read_csv('/content/drive/My Drive/GCDC_train.csv')

df_test = pd.read_csv('/content/drive/My Drive/GCDC_test.csv')

In [6]:
from sklearn.utils import shuffle
df = shuffle(df)
df['labelA']= df['labelA'].astype(int)
df['labelA'] =df['labelA'] - 1
df_test['labelA']= df_test['labelA'].astype(int)
df_test['labelA'] =df_test['labelA'] - 1
print(df['labelA'])

3212    2
3245    0
1254    0
526     1
17      1
       ..
1844    0
2736    2
3039    0
2537    1
1468    1
Name: labelA, Length: 4000, dtype: int64


In [7]:
text = df.text.values
labels = df.labelA.values

text_eval = df_test.text.values
labels_eval = df_test.labelA.values


# Preprocessing

In [8]:
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-cased',
    do_lower_case = True
    )

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [9]:
token_id = []
attention_masks = []
token_id_eval = []
attention_masks_eval = []

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 512,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)


for sample in text_eval:
  encoding_dict_eval = preprocessing(sample, tokenizer)
  token_id_eval.append(encoding_dict_eval['input_ids']) 
  attention_masks_eval.append(encoding_dict_eval['attention_mask'])


token_id_eval = torch.cat(token_id_eval, dim = 0)
attention_masks_eval = torch.cat(attention_masks_eval, dim = 0)
labels_eval = torch.tensor(labels_eval)



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# Data split

In [10]:
val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 8

# Indices of the train and validation splits stratified by labels


# Train and validation sets
train_set = TensorDataset(token_id, 
                          attention_masks, 
                          labels)

val_set = TensorDataset(token_id_eval, 
                        attention_masks_eval, 
                        labels_eval)

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

# Train

In [11]:
# Load the DistilBertForSequenceClassification model
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-cased',
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf


optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 2e-5,
                              eps = 1e-08
                              )


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device
model = model.to(device)

Downloading:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier

In [12]:
from sklearn import metrics
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 4

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    acc = 0
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             
                             attention_mask = b_input_mask, 
                             labels = b_labels)  
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
       
        
    
    # ========== Validation ==========
    nb_tr_eval = 0
    # Set model to evaluation mode
    model.eval()

    
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                           
                              attention_mask = b_input_mask)
        b_labels = b_labels.cpu().detach().numpy()
        logits = eval_output.logits.detach().cpu().numpy()
        preds = np.argmax(logits, axis = 1).flatten()
        b_labels= b_labels.flatten()

        accuracy = metrics.accuracy_score(b_labels,preds )
        acc += accuracy
        nb_tr_eval += 1

    
    print('\n\t - Accuracy: {:.4f}'.format(acc / len(validation_dataloader)))
    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))

Epoch:  25%|██▌       | 1/4 [03:32<10:36, 212.19s/it]


	 - Accuracy: 0.5675

	 - Train loss: 0.9730


Epoch:  50%|█████     | 2/4 [07:11<07:12, 216.36s/it]


	 - Accuracy: 0.5763

	 - Train loss: 0.8462


Epoch:  75%|███████▌  | 3/4 [10:50<03:37, 217.74s/it]


	 - Accuracy: 0.5613

	 - Train loss: 0.6491


Epoch: 100%|██████████| 4/4 [14:29<00:00, 217.49s/it]


	 - Accuracy: 0.5050

	 - Train loss: 0.4020



