# Fine-tuning DistilBERT

I going to try fine-tuning DistilBERT. 

In [1]:
%pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from datasets import Dataset, DatasetDict
import re
import os
os.environ["WANDB_DISABLED"] = "true"
tqdm.pandas()
device = "cuda:0" if torch.cuda.is_available() else "cpu"

2024-02-04 06:16:50.495144: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-04 06:16:50.495248: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-04 06:16:50.611207: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Getting the data
training_data = pd.read_csv('../input/training-llm-competition/train.csv')
valid_data = pd.read_csv('../input/training-llm-competition/validation.csv')

In [4]:
# Preprocessing
def preprocess(essay:str):
    preprocessed_essay = essay.lower()
    
    # Subbing out \n and \t
    preprocessed_essay = re.sub("\n","",preprocessed_essay)
    preprocessed_essay = re.sub("\t","",preprocessed_essay)

    # Replacing /xa0 = non-breaking space in Latin1
    preprocessed_essay = preprocessed_essay.replace(u'\xa0', u' ')
    
    return preprocessed_essay
training_data['essay'] = training_data['essay'].progress_apply(preprocess)
valid_data['essay'] = valid_data['essay'].progress_apply(preprocess)

100%|██████████| 44733/44733 [00:00<00:00, 62166.40it/s]
100%|██████████| 5195/5195 [00:00<00:00, 15611.55it/s]


In [5]:
# Renaming the columns
training_data.rename(columns={'essay':'text','LLM_written':'labels'},inplace=True)
valid_data.rename(columns={'essay':'text','LLM_written':'labels'},inplace=True)

# Changing labels to float
training_data['labels'] = training_data['labels'].astype('float')
valid_data['labels'] = valid_data['labels'].astype('float')

In [6]:
# Putting training data into a Dataset for Hugging Face
training = Dataset.from_pandas(training_data[['text','labels']])
validation = Dataset.from_pandas(valid_data[['text','labels']])

In [7]:
# Getting the tokenizer and model, using RoBERTa Base
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=1)
model.to(device)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [8]:
# Getting the metric
roc_auc = evaluate.load('roc_auc')

# Function to compute metric
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = nn.functional.sigmoid(torch.from_numpy(logits)).numpy(force=True)
    return roc_auc.compute(references=labels,prediction_scores=predictions)

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

In [9]:
# Function for tokenizing
def tokenize_function(example):
    return tokenizer(example['text'],padding='max_length',truncation=True,max_length=512)

In [10]:
# Tokenizing
tokenized_data_training = training.map(tokenize_function,batched=True,batch_size=24)
tokenized_data_valid = validation.map(tokenize_function,batched=True,batch_size=24)

  0%|          | 0/1864 [00:00<?, ?ba/s]

  0%|          | 0/217 [00:00<?, ?ba/s]

In [11]:
# Training arguments
training_args = TrainingArguments(output_dir='training',evaluation_strategy='epoch',learning_rate=2e-5,weight_decay=0,seed=42,
                                  num_train_epochs=3,metric_for_best_model='eval_roc_auc',greater_is_better=True,load_best_model_at_end=True,
                                 save_strategy='epoch',logging_strategy='epoch',save_total_limit=1)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [12]:
# Setting up the trainer
trainer = Trainer(model,args=training_args,train_dataset=tokenized_data_training,eval_dataset=tokenized_data_valid
                  ,compute_metrics=compute_metrics)

In [13]:
# Training
trainer.train()



Epoch,Training Loss,Validation Loss,Roc Auc
1,0.0087,0.249233,0.971075
2,0.0018,0.34909,0.93181
3,0.0006,0.368581,0.742108




TrainOutput(global_step=8388, training_loss=0.0037086381477916246, metrics={'train_runtime': 3840.6946, 'train_samples_per_second': 34.941, 'train_steps_per_second': 2.184, 'total_flos': 1.7776675404813312e+16, 'train_loss': 0.0037086381477916246, 'epoch': 3.0})

In [14]:
# Saving the model
trainer.save_model('fine-tuned-distillBert')

In [15]:
# Zipping the model
!zip -r distill-bert.zip /kaggle/working/fine-tuned-distillBert

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/fine-tuned-distillBert/ (stored 0%)
  adding: kaggle/working/fine-tuned-distillBert/training_args.bin (deflated 51%)
  adding: kaggle/working/fine-tuned-distillBert/config.json (deflated 46%)
  adding: kaggle/working/fine-tuned-distillBert/model.safetensors (deflated 8%)


In [16]:
# Getting the best model
model = AutoModelForSequenceClassification.from_pretrained('../working/fine-tuned-distillBert')
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [17]:
# Defining a function for inference
def inference(essay:str) -> float:
    # Tokenizing the input essay
    inputs = tokenizer(essay,padding='max_length',truncation=True,max_length=512,return_tensors='pt').to(device)
    
    # Getting the logits
    with torch.no_grad():
        logits = model(**inputs).logits
        probability = nn.functional.sigmoid(logits)
    return probability

In [18]:
# Running the examples through the model
train_predictions = training_data['text'].progress_apply(inference)
valid_predictions = valid_data['text'].progress_apply(inference)

100%|██████████| 44733/44733 [12:46<00:00, 58.35it/s]
100%|██████████| 5195/5195 [01:31<00:00, 57.08it/s]


In [19]:
# Making predictions
print('Predictions for Fine-tuned DistilBERT')
train_score = roc_auc_score(training_data['labels'],train_predictions)
valid_score = roc_auc_score(valid_data['labels'],valid_predictions)
print(f'Training ROC AUC: {train_score}')
print(f'Validation ROC AUC: {valid_score}')

Predictions for Fine-tuned DistilBERT
Training ROC AUC: 0.9999537707382391
Validation ROC AUC: 0.9710747774141688
