# Imports

In [65]:
# imports
import pandas as pd
import numpy as np
# import matplotlib as plt
import random as rn
import os
os.environ['PYTHONHASHSEED'] = '0'
os.environ['CUDA_VISIBLE_DEVICES'] = ''
np.random.seed(37)
rn.seed(1254)

# Load data, train, test, validation splits

In [67]:
# EDA
path_to_data = "./data/Sentences_200.csv"
new_data_5_cat = pd.read_csv(path_to_data, index_col='S.No.')
print(type(new_data_5_cat))
display(new_data_5_cat.head())
display(new_data_5_cat.describe())
display(new_data_5_cat.shape)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0_level_0,Sentence,Label
S.No.,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Introduction to Quantum Mechanics,1.0
2,"In this chapter, we explore the foundational p...",0.0
3,The Rise and Fall of Civilizations,1.0
4,Historical records reveal the complex trajecto...,0.0
5,Part III: Advanced Mathematical Concepts,1.0


Unnamed: 0,Label
count,198.0
mean,0.555051
std,0.31377
min,0.0
25%,0.3
50%,0.65
75%,0.8
max,1.0


(198, 2)

In [68]:
# Make test, train, cv splits
from datasets import Dataset
ds = Dataset.from_pandas(new_data_5_cat)

ds_train_temp_dict = ds.train_test_split(train_size=160)
ds_train = ds_train_temp_dict['train']
ds_test_cv_dict = ds_train_temp_dict['test'].train_test_split(test_size=20)
ds_cv = ds_test_cv_dict['train']
ds_test = ds_test_cv_dict['test']
display(ds_train)
display(ds_test)
display(ds_cv)

Dataset({
    features: ['Sentence', 'Label', 'S.No.'],
    num_rows: 160
})

Dataset({
    features: ['Sentence', 'Label', 'S.No.'],
    num_rows: 20
})

Dataset({
    features: ['Sentence', 'Label', 'S.No.'],
    num_rows: 18
})

# Fine tune LLM

In [69]:
# Get Tokenizer
from transformers import AutoTokenizer
model_nm = 'microsoft/deberta-v3-small'
tokz = AutoTokenizer.from_pretrained(model_nm)
tokz.tokenize('My name is Geetansh Bhardwaj.')



['▁My', '▁name', '▁is', '▁Geeta', 'n', 'sh', '▁Bhardwaj', '.']

In [70]:
# Tokenize the 'Sentence' column
def tokenize_string(row):
    return tokz(row['Sentence'])

def tokenize_sentence_col(ds):
    '''
    We will tokenize the 'Sentence' column and add another column 'Sentence_id'. It will be used for fine-tuning
    ds: a dataset with 'Sentence' column
    '''

    tokenized_ds = ds.map(tokenize_string, batch_size=5)
    return tokenized_ds

tokenized_ds_train = tokenize_sentence_col(ds_train)

Map: 100%|██████████| 160/160 [00:00<00:00, 3348.83 examples/s]


In [71]:
# An undocumented fact: Transformers assume that your label column is named "labels". Ours is named "Label", so we will change that
tokenized_ds_train = tokenized_ds_train.rename_columns({'Label' : 'labels'})
tokenized_ds_train

tokenized_ds_cv = tokenize_sentence_col(ds_cv)
tokenized_ds_cv = tokenized_ds_cv.rename_columns({'Label' : 'labels'})

Map: 100%|██████████| 18/18 [00:00<00:00, 1504.20 examples/s]


In [72]:
# Get the model (We are actually using a pre-trained one)
from transformers import AutoModelForSequenceClassification
my_model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [73]:
from transformers import TrainingArguments, Trainer
bs = 5
epochs = 4
lr = 8e-5
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')
trainer = Trainer(my_model, args, train_dataset=tokenized_ds_train, eval_dataset=tokenized_ds_cv,
                  tokenizer=tokz)

  trainer = Trainer(my_model, args, train_dataset=tokenized_ds_train, eval_dataset=tokenized_ds_cv,


In [74]:
# Train (Here, fine tune) the model
trainer.train()

  0%|          | 0/16 [1:22:50<?, ?it/s]
 25%|██▌       | 32/128 [01:00<02:37,  1.64s/it]
 25%|██▌       | 32/128 [01:01<02:37,  1.64s/it]

{'eval_loss': 0.13210749626159668, 'eval_runtime': 0.5116, 'eval_samples_per_second': 35.182, 'eval_steps_per_second': 3.909, 'epoch': 1.0}


 50%|█████     | 64/128 [01:55<01:58,  1.86s/it]
 50%|█████     | 64/128 [01:56<01:58,  1.86s/it]

{'eval_loss': 0.025790058076381683, 'eval_runtime': 0.5595, 'eval_samples_per_second': 32.171, 'eval_steps_per_second': 3.575, 'epoch': 2.0}


 75%|███████▌  | 96/128 [02:52<00:54,  1.70s/it]
 75%|███████▌  | 96/128 [02:52<00:54,  1.70s/it]

{'eval_loss': 0.03409378230571747, 'eval_runtime': 0.6622, 'eval_samples_per_second': 27.181, 'eval_steps_per_second': 3.02, 'epoch': 3.0}


100%|██████████| 128/128 [03:54<00:00,  1.87s/it]
100%|██████████| 128/128 [03:58<00:00,  1.86s/it]

{'eval_loss': 0.024491995573043823, 'eval_runtime': 0.543, 'eval_samples_per_second': 33.147, 'eval_steps_per_second': 3.683, 'epoch': 4.0}
{'train_runtime': 238.5125, 'train_samples_per_second': 2.683, 'train_steps_per_second': 0.537, 'train_loss': 0.09053848683834076, 'epoch': 4.0}





TrainOutput(global_step=128, training_loss=0.09053848683834076, metrics={'train_runtime': 238.5125, 'train_samples_per_second': 2.683, 'train_steps_per_second': 0.537, 'total_flos': 1818871829700.0, 'train_loss': 0.09053848683834076, 'epoch': 4.0})

In [75]:
# Report loss for your model using the test set
tokenized_ds_test = tokenize_sentence_col(ds_test)
tokenized_ds_test = tokenized_ds_test.rename_columns({'Label' : 'labels'})

preds = trainer.predict(tokenized_ds_test).predictions.astype(float)
preds

Map: 100%|██████████| 20/20 [00:00<00:00, 50.43 examples/s]
100%|██████████| 2/2 [00:00<00:00, 13.74it/s]


array([0.85534549, 0.31081381, 0.90419859, 0.87101161, 0.78344548,
       0.30044168, 0.93448901, 0.90961564, 0.58258021, 0.93629748,
       0.91476035, 0.34552005, 0.77351129, 0.48210973, 0.433981  ,
       0.27944249, 0.89211512, 0.2244986 , 0.25287008, 0.07797185])

In [77]:
# Using MAE to calculate loss
def get_mae(preds, real):
    '''
    preds, real: array 
    '''

    mae = np.mean(np.abs(preds - real))
    return mae

real = np.array(tokenized_ds_test['labels'])

print(f"MAE: {get_mae(preds, real)}")

# Print predictions on test side-by-side
m = pd.DataFrame({'a':real.reshape(20,), 'b':preds.reshape(20)})
m

MAE: 0.10661058641970159


Unnamed: 0,a,b
0,0.85,0.855345
1,0.4,0.310814
2,0.8,0.904199
3,0.85,0.871012
4,0.7,0.783445
5,0.3,0.300442
6,0.75,0.934489
7,0.85,0.909616
8,0.7,0.58258
9,0.9,0.936297


In [None]:
# MAE of my model: 0.1 (Based on test set)

# Check if your GPU is available

In [79]:
import torch
torch.cuda.is_available()

False