## Fine Tune XLM-R 
Code-mixed Sentiment Classification of Sinhala-English Code-Mixed Data.

In [None]:
import os
# Set in the first cell, before any training library is imported.
# Presumably switches off the Weights & Biases reporting integration so no
# wandb login is requested — confirm against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
import torch
print("GPU Device name")
# Last expression of the cell, so the notebook displays the name of CUDA device 0.
# NOTE(review): this call is expected to fail on a machine without a CUDA device.
torch.cuda.get_device_name(0)

### **Parameters**

**User Parameters**

In [None]:
# Name of the CSV column used as the classification label; it also selects the
# per-task branch inside apply_oversampling ("Humor", "Hate-Speech", "Sentiment").
technique = "Hate-Speech" 
# Experiment identifier; used to build model_save_path.
experiment_no = "1"
# Intended switch for oversampling the training split.
oversample_dataset = True 
# Oversampler chosen in apply_oversampling:
# "" (none), "ROS", "ADASYN", "SMOTE" or "BorderlineSMOTE".
over_sampling_technique = "ROS" 
# For "ROS": colon-separated per-class multipliers; each is multiplied by a
# reference class count to get the target number of samples for that class.
sampling_strategy = "1:0.25:0.25" 
# Applied to what is left after the test split: 0.9 * (1/9) = 0.1 of all rows,
# so validation and test end up the same size.
validation_size = (1/9)
test_size = 0.1
# Seed shared by both train_test_split calls.
split_random_state = 42
# Seed passed to set_seed before preprocessing and again before training.
training_seed = 42 #@param [8, 42,77]
# Passed as num_labels when the classification model is created.
NO_OUTPUT_LAYERS = 3
# Human-readable class names.
# NOTE(review): not referenced by any code visible in this section.
tag_set = ["Not offensive", "Hate-Inducing", "Abusive"]
# CSV column holding each sentence's script tag; the test set is later split on
# its values "Latin", "Sinhala" and "Mixed".
script="Char-Script-1.0"

In [None]:
# Maximum token length passed to the tokenizer (longer inputs are truncated).
MAX_LEN = 128
# Batch size for every DataLoader (train, validation and all test loaders).
BATCH_SIZE = 32
# Learning rate handed to the AdamW optimizer.
LEARNING_RATE = 5e-5
# Upper bound on training epochs; the training loop may stop earlier.
EPOCHS = 20

**Folder Paths**

In [None]:
# Annotated CSV read by pd.read_csv in the loading cell.
dataset_path = "/kaggle/input/sinhala-english-cmcs-dataset/annotated-script(all).csv"
# Per-task / per-experiment output folder.
# NOTE(review): in this section it is only referenced by the commented-out
# TrainingArguments; the native training loop saves "best_model.ckpt" to the
# current working directory instead.
model_save_path = "/kaggle/working/"+technique+"/"+experiment_no

**Dependencies**

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install datasets
# !pip install optuna

In [None]:
import re
import time
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, TrainingArguments, Trainer, AdamW, get_scheduler, EarlyStoppingCallback
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import io
import seaborn as sns
from datasets import load_metric
from collections import Counter
from transformers import  AdamW, get_linear_schedule_with_warmup,set_seed
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE
import matplotlib.pyplot as plt
%matplotlib inline


### **Oversampling**

In [None]:
def apply_oversampling(x, y):
    """Resample ``(x, y)`` with the oversampler selected by the notebook parameters.

    Reads the module-level settings ``over_sampling_technique``, ``technique``
    and ``sampling_strategy``. The class distribution is printed before and
    (when resampling happens) after the transformation.

    Args:
        x: 2-D array-like of samples, as expected by ``fit_resample``.
        y: Array-like of class labels aligned with ``x``.

    Returns:
        ``(x, y)`` unchanged when ``over_sampling_technique`` is ``""``,
        otherwise the resampled ``(X_over, y_over)`` pair.

    Raises:
        ValueError: If ``over_sampling_technique`` is not one of the supported
            names, or if ``technique`` has no "ROS" configuration. Previously
            these cases fell through and crashed later with an unbound local.
    """
    _, counts = np.unique(y, axis=0, return_counts=True)
    print("Class Distribution Without Oversampling", counts)

    if over_sampling_technique == "":
        return x, y

    # For ratio-driven random oversampling: task -> (index of the class whose
    # count is the reference, number of classes listed in sampling_strategy).
    ros_layout = {"Hate-Speech": (0, 3), "Sentiment": (1, 4)}

    if over_sampling_technique == "ROS":
        if technique == "Humor":
            # Binary task: sampling_strategy is a single minority/majority ratio.
            oversample = RandomOverSampler(sampling_strategy=float(sampling_strategy))
        elif technique in ros_layout:
            reference_index, n_classes = ros_layout[technique]
            sampling_ratio = sampling_strategy.split(":")
            reference_count = counts[reference_index]
            # Target size of each class = reference class count * its ratio.
            oversample = RandomOverSampler(sampling_strategy={
                label: int(reference_count * float(sampling_ratio[label]))
                for label in range(n_classes)
            })
        else:
            raise ValueError(
                f"No ROS sampling configuration for technique {technique!r}"
            )
    elif over_sampling_technique == "ADASYN":
        oversample = ADASYN(sampling_strategy="minority")
    elif over_sampling_technique == "SMOTE":
        oversample = SMOTE()
    elif over_sampling_technique == "BorderlineSMOTE":
        oversample = BorderlineSMOTE()
    else:
        raise ValueError(
            f"Unsupported over_sampling_technique {over_sampling_technique!r}"
        )

    # Fit and apply the transform.
    X_over, y_over = oversample.fit_resample(x, y)

    _, counts = np.unique(y_over, axis=0, return_counts=True)
    print("Class Distribution After Oversampling", counts)

    return X_over, y_over

### **Load & Preprocess Dataset**

In [None]:
def preprocess_texts(sentences):
    """Strip links and '#' characters, then wrap each sentence in marker text.

    Every ``http...`` run of non-whitespace characters is removed, every ``#``
    is dropped, and the literal strings ``"[CLS] "`` / ``" [SEP]"`` are put
    around the result.

    Args:
        sentences: Iterable of strings.

    Returns:
        A new list with one cleaned string per input sentence.
    """
    cleaned = []
    for sentence in sentences:
        without_links = re.sub(r'http\S+', '', sentence)
        without_hashes = without_links.replace('#', '')
        cleaned.append("[CLS] " + without_hashes + " [SEP]")
    return cleaned

In [None]:
df = pd.read_csv(dataset_path)
# Keep the sentence text, the label column chosen by `technique`, and the
# script-tag column; the label column is renamed to 'Label'.
df = df[['Sentence', technique, script]]
df.columns = ['Sentence', 'Label', script]

# Replace label values with integer codes; `uniq` keeps the original values,
# indexed by code.
# NOTE(review): with pandas' default (sort=False) the codes follow the order of
# first appearance in the CSV, while apply_oversampling uses a fixed class
# index as its reference count — confirm the code-to-class mapping.
df['Label'], uniq = pd.factorize(df['Label'])

X, y = df[['Sentence', script]], df[['Label']]
# First split: hold out `test_size` of all rows, stratified by label.
stratifying_col = y["Label"]
X_rem, X_test, y_rem, y_test = train_test_split(X, y, test_size=test_size, stratify=stratifying_col, random_state=split_random_state)
# Second split: carve the validation set out of the remaining rows, again
# stratified by label and with the same random state.
stratifying_col = y_rem["Label"]
X_train, X_validation, y_train, y_validation = train_test_split(X_rem, y_rem, test_size=validation_size, stratify=stratifying_col, random_state=split_random_state)

In [None]:
# Drop the intermediate frames; only the train/validation/test splits are used from here on.
del df, X, y, stratifying_col, X_rem, y_rem

In [None]:
# Report the row/column counts of every split, features first and labels second.
for split_name, frame in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_validation", X_validation),
    ("y_validation", y_validation),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_name} : Rows =", frame.shape[0], ", Columns = ", frame.shape[1])

In [None]:
# print("Labels :", ['Negative', 'Neutral', 'Positive', 'Conflict'])
# Per-class sample counts of each split, ordered by integer label.
for split_name, label_frame in (
    ("Train", y_train),
    ("Validation", y_validation),
    ("Test", y_test),
):
    print(f"{split_name} :", label_frame.groupby('Label').size().tolist())

In [None]:
def  oversampling(X_train,y_train):
    """Oversample a flat list of training texts together with their labels.

    The texts are reshaped into a single-column 2-D array (the layout passed to
    ``apply_oversampling``), resampled, and flattened back into a plain list.

    Args:
        X_train: Flat sequence of training samples.
        y_train: Labels aligned with ``X_train``.

    Returns:
        Tuple ``(samples, labels)``: the resampled samples as a flat list and
        the labels exactly as returned by ``apply_oversampling``.
    """
    as_column = np.array(X_train).reshape(-1, 1)
    resampled_x, resampled_y = apply_oversampling(as_column, y_train)
    flattened = [row[0] for row in resampled_x.tolist()]
    return flattened, resampled_y

##### **Preprocess Data**

In [None]:
# Seed the random number generators before the preprocessing / oversampling cells
# so that repeated runs with the same `training_seed` are comparable.
set_seed(training_seed)

In [None]:
# Convert every split from DataFrame to nested Python lists:
# each X row becomes [sentence, script_tag], each y row becomes [label].
X_train, y_train = X_train.values.tolist(), y_train.values.tolist()
X_validation, y_validation = X_validation.values.tolist(), y_validation.values.tolist()
X_test, y_test = X_test.values.tolist(), y_test.values.tolist()

In [None]:
# Training / validation: keep only the sentence text (column 0) and the label.
X_train_arr = [features[0] for features in X_train]
y_train_arr = [target[0] for target in y_train]

X_validation_arr = [features[0] for features in X_validation]
y_validation_arr = [target[0] for target in y_validation]

# Full test set.
X_test_arr = [features[0] for features in X_test]
y_test_arr = [target[0] for target in y_test]

# Script-wise test subsets, selected by the script tag stored in column 1.
# Rows whose tag is none of the three known values go into no subset.
X_test_latin, y_test_latin = [], []
X_test_Sinhala, y_test_Sinhala = [], []
X_test_Mixed, y_test_Mixed = [], []

script_buckets = {
    "Latin": (X_test_latin, y_test_latin),
    "Sinhala": (X_test_Sinhala, y_test_Sinhala),
    "Mixed": (X_test_Mixed, y_test_Mixed),
}
for features, target in zip(X_test, y_test):
    bucket = script_buckets.get(features[1])
    if bucket is not None:
        bucket[0].append(features[0])
        bucket[1].append(target[0])

In [None]:
# Honour the `oversample_dataset` switch from the parameter cell; previously the
# flag was never read and the training split was always passed to the oversampler.
if oversample_dataset:
    X_train_arr, y_train_arr = oversampling(X_train_arr, y_train_arr)

In [None]:
# NOTE(review): `do_lower_case` is a BERT-style option; it is not clear that the
# XLM-R tokenizer applies it, so the inputs may not actually be lower-cased
# ("uncased") — confirm before relying on it.
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base", do_lower_case=True)

In [None]:
# Every split is tokenised with the same options: truncate to MAX_LEN tokens
# and pad within each call.
encode_options = dict(truncation=True, padding=True, max_length=MAX_LEN)

encoded_X_train = tokenizer(X_train_arr, **encode_options)
encoded_X_validation = tokenizer(X_validation_arr, **encode_options)
encoded_X_test = tokenizer(X_test_arr, **encode_options)

# Script-wise subsets of the test split.
encoded_X_test_latin = tokenizer(X_test_latin, **encode_options)
encoded_X_test_Sinhala = tokenizer(X_test_Sinhala, **encode_options)
encoded_X_test_Mixed = tokenizer(X_test_Mixed, **encode_options)



In [None]:
class DatasetObject(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    ``encodings`` must expose ``.items()`` yielding ``(field, values)`` pairs in
    which ``values[idx]`` belongs to sample ``idx``; ``labels[idx]`` is the
    matching label. The dataset length is the number of labels.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under the key 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenised split with its labels.
train_dataset = DatasetObject(encoded_X_train, y_train_arr)
validation_dataset = DatasetObject(encoded_X_validation, y_validation_arr)
test_dataset = DatasetObject(encoded_X_test, y_test_arr)
# Script-wise subsets of the test split (Latin / Sinhala / Mixed script tags).
test_dataset_latin = DatasetObject(encoded_X_test_latin, y_test_latin)
test_dataset_Sinhala = DatasetObject(encoded_X_test_Sinhala, y_test_Sinhala)
test_dataset_Mixed = DatasetObject(encoded_X_test_Mixed, y_test_Mixed)


In [None]:
# Training batches are drawn in random order; every evaluation loader keeps
# the dataset order.
train_sampler = RandomSampler(train_dataset)
# train_sampler = SequentialSampler(train_dataset)
validation_sampler = SequentialSampler(validation_dataset)
test_sampler = SequentialSampler(test_dataset)
# Despite the "validation_" prefix, these three samplers wrap the script-wise
# TEST subsets and feed the test_loader_* loaders below.
validation_sampler_latin = SequentialSampler(test_dataset_latin)
validation_sampler_sinhala= SequentialSampler(test_dataset_Sinhala)
validation_sampler_mixed = SequentialSampler(test_dataset_Mixed)

# All loaders share BATCH_SIZE.
train_loader = DataLoader(train_dataset, sampler=train_sampler , batch_size=BATCH_SIZE)
validation_loader = DataLoader(validation_dataset, sampler=validation_sampler , batch_size=BATCH_SIZE) 
test_loader = DataLoader(test_dataset, sampler=test_sampler , batch_size=BATCH_SIZE) 
test_loader_latin = DataLoader(test_dataset_latin, sampler=validation_sampler_latin , batch_size=BATCH_SIZE) 
test_loader_Sinhala = DataLoader(test_dataset_Sinhala, sampler=validation_sampler_sinhala , batch_size=BATCH_SIZE) 
test_loader_Mixed = DataLoader(test_dataset_Mixed, sampler=validation_sampler_mixed , batch_size=BATCH_SIZE) 

### **Fine-Tuning**

#### **Initialize the model**

In [None]:
# Sequence-classification head with one output per class.
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=NO_OUTPUT_LAYERS)
# Use the GPU when one is present and otherwise stay on the CPU, matching how
# the batches are placed (`device` is chosen with the same condition).
# The unconditional `model.cuda()` crashed on CPU-only machines.
model.to("cuda" if torch.cuda.is_available() else "cpu")
print("Done")

In [None]:
# Device every batch is moved to: the GPU when available, else the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Display the GPU name; fall back to "cpu" instead of raising when no CUDA
# device exists (the line above already allows a CPU fallback).
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

In [None]:
import gc
# del all_data, encoded_X_train, encoded_X_test, X_train, X_test, tokenizer, DatasetObject
# Release cached GPU memory held by PyTorch's allocator and run a Python
# garbage-collection pass before training starts.
torch.cuda.empty_cache()
gc.collect()

#### **Fine-tuning in PyTorch with the Trainer API**

##### **Fine-tune & Test the model**

In [None]:
# def compute_metrics(eval_pred):
#     metric1 = load_metric("precision")
#     metric2 = load_metric("recall")
#     metric3 = load_metric("f1")
#     metric4 = load_metric("accuracy")
    
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     precision = metric1.compute(predictions=predictions, references=labels, average="weighted")["precision"]
#     recall = metric2.compute(predictions=predictions, references=labels, average="weighted")["recall"]
#     f1 = metric3.compute(predictions=predictions, references=labels, average="weighted")["f1"]
#     accuracy = metric4.compute(predictions=predictions, references=labels)["accuracy"]
#     macro_precision = metric1.compute(predictions=predictions, references=labels, average="macro")["precision"]
#     macro_recall = metric2.compute(predictions=predictions, references=labels, average="macro")["recall"]
#     macro_f1 = metric3.compute(predictions=predictions, references=labels, average="macro")["f1"]
#     return {"accuracy":accuracy, "precision": precision, "recall": recall, "f1": f1, "macro_precision": macro_precision, "macro_recall": macro_recall, "macro_f1": macro_f1}

In [None]:
# Re-seed right before the training cells so the run does not depend on how
# much randomness the preprocessing cells consumed.
set_seed(training_seed)

In [None]:
# # Default Hyperparameters
# # training_args = TrainingArguments("test_trainer") 
# # learning_rate=5e-5, batch_size=8,  weight_decay=0, num_train_epochs=3

# training_args = TrainingArguments(
#     learning_rate = LEARNING_RATE,
#     per_device_train_batch_size = BATCH_SIZE,
#     per_device_eval_batch_size = BATCH_SIZE,
#     output_dir = model_save_path,
#     num_train_epochs = EPOCHS,
#     metric_for_best_model="eval_macro_f1",
#     load_best_model_at_end=True,
#     save_strategy="epoch",
#     evaluation_strategy="epoch",
#     save_total_limit=1,
#     warmup_steps=0
# )

In [None]:
# trainer = Trainer(
#     model = model, 
#     args = training_args, 
#     train_dataset = train_dataset,
#     eval_dataset = validation_dataset,
#     compute_metrics = compute_metrics,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
# )

**Train the model**

In [None]:
# trainer.train()

In [None]:
# trainer.evaluate()

In [None]:
# epoch_traces = []
# acc_traces = []
# validation_loss_traces = []

In [None]:
# for item  in trainer.state.log_history:
#   if (item.get("eval_loss") is not None) and (item.get("eval_macro_f1") is not None) and (item.get("epoch") not in epoch_traces):
#     validation_loss_traces.append(item.get("eval_loss"))
#     epoch_traces.append(item.get("epoch"))
#     acc_traces.append(item.get("eval_macro_f1"))

In [None]:
# plt.plot(range(len(epoch_traces)), acc_traces)
# plt.xlabel('Epoch')
# plt.ylabel('Macro F1-Score')
# plt.title('Epoch vs Validation Macro F1-Score')
# plt.xticks(range(len(epoch_traces)), epoch_traces)
# plt.show()

In [None]:
# plt.plot(range(len(epoch_traces)), validation_loss_traces)
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.title('Epoch vs Validation Loss')
# plt.xticks(range(len(epoch_traces)), epoch_traces)
# plt.show()

**Test the fine-tuned model**

In [None]:
# trainer = Trainer(
#     model = model, 
#     args = training_args, 
#     train_dataset = train_dataset,
#     eval_dataset = test_dataset,
#     compute_metrics = compute_metrics,
# )

# trainer.evaluate()

In [None]:
# trainer = Trainer(
#     model = model, 
#     args = training_args, 
#     train_dataset = train_dataset,
#     eval_dataset = test_dataset_latin,
#     compute_metrics = compute_metrics,
# )

# trainer.evaluate()

In [None]:
# trainer = Trainer(
#     model = model, 
#     args = training_args, 
#     train_dataset = train_dataset,
#     eval_dataset = test_dataset_Sinhala,
#     compute_metrics = compute_metrics,
# )

# trainer.evaluate()

In [None]:
# trainer = Trainer(
#     model = model, 
#     args = training_args, 
#     train_dataset = train_dataset,
#     eval_dataset = test_dataset_Mixed,
#     compute_metrics = compute_metrics,
# )

# trainer.evaluate()

#### **Fine tuning with native PyTorch**

In [None]:
def compute_metrics(allpreds,alllabels):
    """Score predictions against references with four loaded metrics.

    Args:
        allpreds: Predicted class ids.
        alllabels: Reference class ids aligned with ``allpreds``.

    Returns:
        Dict with ``accuracy``, the weighted-average ``precision`` / ``recall``
        / ``f1`` and their macro-averaged counterparts ``macro_precision`` /
        ``macro_recall`` / ``macro_f1``.
    """
    averaged_names = ("precision", "recall", "f1")
    scorers = {name: load_metric(name) for name in averaged_names + ("accuracy",)}

    def score(name, **options):
        # Each metric reports its value under its own name.
        result = scorers[name].compute(predictions=allpreds, references=alllabels, **options)
        return result[name]

    weighted = {name: score(name, average="weighted") for name in averaged_names}
    accuracy = score("accuracy")
    macro = {"macro_" + name: score(name, average="macro") for name in averaged_names}
    return {"accuracy": accuracy, **weighted, **macro}


In [None]:
def evaluate(model, dataloader):
    """Run ``model`` over ``dataloader`` and return the full metric report.

    The model is switched to eval mode (and left there). Every batch is moved
    to the module-level ``device``; the arg-max over the logits is taken as
    the predicted class.

    Args:
        model: Callable accepting the batch as keyword arguments and returning
            an object with a ``logits`` attribute.
        dataloader: Iterable of dict batches of tensors that include ``labels``.

    Returns:
        Whatever ``compute_metrics`` returns for the collected predictions and
        labels.
    """
    model.eval()
    allpreds = []
    alllabels = []

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            # Collect plain Python ints on the host. The previous code extended
            # the lists with 0-d device tensors, which the metric code then had
            # to convert one element at a time.
            allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
            alllabels.extend(batch["labels"].cpu().tolist())
    return compute_metrics(allpreds, alllabels)

In [None]:
def calculate_loss_and_f1(model, dataloader):
    """Return the macro-F1 and the mean per-batch loss of ``model`` on ``dataloader``.

    The model is switched to eval mode (and left there). Every batch is moved
    to the module-level ``device``; the arg-max over the logits is taken as
    the predicted class.

    Args:
        model: Callable accepting the batch as keyword arguments and returning
            an object with ``logits`` and ``loss`` attributes.
        dataloader: Sized iterable of dict batches of tensors that include
            ``labels``.

    Returns:
        Tuple ``(macro_f1, mean_loss)`` where ``mean_loss`` is the sum of the
        batch losses divided by the number of batches.
    """
    model.eval()
    allpreds = []
    alllabels = []
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            total_loss += outputs.loss.item()
            # Collect plain Python ints on the host rather than 0-d device
            # tensors, so the metric receives ordinary integer lists.
            allpreds.extend(torch.argmax(outputs.logits, dim=-1).cpu().tolist())
            alllabels.extend(batch["labels"].cpu().tolist())

    macro_f1 = load_metric("f1").compute(predictions=allpreds, references=alllabels, average="macro")["f1"]
    return macro_f1, (total_loss/len(dataloader))

In [None]:
# Parameters whose names contain one of these substrings are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.weight']
# One scheduler step per optimizer step, over all epochs.
num_training_steps = EPOCHS * len(train_loader)
betas = (0.9, 0.999)
eps = 1e-08
num_warmup_steps = 0
param_optimizer = list(model.named_parameters())

# The per-group option must be spelled 'weight_decay'. The previous key,
# 'weight_decay_rate', is not read by AdamW, so neither group received the
# decay written here and the optimizer default was used for both.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, betas=betas, eps=eps)
# Linear decay to zero after `num_warmup_steps` warm-up steps (the variable is
# now used instead of a duplicated literal 0).
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

In [None]:
# Running sum of the (accumulation-scaled) training loss, and its value at the
# last progress-bar refresh; their difference gives the displayed average loss.
tot_loss = 0
log_loss = 0
# Best validation macro-F1 seen so far. Start below every possible score so the
# first evaluated epoch always writes a checkpoint: with the previous start of 0
# a run whose macro-F1 stayed at 0 never saved one, and the reload step then
# failed or picked up a stale file.
best_val_acc = float("-inf")

tot_train_time = 0
# Refresh the progress bar every this many optimizer steps.
pbar_update_freq = 10

# glb_step counts optimizer steps, actual_step counts processed batches.
glb_step = 0
actual_step = 0
# Gradient-norm clipping threshold.
max_grad_norm = 1.0
eval_every_steps = 100
# Number of batches whose gradients are accumulated per optimizer step.
gradient_accumulation_steps = 1

leave_training = False
val_metric = "macro_f1"

# Epoch index of the best checkpoint; training stops once this many epochs
# pass without an improvement.
best_epoch = -1
early_stop_epoch_thresh = 5

# Per-epoch history used by the plots after training.
epoch_traces = []
acc_traces = []
validation_loss_traces = []

In [None]:
# Fine-tuning loop: one pass over train_loader per epoch, validation after every
# epoch, checkpoint on improvement, early stop after `early_stop_epoch_thresh`
# epochs without improvement.
pbar = tqdm(total=num_training_steps, desc="Train")
for epoch in range(EPOCHS):
    print(f"Begin Epoch {epoch}")
    epoch_start_time = time.time()
    # The validation pass at the end of every epoch calls model.eval(), and
    # nothing switched the model back, so training ran with dropout disabled.
    # Re-enable training mode at the start of each epoch.
    model.train()
    for step, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # Scale so that accumulated gradients average over the accumulation window.
        loss = loss / gradient_accumulation_steps
        loss.backward()
        tot_loss += loss.item()
        actual_step += 1

        if actual_step % gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            glb_step += 1

            if glb_step % pbar_update_freq == 0:
                aveloss = (tot_loss - log_loss)/pbar_update_freq
                pbar.update(pbar_update_freq)
                pbar.set_postfix({'Average Loss': aveloss, "Epoch": epoch})
                log_loss = tot_loss

            if optimizer is not None:
                optimizer.step()
                optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()

        if glb_step > num_training_steps:
            leave_training = True
            break

    val_acc, val_loss = calculate_loss_and_f1(model, validation_loader)
    epoch_traces.append(epoch)
    acc_traces.append(val_acc)
    validation_loss_traces.append(val_loss)
    print("Validation: [Epoch: {}, Macro F1: {}, Validation Loss: {}, Time per Epoch: {}]".format(epoch, val_acc, val_loss, time.time()-epoch_start_time), flush=True)

    if val_acc > best_val_acc:
        # New best validation macro-F1: keep these weights.
        torch.save(model.state_dict(), "best_model.ckpt")
        best_val_acc = val_acc
        best_epoch = epoch

    elif (epoch - best_epoch) >= early_stop_epoch_thresh:
        print("Training stopped early at Epoch: %d" % epoch)
        break  # Terminate the training loop

    if leave_training:
        break

# Release the progress bar on every exit path (normal end, early stop, step cap).
pbar.close()

In [None]:
# Restore the weights of the best validation epoch. `map_location` lets a
# checkpoint written from GPU tensors load onto whichever `device` is in use,
# and `.to(device)` replaces the unconditional `.cuda()` for the same reason.
model.load_state_dict(torch.load("best_model.ckpt", map_location=device))
model.to(device)
print("Done")

_Validate the Model_


In [None]:
# Metric report of the reloaded best checkpoint on the validation split.
evaluate(model, validation_loader)

In [None]:
# Validation macro-F1 per evaluated epoch; ticks are labelled with the epoch index.
tick_positions = range(len(epoch_traces))
plt.plot(tick_positions, acc_traces)
plt.title('Epoch vs Validation Macro F1-Score')
plt.xlabel('Epoch')
plt.ylabel('Macro F1-Score')
plt.xticks(tick_positions, epoch_traces)
plt.show()

In [None]:
# Mean validation loss per evaluated epoch; ticks are labelled with the epoch index.
tick_positions = range(len(epoch_traces))
plt.plot(tick_positions, validation_loss_traces)
plt.title('Epoch vs Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.xticks(tick_positions, epoch_traces)
plt.show()

## Test


In [None]:
# Metric report on the complete held-out test split.
evaluate(model, test_loader)

**Script-Wise Evaluation**


In [None]:
# Test sentences whose script tag is "Latin".
evaluate(model, test_loader_latin)

In [None]:
# Test sentences whose script tag is "Sinhala".
evaluate(model, test_loader_Sinhala)

In [None]:
# Test sentences whose script tag is "Mixed".
evaluate(model, test_loader_Mixed)