<a href="https://colab.research.google.com/github/HimashiRathnayake/CMCS-Text-Classification/blob/main/Basic_Fine_Tuning/XLM_R_Sentence_Level_Tasks_Basic_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fine Tune XLM-R 
Humor detection, hate speech detection, and sentiment analysis of Sinhala-English code-mixed data.

In [None]:
import torch
# Show the name of CUDA device 0 as the cell output to confirm a GPU runtime is attached.
print("GPU Device name")
torch.cuda.get_device_name(0)

GPU Device name


'Tesla K80'

### **Parameters**

**User Parameters**

In [None]:
technique = "sentiment" #@param ["humor", "hate speech", "sentiment"]
experiment_no = "1" #@param [] {allow-input: true}
oversample_dataset = False #@param {type:"boolean"}
over_sampling_technique = "ROS" #@param ["", "ROS","ADASYN", "SMOTE", "BorderlineSMOTE"]
sampling_strategy = "1:0.25:0.25" #@param [] {allow-input: true}
random_state = 41 #@param

# Label inventory of the selected task; any value other than "humor" or
# "hate speech" falls through to the sentiment labels.
tag_set = (
  ["Humorous", "Non-Humorous"] if technique == "humor"
  else ["Abusive", "Hate-Inducing", "Not offensive"] if technique == "hate speech"
  else ["Positive", "Negative", "Neutral", "Conflict"]
)
# Size of the classification head (later passed as `num_labels`): one unit per label.
NO_OUTPUT_LAYERS = len(tag_set)

In [None]:
# Maximum tokenized sequence length; longer inputs are truncated by the tokenizer call.
MAX_LEN = 128
# Per-device batch size used for both training and evaluation.
BATCH_SIZE = 32
# Learning rate handed to TrainingArguments.
LEARNING_RATE = 5e-5
# Upper bound on training epochs; the early-stopping callback may end training sooner.
EPOCHS = 6

**Folder Paths**

In [None]:
# CSV corpus read by pd.read_csv below.
# NOTE(review): the file name starts with "ç" rather than "c" — confirm it matches the file on Drive.
dataset_path = "/content/drive/Shareddrives/FYP/corpus/çompleted_draft.csv"
# Checkpoint/output directory for the Trainer, keyed by task and experiment number.
model_save_path = "/content/drive/Shareddrives/FYP/Humor_HateSpeech_detection/XLMR/"+technique+"/"+experiment_no

**Dependencies**

In [None]:
# Install the Hugging Face libraries and SentencePiece into the runtime.
# NOTE(review): versions are not pinned, so a fresh run may pull releases newer than the ones this notebook was written against.
!pip install transformers
!pip install sentencepiece
!pip install datasets
# !pip install optuna



In [None]:
import re
import time
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, TrainingArguments, Trainer, AdamW, get_scheduler, EarlyStoppingCallback
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from google.colab import drive
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
import io
import seaborn as sns
from datasets import load_metric
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE
import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
# Mount Google Drive so that dataset_path and model_save_path (both under /content/drive) are reachable.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### **Oversampling**

In [None]:
def apply_oversampling(x, y, method=None, strategy=None, task=None, seed=None):
  """Rebalance a training set with one of the supported imbalanced-learn over-samplers.

  Every keyword argument is optional and falls back to the notebook-level user
  parameter of the same meaning, so ``apply_oversampling(x, y)`` keeps working.

  Args:
    x: 2-D array-like of features handed to ``fit_resample`` (the notebook
      passes an (n, 1) array holding one sentence per row).
    y: class labels aligned with ``x``.
    method: "", "ROS", "ADASYN", "SMOTE" or "BorderlineSMOTE"
      (default: ``over_sampling_technique``). "" disables oversampling.
    strategy: for "ROS" only. A single float string for the humor task, or
      colon-separated ratios such as "1:0.25:0.25" for the other tasks
      (default: ``sampling_strategy``).
    task: task name; "humor" selects the float strategy (default: ``technique``).
    seed: random state passed to the sampler (default: ``random_state``).

  Returns:
    ``(x, y)`` unchanged when ``method`` is "", otherwise the resampled
    ``(X_over, y_over)`` pair returned by the sampler.

  Raises:
    ValueError: if ``method`` is not one of the supported names, or if more
      ratios are supplied than there are classes in ``y``.
  """
  # Resolve the method first so the pass-through and error paths need no other setting.
  if method is None:
    method = over_sampling_technique

  _, counts = np.unique(y, axis=0, return_counts=True)
  print("Class Distribution Without Oversampling", counts)

  if method == "":
    return x, y
  if method not in ("ROS", "ADASYN", "SMOTE", "BorderlineSMOTE"):
    # Previously an unknown name fell through and failed later with an
    # UnboundLocalError on `oversample`.
    raise ValueError("Unknown over_sampling_technique: {!r}".format(method))

  # Seed every sampler so that repeated runs produce the same resampled set.
  if seed is None:
    seed = random_state

  if method == "ROS":
    if strategy is None:
      strategy = sampling_strategy
    if task is None:
      task = technique

    if task == "humor":
      oversample = RandomOverSampler(sampling_strategy=float(strategy), random_state=seed)
    else:
      ratios = [float(part) for part in strategy.split(":")]
      if len(ratios) > len(counts):
        raise ValueError(
            "sampling_strategy has {} ratios but only {} classes are present".format(
                len(ratios), len(counts)))
      # Each target is expressed relative to the size of class 0.
      # Assumes labels are the 0-based integer codes produced by pd.factorize,
      # so position i in `counts` is class i — TODO confirm if labels change.
      # A target is never set below the current class size, because
      # over-samplers reject requests for fewer samples than already exist.
      targets = {
          index: max(int(counts[index]), int(counts[0] * ratio))
          for index, ratio in enumerate(ratios)
      }
      oversample = RandomOverSampler(sampling_strategy=targets, random_state=seed)
  elif method == "ADASYN":
    # NOTE(review): ADASYN/SMOTE/BorderlineSMOTE interpolate numeric features;
    # presumably they cannot be applied to raw sentence strings — confirm.
    oversample = ADASYN(sampling_strategy="minority", random_state=seed)
  elif method == "SMOTE":
    oversample = SMOTE(random_state=seed)
  else:
    oversample = BorderlineSMOTE(random_state=seed)

  # fit and apply the transform
  X_over, y_over = oversample.fit_resample(x, y)

  _, counts = np.unique(y_over, axis=0, return_counts=True)
  print("Class Distribution After Oversampling", counts)

  return X_over, y_over

### **Load & Preprocess Dataset**

In [None]:
def preprocess_texts(sentences):
  """Strip URLs and '#' characters from each sentence and wrap it in marker tokens.

  Args:
    sentences: iterable of strings.

  Returns:
    A new list in which every sentence has each ``http...`` run (up to the next
    whitespace) removed, every '#' removed, and the literal markers "[CLS] " /
    " [SEP]" added around it.
  """
  cleaned = []
  for sentence in sentences:
    # Drop links first, then the hash signs, exactly in that order.
    body = re.sub(r'http\S+', '', sentence).replace('#', '')
    cleaned.append("[CLS] " + body + " [SEP]")
  return cleaned

In [None]:
# Read the corpus and keep only the sentence column plus the label column of
# the selected task (anything other than humor / hate speech means sentiment).
all_data = pd.read_csv(dataset_path)[[
    'Sentence',
    {"humor": "Humor", "hate speech": "Hate_speech"}.get(technique, "Sentiment"),
]]
all_data.columns = ['Sentence', 'Label']

# Replace label strings by integer codes; `uniq` keeps the code -> label mapping.
# NOTE(review): codes follow order of first appearance in the file, so they do
# not necessarily line up with the order of `tag_set` — confirm before mapping back.
all_data['Label'], uniq = pd.factorize(all_data['Label'])

X = all_data['Sentence'].tolist()
y = all_data['Label'].tolist()

# X = preprocess_texts(X)

In [None]:
# Hold out 10% of the data; the split is seeded with the user-chosen random_state.
# NOTE(review): the split is not stratified — consider whether rare classes are adequately represented in the held-out part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = random_state)

In [None]:
# Oversampling is applied to the training split only, and only when enabled in the form.
if oversample_dataset:
  # Wrap each sentence in its own row so the sampler receives a 2-D (n, 1) array.
  X_train = np.array(X_train).reshape(-1, 1)
  X_train, y_train = apply_oversampling(X_train, y_train)
  # Unwrap the single-column rows back into a flat list of sentences.
  X_train = [x[0] for x in X_train.tolist()]
# y_train = y_train.tolist()

##### **Preprocess Data**

In [None]:
# Load the tokenizer that matches the pretrained "xlm-roberta-base" checkpoint.
# NOTE(review): do_lower_case is presumably not honoured by the SentencePiece-based XLM-R tokenizer, so the input is likely NOT lowercased — confirm before relying on "uncased" behaviour.
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base", do_lower_case=True) # intended: uncased

In [None]:
# Tokenize both splits, truncating to at most MAX_LEN tokens and padding each split.
encoded_X_train = tokenizer(X_train, truncation=True, padding=True, max_length=MAX_LEN)
encoded_X_test = tokenizer(X_test, truncation=True, padding=True, max_length=MAX_LEN)

In [None]:
class DatasetObject(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is an indexable collection of the same length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field converted to a tensor, plus 'labels'.
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap the tokenized splits and their labels for use by the Trainer and the data loaders.
train_dataset = DatasetObject(encoded_X_train, y_train)
test_dataset = DatasetObject(encoded_X_test, y_test)

In [None]:
# Data loaders: random order for training, fixed order for the held-out split.
# In the visible notebook they are only referenced by the commented-out native PyTorch loop.
train_sampler = RandomSampler(train_dataset)
validation_sampler = SequentialSampler(test_dataset)
train_loader = DataLoader(train_dataset, sampler=train_sampler , batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, sampler=validation_sampler , batch_size=BATCH_SIZE) 

### **Fine-Tuning**

#### **Initialize the model**

In [None]:
# Load pretrained XLM-R base with a classification head sized to the selected task's label count.
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=NO_OUTPUT_LAYERS)
# Moves the model to the GPU unconditionally, so this cell requires a CUDA runtime.
model.cuda()
print("Done")

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

Done


In [None]:
# Choose the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Unlike the lines above, this assumes CUDA device 0 exists.
torch.cuda.get_device_name(0)

'Tesla K80'

In [None]:
import gc
# del all_data, encoded_X_train, encoded_X_test, X_train, X_test, tokenizer, DatasetObject
# Release cached GPU memory and run a garbage collection pass before training;
# the cell output is the value returned by gc.collect().
torch.cuda.empty_cache()
gc.collect()

525

#### **Fine-tuning in PyTorch with the Trainer API**

##### **Fine-tune & Test the model**

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the predicted class of each
            example is the argmax over the last axis of ``logits``.

    Returns:
        A dict with ``accuracy``, the weighted-average ``precision`` /
        ``recall`` / ``f1`` and their macro-average counterparts
        ``macro_precision`` / ``macro_recall`` / ``macro_f1``.
    """
    averaged = ("precision", "recall", "f1")
    # Metrics are loaded on every call, in the same order as before.
    scorers = {name: load_metric(name) for name in averaged + ("accuracy",)}

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    def score(name, **options):
        # Each metric returns a single-entry dict keyed by its own name.
        result = scorers[name].compute(predictions=predictions, references=labels, **options)
        return result[name]

    weighted = {name: score(name, average="weighted") for name in averaged}
    accuracy = score("accuracy")
    macro = {"macro_" + name: score(name, average="macro") for name in averaged}

    return {"accuracy": accuracy, **weighted, **macro}

In [None]:
# Default Hyperparameters
# training_args = TrainingArguments("test_trainer") 
# learning_rate=5e-5, batch_size=8,  weight_decay=0, num_train_epochs=3

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best eval_macro_f1 when training finishes.
# Checkpoints are written under model_save_path (on the mounted Drive).
training_args = TrainingArguments(
    learning_rate = LEARNING_RATE,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,
    output_dir = model_save_path,
    num_train_epochs = EPOCHS,
    metric_for_best_model="eval_macro_f1",
    load_best_model_at_end=True,
    save_strategy="epoch",
    evaluation_strategy="epoch"
)

In [None]:
# Build the Trainer with early stopping (patience of 2 evaluations).
# NOTE(review): the held-out test split doubles as the evaluation set used for
# early stopping and best-checkpoint selection, so the reported test scores are
# not from data unseen during model selection — confirm this is intended.
trainer = Trainer(
    model = model, 
    args = training_args, 
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    compute_metrics = compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

**Train the model**

In [None]:
# Run fine-tuning; per-epoch metrics come from compute_metrics on the evaluation set.
trainer.train()

***** Running training *****
  Num examples = 12166
  Num Epochs = 6
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2286


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Macro Precision,Macro Recall,Macro F1
1,No log,0.694349,0.701923,0.709078,0.701923,0.703068,0.472729,0.485939,0.477476
2,0.782500,0.595435,0.754438,0.765133,0.754438,0.753783,0.541857,0.523426,0.5249
3,0.562000,0.609634,0.778107,0.78619,0.778107,0.777761,0.556107,0.553365,0.550268
4,0.446200,0.564531,0.79068,0.790312,0.79068,0.788603,0.55292,0.554813,0.551889
5,0.446200,0.596185,0.798077,0.795871,0.798077,0.795641,0.683796,0.583107,0.596241
6,0.327200,0.654966,0.801775,0.804043,0.801775,0.801904,0.711875,0.639401,0.657265


***** Running Evaluation *****
  Num examples = 1352
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to /content/drive/Shareddrives/FYP/Humor_HateSpeech_detection/XLMR/sentiment/1/checkpoint-381
Configuration saved in /content/drive/Shareddrives/FYP/Humor_HateSpeech_detection/XLMR/sentiment/1/checkpoint-381/config.json
Model weights saved in /content/drive/Shareddrives/FYP/Humor_HateSpeech_detection/XLMR/sentiment/1/checkpoint-381/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1352
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to /content/drive/Shareddrives/FYP/Humor_HateSpeech_detection/XLMR/sentiment/1/checkpoint-762
Configuration saved in /content/drive/Shareddrives/FYP/Humor_HateSpeech_detection/XLMR/sentiment/1/checkpoint-762/config.json
Model weights save

TrainOutput(global_step=2286, training_loss=0.4978538339636562, metrics={'train_runtime': 3578.6169, 'train_samples_per_second': 20.398, 'train_steps_per_second': 0.639, 'total_flos': 4801599870971904.0, 'train_loss': 0.4978538339636562, 'epoch': 6.0})

**Test the fine-tuned model**

In [None]:
# Evaluate the model currently held by the trainer on its eval_dataset (the held-out split).
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1352
  Batch size = 32


{'epoch': 6.0,
 'eval_accuracy': 0.8017751479289941,
 'eval_f1': 0.8019036629624959,
 'eval_loss': 0.6549655795097351,
 'eval_macro_f1': 0.6572647652825141,
 'eval_macro_precision': 0.7118746106700379,
 'eval_macro_recall': 0.6394008390605737,
 'eval_precision': 0.8040434184713193,
 'eval_recall': 0.8017751479289941,
 'eval_runtime': 21.1625,
 'eval_samples_per_second': 63.887,
 'eval_steps_per_second': 2.032}

In [None]:
# predictions = trainer.predict(test_dataset)

In [None]:
# preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
# test_set = {'Sentence': X_test,
#         'Label':predictions.label_ids,
#         'Prediction':preds}
  
# # Create DataFrame
# df = pd.DataFrame(test_set)

In [None]:
# df

In [None]:
# model.save_pretrained("/content/drive/Shareddrives/FYP/final_models/xlmr-st/hate")

##### **Hyperparameter Search**

In [None]:
# args = TrainingArguments(
#     "test-glue",
#     evaluation_strategy = "epoch",
#     save_strategy = "epoch",
#     learning_rate=LEARNING_RATE,
#     per_device_train_batch_size=BATCH_SIZE,
#     per_device_eval_batch_size=BATCH_SIZE,
#     num_train_epochs=EPOCHS,
#     weight_decay=0.01,
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_macro_f1",
# )

In [None]:
# def my_hp_space(trial):
#     return {
#         # "learning_rate": trial.suggest_float("learning_rate", 1e-7, 1e-3, log=True),
#         "num_train_epochs": trial.suggest_discrete_uniform("num_train_epochs", 1, 5, 1),
#         # "seed": trial.suggest_int("seed", 1, 40),
#         # "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16, 32, 64]),
#     }

In [None]:
# trainer = Trainer(
#     model_init=model_init,
#     args=args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     # tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )

# best_run = trainer.hyperparameter_search(n_trials=5, direction="maximize", hp_space=my_hp_space)

#### **Fine tuning with native PyTorch**

**Fine-tune the model**

In [None]:
# def epoch_time(start_time, end_time):
#   elapsed_time = end_time - start_time
#   elapsed_mins = int(elapsed_time / 60)
#   elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
#   return elapsed_mins, elapsed_secs

In [None]:
# # apply different hyperpameters for specific parameter groups
# # param_optimizer = list(model.named_parameters())

# # optimizer_grouped_parameters = [
# #     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
# #      'weight_decay_rate': 0.01},
# #     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
# #      'weight_decay_rate': 0.0}
# # ]
# # optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE)
# optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
# train_loss_set = []
# num_training_steps = EPOCHS * len(train_loader)
# progress_bar = tqdm(range(num_training_steps))
# model.train()

# for epoch in range(EPOCHS):
#   start_time = time.time()

#   tr_loss = 0
#   nb_tr_steps = 0
  
#   for batch in train_loader:

#     batch = {k: v.to(device) for k, v in batch.items()}
#     outputs = model(**batch)
#     loss = outputs.loss
#     loss.backward()

#     optimizer.step()
#     optimizer.zero_grad()
#     progress_bar.update(1)

#     train_loss_set.append(loss.item())    
    
#     tr_loss += loss.item()
#     nb_tr_steps += 1

#   end_time = time.time()

#   print(epoch_time(start_time,end_time))
#   print("\nTrain loss: {}".format(tr_loss/nb_tr_steps))

**Validate the model**

In [None]:
# accuracy = load_metric("accuracy")
# precision = load_metric("precision")
# recall = load_metric("recall")
# f1 = load_metric("f1")
# macro_precision = load_metric("precision")
# macro_recall = load_metric("recall")
# macro_f1 = load_metric("f1")

# model.eval()
# for batch in test_loader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     with torch.no_grad():
#         outputs = model(**batch)

#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     accuracy.add_batch(predictions=predictions, references=batch["labels"])
#     precision.add_batch(predictions=predictions, references=batch["labels"])
#     recall.add_batch(predictions=predictions, references=batch["labels"])
#     f1.add_batch(predictions=predictions, references=batch["labels"])
#     macro_precision.add_batch(predictions=predictions, references=batch["labels"])
#     macro_recall.add_batch(predictions=predictions, references=batch["labels"])
#     macro_f1.add_batch(predictions=predictions, references=batch["labels"])

# print(accuracy.compute())
# print(precision.compute(average="weighted"))
# print(recall.compute(average="weighted"))
# print(f1.compute(average="weighted"))
# print("macro averages:")
# print(macro_precision.compute(average="macro"))
# print(macro_recall.compute(average="macro"))
# print(macro_f1.compute(average="macro"))