<a href="https://colab.research.google.com/github/HimashiRathnayake/Hate-Speech-Humor-Detection/blob/main/XLM-R/XLM-R%20Humor%20Hate%20Speech%20Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fine-Tune XLM-R
Humor detection and hate speech detection on Sinhala-English code-mixed data.

### **Parameters**

**User Parameters**

In [None]:
# Colab form fields: choose the task, an experiment tag, and how the training
# set is oversampled (empty string = no oversampling).
technique = "hate speech" #@param ["humor", "hate speech"]
experiment_no = "1" #@param [] {allow-input: true}
over_sampling_technique = "ROS" #@param ["", "ROS","ADASYN", "SMOTE", "BorderlineSMOTE"]
sampling_strategy = "1:0.25:0.25" #@param [] {allow-input: true}

# Number of classes and their names for the selected task.
if technique == "humor":
  NO_OUTPUT_LAYERS = 2
  tag_set = ["Humorous", "Non-Humorous"]
elif technique == "hate speech":
  NO_OUTPUT_LAYERS = 3
  tag_set = ["Abusive", "Hate-Inducing", "Not offensive"]
else:
  # Fail here rather than with a NameError in a much later cell.
  raise ValueError(
      "Unknown technique {!r}; expected 'humor' or 'hate speech'".format(technique))

In [None]:
# Input shaping
MAX_LEN = 128     # sequences are padded / truncated to this many token ids
BATCH_SIZE = 32   # examples per DataLoader batch

# Optimisation (earlier runs tried LEARNING_RATE = 2e-5 and, initially, EPOCHS = 3)
LEARNING_RATE = 5e-5
EPOCHS = 5

**Folder Paths**

In [None]:
# CSV corpus read by pd.read_csv further down; lives on the mounted Google Drive.
# NOTE(review): the file name starts with "ç" rather than "c" - confirm it matches the file on Drive.
dataset_path = "/content/drive/Shareddrives/FYP/corpus/çompleted_draft.csv"

**Dependencies**

In [None]:
# !pip install transformers
# !pip install sentencepiece
# # !pip install optuna

In [None]:
import re
import time
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaModel, XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import AdamW
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE
# from transformers.optimization import get_linear_scheduler_with_warmup
% matplotlib inline



In [None]:
# Mount Google Drive so that dataset_path (under /content/drive) can be read.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### **Oversampling**

In [None]:
def apply_oversampling(x, y, random_state=None):
  """Oversample the training set according to the notebook's form settings.

  Reads the notebook-level variables ``over_sampling_technique``,
  ``technique`` and ``sampling_strategy``.

  Args:
    x: Feature matrix with one row per example, in whatever form the
      selected sampler's ``fit_resample`` accepts.
    y: Class label for every row of ``x``.
    random_state: Optional seed forwarded to the sampler so the resampling
      can be reproduced. ``None`` (the default) leaves the sampler unseeded.

  Returns:
    ``(x, y)`` unchanged when ``over_sampling_technique`` is ``""``,
    otherwise the ``(X_over, y_over)`` pair returned by ``fit_resample``.

  Raises:
    ValueError: if ``over_sampling_technique`` is not one of the supported
      names, or if the multi-class ``sampling_strategy`` does not supply
      exactly one ratio per class found in ``y``.
  """
  (unique, counts) = np.unique(y, axis=0, return_counts=True)
  print("Class Distribution Without Oversampling", counts)

  if over_sampling_technique == "":
    return x, y

  if over_sampling_technique == "ROS":
    if technique == "humor":
      # Binary task: a single float is passed straight to the sampler.
      oversample = RandomOverSampler(
          sampling_strategy=float(sampling_strategy), random_state=random_state)
    else:
      # Multi-class task: "a:b:c" gives, per class, the desired size as a
      # multiple of the FIRST class's count (counts[0]).
      ratios = [float(part) for part in sampling_strategy.split(":")]
      if len(ratios) != len(unique):
        raise ValueError(
            "sampling_strategy {!r} has {} ratios but y contains {} classes".format(
                sampling_strategy, len(ratios), len(unique)))
      targets = {
          label: int(counts[0] * ratio)
          for label, ratio in zip(unique.tolist(), ratios)
      }
      # `sampling_strategy` is the keyword that replaced the older `ratio`.
      oversample = RandomOverSampler(
          sampling_strategy=targets, random_state=random_state)
  elif over_sampling_technique == "ADASYN":
    # NOTE(review): the caller currently passes raw sentence strings in x -
    # confirm the ADASYN / SMOTE options are usable with that input.
    oversample = ADASYN(sampling_strategy="minority", random_state=random_state)
  elif over_sampling_technique == "SMOTE":
    oversample = SMOTE(random_state=random_state)
  elif over_sampling_technique == "BorderlineSMOTE":
    oversample = BorderlineSMOTE(random_state=random_state)
  else:
    raise ValueError(
        "Unknown over_sampling_technique {!r}".format(over_sampling_technique))

  # fit and apply the transform
  X_over, y_over = oversample.fit_resample(x, y)

  (unique, counts) = np.unique(y_over, axis=0, return_counts=True)
  print("Class Distribution After Oversampling", counts)

  return X_over, y_over

### **Load & Preprocess Dataset**

In [None]:
# import gc
# del variables
# gc.collect()

# Ask PyTorch to release cached, currently unused GPU memory before loading the model.
torch.cuda.empty_cache()

In [None]:
# Count visible GPUs and pick the device every tensor batch is moved to.
n_gpu = torch.cuda.device_count()
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
# torch.cuda.get_device_name(0)

In [None]:
def preprocess_texts(sentences):
  """Strip URLs and '#' characters, then wrap each sentence in marker strings.

  Args:
    sentences: Iterable of sentence strings.

  Returns:
    A new list with one cleaned string per input sentence, each of the form
    "[CLS] <text> [SEP]".
  """
  # NOTE(review): "[CLS]" / "[SEP]" are inserted as plain text; confirm the
  # tokenizer in use treats them as special tokens before enabling this step.
  cleaned = []
  for text in sentences:
    without_urls = re.sub(r'http\S+', '', text)
    without_hashes = without_urls.replace('#', '')
    cleaned.append("[CLS] " + without_hashes + " [SEP]")
  return cleaned

In [None]:
# Keep only the sentence text and the label column of the selected task.
# The explicit .copy() makes df_train an independent frame, so the column
# assignment below does not write into a slice of the frame that was read.
label_column = 'Humor' if technique == "humor" else 'Hate_speech'
df_train = pd.read_csv(dataset_path)[['Sentence', label_column]].copy()
df_train.columns = ['Sentence', 'Label']

# Encode labels as integer ids in order of first appearance; `uniq` keeps the
# original label values so ids can be mapped back (uniq[id]).
df_train['Label'], uniq = pd.factorize(df_train['Label'])

X = df_train['Sentence'].values.tolist()
y = df_train['Label'].values.tolist()
# X = preprocess_texts(X) 

# Hold out 10% for evaluation. The classes are heavily imbalanced, so the
# split is stratified to keep every class represented in the held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=0, stratify=y)

# Samplers expect a 2-D feature matrix: wrap each sentence in its own row,
# oversample the training part only, then unwrap back to a flat list.
X_train = np.array(X_train).reshape(-1, 1)
X_train, y_train = apply_oversampling(X_train, y_train)
X_train = [x[0] for x in X_train.tolist()]

Class Distribution Without Oversampling [11030   315   821]


In [None]:
# Load the pretrained tokenizer that belongs to the xlm-roberta-base checkpoint.
# NOTE(review): do_lower_case looks like a BERT-tokenizer option - confirm whether XLMRobertaTokenizer honours it.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base', do_lower_case=True)

In [None]:
def _encode_texts(texts):
  """Turn sentences into fixed-length id and attention-mask tensors.

  Calling the tokenizer directly (instead of tokenize + convert_tokens_to_ids
  + manual zero padding) adds the model's start/end special tokens, pads with
  the tokenizer's own pad id and returns a matching attention mask.

  Args:
    texts: Iterable of sentence strings.

  Returns:
    Tuple ``(input_ids, attention_mask)``, both tensors with MAX_LEN columns.
  """
  encoded = tokenizer(
      list(texts),
      add_special_tokens=True,
      padding="max_length",
      truncation=True,
      max_length=MAX_LEN,
      return_tensors="pt")
  return encoded["input_ids"], encoded["attention_mask"]


train_inputs, train_masks = _encode_texts(X_train)
validation_inputs, validation_masks = _encode_texts(X_test)

train_labels = torch.tensor(y_train)
validation_labels = torch.tensor(y_test)

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

# Validation keeps the original order so predictions line up with y_test.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)

### **Fine-Tuning**

#### **Initialize the model**

In [None]:
# Size the classification head from the selected task (NO_OUTPUT_LAYERS)
# instead of a hard-coded class count.
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=NO_OUTPUT_LAYERS)
# Move to the same device the batches are sent to; unlike .cuda() this also
# works when no GPU is available.
model.to(device)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

#### **Fine-tuning with native PyTorch**

**Fine-tune the model**

In [None]:
def epoch_time(start_time, end_time):
  """Split the span between two timestamps into whole minutes and seconds.

  Args:
    start_time: Start timestamp in seconds.
    end_time: End timestamp in seconds.

  Returns:
    Tuple ``(minutes, seconds)`` of ints; fractional seconds are truncated.
  """
  duration = end_time - start_time
  whole_minutes = int(duration / 60)
  leftover_seconds = int(duration - (60 * whole_minutes))
  return whole_minutes, leftover_seconds

In [None]:
# param_optimizer = list(model.named_parameters())
# no_decay = ['bias', 'gamma', 'beta']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
#      'weight_decay_rate': 0.01},
#     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
#      'weight_decay_rate': 0.0}
# ]

# optimizer = AdamW(optimizer_grouped_parameters,lr=LEARNING_RATE)
# Plain AdamW over all model parameters (the grouped-parameter variant above is disabled).
# NOTE(review): AdamW is imported from transformers in this notebook - confirm the installed
# transformers version still provides it (torch.optim.AdamW is the PyTorch counterpart).
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Per-step training losses across all epochs.
train_loss_set = []

for _ in trange(EPOCHS, desc="Epoch"):
  start_time = time.time()
  # Put the model in training mode at the start of every epoch.
  model.train()
  # Running loss total and example / step counters for this epoch.
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  for step, batch in enumerate(train_dataloader):

    # Move the (input ids, attention mask, labels) tensors to the target device.
    batch = tuple(t.to(device) for t in batch)
  
    b_input_ids, b_input_mask, b_labels = batch

    # Clear gradients left over from the previous step.
    optimizer.zero_grad()
  
    # Labels are passed in, so the first two outputs are read as loss and logits.
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    
    loss = outputs[0]
    logits = outputs[1]
    train_loss_set.append(loss.item())    
    
    # Backpropagate, then apply the parameter update.
    loss.backward()

    optimizer.step()
    
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1
  end_time = time.time()

  # Elapsed (minutes, seconds) for this epoch.
  print(epoch_time(start_time,end_time))

  # Mean loss per step over the epoch.
  print("\nTrain loss: {}".format(tr_loss/nb_tr_steps))
  # print("\nTrain accuracy : {}".format(100 * correct / total))

Epoch:  20%|██        | 1/5 [08:03<32:15, 483.85s/it]

(8, 3)

Train loss: 0.37594354749117


Epoch:  40%|████      | 2/5 [16:06<24:09, 483.13s/it]

(8, 2)

Train loss: 0.3709780712883303


Epoch:  60%|██████    | 3/5 [24:09<16:06, 483.01s/it]

(8, 2)

Train loss: 0.3706330099482862


Epoch:  80%|████████  | 4/5 [32:12<08:02, 482.93s/it]

(8, 2)

Train loss: 0.368748126810617


Epoch: 100%|██████████| 5/5 [40:14<00:00, 482.96s/it]

(8, 2)

Train loss: 0.3689935308663551





**Validate the model**

In [None]:
# Switch to evaluation mode so dropout is disabled while predicting; the
# training loop leaves the model in training mode.
model.eval()

preds = []        # one tensor of predicted class ids per batch
final_preds = []  # the same predictions flattened to plain ints
with torch.no_grad():
  correct = 0
  total = 0
  for i, batch in enumerate(validation_dataloader):
    batch = tuple(t.to(device) for t in batch)

    b_input_ids, b_input_mask, b_labels = batch

    # No labels are passed, so the first output holds the logits.
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    prediction = torch.argmax(outputs[0],dim=1)
    preds.append(prediction)
    final_preds.extend(prediction.tolist())
    total += b_labels.size(0)
    correct+=(prediction==b_labels).sum().item()

# The validation loader is sequential, so final_preds is aligned with y_test.
print(classification_report(y_test,final_preds))
print("Accuracy", accuracy_score(y_test,final_preds))
print("Precision", precision_score(y_test,final_preds, average="weighted"))
print("Recall", recall_score(y_test,final_preds, average="weighted"))
print("F1-Score", f1_score(y_test,final_preds, average="weighted"))
print("Macro Precision", precision_score(y_test,final_preds, average="macro"))
print("Macro Recall", recall_score(y_test,final_preds, average="macro"))
print("Macro F1", f1_score(y_test,final_preds, average="macro"))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      1232
           1       0.00      0.00      0.00        33
           2       0.00      0.00      0.00        87

    accuracy                           0.91      1352
   macro avg       0.30      0.33      0.32      1352
weighted avg       0.83      0.91      0.87      1352

Accuracy 0.9112426035502958
Precision 0.8303630825251216
Recall 0.9112426035502958
F1-Score 0.8689248355835638
Macro Precision 0.3037475345167653
Macro Recall 0.3333333333333333
Macro F1 0.31785345717234265


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
