<a href="https://colab.research.google.com/github/HimashiRathnayake/CMCS-Text-Classification/blob/main/XLM-R/Train_Task_Adapters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Training Single Task Adapters

### **Parameters**

In [1]:
# Task the adapter is trained for; later cells use it to pick the label column,
# the label names and the adapter name.
technique = "hate speech" #@param ["humor", "hate speech"]
# Oversampling method applied to the training split ("" returns the data unchanged).
over_sampling_technique = "" #@param ["", "ROS","ADASYN", "SMOTE", "BorderlineSMOTE"]
# Only read when over_sampling_technique is "ROS": a single float for "humor",
# otherwise colon-separated per-class multipliers of the class-0 count.
sampling_strategy = "" #@param [] {allow-input: true} 
# e.g. 1:0.25:0.25 for hate speech

### Installation

In [2]:
# !pip install -U adapter-transformers
# !pip install datasets
# !pip install sentencepiece

### Dependencies

In [3]:
from google.colab import drive
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from transformers import XLMRobertaTokenizer, XLMRobertaConfig, XLMRobertaModelWithHeads, TrainingArguments, AdapterTrainer, EvalPrediction, TextClassificationPipeline
import torch
from datasets import load_metric
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE

In [4]:
# Mount Google Drive: the dataset path and the adapter output path used below
# both live under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Dataset Preprocessing

In [5]:
def apply_oversampling(x, y):
  """Rebalance (x, y) with the oversampler selected in the parameters cell.

  Reads the notebook-level settings `over_sampling_technique`, `technique` and
  `sampling_strategy`, and prints the class counts before and after resampling.

  Args:
    x: feature matrix handed to the sampler's `fit_resample`.
    y: class labels aligned with the rows of `x`.

  Returns:
    `(x, y)` unchanged when `over_sampling_technique` is "", otherwise the
    resampled `(X_over, y_over)` pair produced by the sampler.

  Raises:
    ValueError: if `over_sampling_technique` is not one of the supported
      options, or if the "ROS" ratio string does not give one ratio per class.
  """
  _, counts = np.unique(y, axis=0, return_counts=True)
  print("Class Distribution Without Oversampling", counts)

  # define oversampling strategy
  if (over_sampling_technique == ""):
    return x, y
  elif (over_sampling_technique == "ROS"):
    if (technique=="humor"):
      oversample = RandomOverSampler(sampling_strategy = float(sampling_strategy))
    else:
      # One multiplier per class, e.g. "1:0.25:0.25"; every target size is
      # expressed relative to the number of samples in class 0.
      sampling_ratio = sampling_strategy.split(":")
      if len(sampling_ratio) != len(counts):
        raise ValueError(
            "sampling_strategy must contain one ratio per class: got %d ratio(s) for %d class(es)"
            % (len(sampling_ratio), len(counts)))
      oversample = RandomOverSampler(sampling_strategy = {
          class_id: int(counts[0]*float(ratio))
          for class_id, ratio in enumerate(sampling_ratio)
          })
  # NOTE(review): the three samplers below synthesise new samples from the
  # feature values — confirm callers pass numeric features rather than raw text.
  elif (over_sampling_technique == "ADASYN"):
    oversample = ADASYN(sampling_strategy="minority")
  elif (over_sampling_technique == "SMOTE"):
    oversample = SMOTE()
  elif (over_sampling_technique == "BorderlineSMOTE"):
    oversample = BorderlineSMOTE()
  else:
    # Previously this fell through and crashed later with an unbound `oversample`.
    raise ValueError("Unsupported over_sampling_technique: %r" % (over_sampling_technique,))

  # fit and apply the transform
  X_over, y_over = oversample.fit_resample(x, y)

  _, counts = np.unique(y_over, axis=0, return_counts=True)
  print("Class Distribution After Oversampling", counts)

  return X_over, y_over

In [6]:
# CSV corpus on the mounted shared drive; read with pd.read_csv in the next cell.
dataset_path = "/content/drive/Shareddrives/FYP/corpus/çompleted_draft.csv"

In [7]:
# Pick the label column for the selected task; any value other than
# "humor" / "hate speech" falls back to the 'Offensive' column.
label_column = {"humor": "Humor", "hate speech": "Hate_speech"}.get(technique, "Offensive")

# Keep only the sentence text and the chosen label, under uniform column names.
all_data = pd.read_csv(dataset_path)[['Sentence', label_column]]
all_data.columns = ['Sentence', 'Label']

# Replace the raw label values with integer ids; `uniq` keeps the original values.
# NOTE(review): pd.factorize numbers labels in order of first appearance, so the
# ids depend on row order — confirm they line up with the id2label mapping used
# for the classification head.
all_data['Label'], uniq = pd.factorize(all_data['Label'])

# Plain Python lists of sentences and integer labels for splitting and encoding.
X = all_data['Sentence'].values.tolist()
y = all_data['Label'].values.tolist()

In [8]:
# Hold out 10% of the corpus for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.1
)

# The sentences are wrapped into a single-column 2-D array before being handed to
# apply_oversampling and unwrapped into a flat list afterwards. apply_oversampling
# returns its inputs untouched when no technique is selected, so this is safe to
# run either way.
X_train = np.array(X_train).reshape(-1, 1)
X_train, y_train = apply_oversampling(X_train, y_train)
X_train = X_train[:, 0].tolist()

Class Distribution Without Oversampling [11036   314   816]
Class Distribution After Oversampling [11036  2759  2759]


In [9]:
# Load the pretrained tokenizer for the xlm-roberta-base checkpoint.
# NOTE(review): do_lower_case is a BERT-style tokenizer option — confirm that
# XLMRobertaTokenizer actually honors it; if it does not, text is NOT lower-cased.
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base", do_lower_case=True)

In [10]:
# Every sequence is truncated or padded to exactly this many tokens.
MAX_LEN = 128

def encode_batch(batch):
  """Tokenize `batch` with the notebook's tokenizer at a fixed length of MAX_LEN.

  Args:
    batch: the texts to encode, passed straight through to the tokenizer.

  Returns:
    Whatever the tokenizer returns for the batch, truncated and padded to MAX_LEN.
  """
  tokenizer_options = dict(max_length=MAX_LEN, truncation=True, padding="max_length")
  return tokenizer(batch, **tokenizer_options)

In [11]:
# Tokenize both splits with the shared encode_batch helper (train first, then test).
encoded_X_train, encoded_X_test = [encode_batch(texts) for texts in (X_train, X_test)]

In [12]:
class DatasetObject(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-example sequence, and
    `labels` is an indexable sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines how many examples the dataset holds.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, with its label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encoded splits and their labels for the trainer.
train_dataset = DatasetObject(encodings=encoded_X_train, labels=y_train)
test_dataset = DatasetObject(encodings=encoded_X_test, labels=y_test)

### Training

In [13]:
# Number of classes and id -> label-name mapping for the selected task.
if technique == 'humor':
    num_labels = 2
    id2label = {0: "Non-humorous", 1: "Humorous"}
elif technique == 'hate speech':
    num_labels = 3
    id2label = {0: "Not offensive", 1: "Hate-Inducing", 2: "Abusive"}
else:
    # Fail fast: without this branch `num_labels` / `id2label` would be undefined
    # (or stale from an earlier run) when the model and head are built.
    raise ValueError("No label mapping is defined for technique %r" % (technique,))

In [14]:
# Build the xlm-roberta-base configuration with the task's number of labels,
# then load the pretrained model with that configuration.
config = XLMRobertaConfig.from_pretrained("xlm-roberta-base", num_labels=num_labels)
model = XLMRobertaModelWithHeads.from_pretrained("xlm-roberta-base", config=config)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModelWithHeads were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for pr

In [15]:
# The adapter and its classification head share one name derived from the task.
adapter_name = "task_" + technique

# Register the new adapter together with a matching classification head.
model.add_adapter(adapter_name)
model.add_classification_head(adapter_name, num_labels=num_labels, id2label=id2label)

# Activate the adapter for training.
model.train_adapter(adapter_name)

In [16]:
# Hyper-parameters for the adapter training run.
# Optional settings left disabled: logging_steps=200, overwrite_output_dir=True.
training_args = TrainingArguments(
    output_dir="./training_output",
    num_train_epochs=3,
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Important: keep all dataset columns so the labels are properly passed to the model.
    remove_unused_columns=False,
)

def compute_accuracy(p: EvalPrediction):
  """Return {"acc": ...}: the share of examples whose arg-max prediction matches its label.

  Reads `p.predictions` (arg-max taken along axis 1) and `p.label_ids`.
  The trainer in this notebook is configured with compute_metrics, not this helper.
  """
  predicted_ids = np.argmax(p.predictions, axis=1)
  is_correct = predicted_ids == p.label_ids
  return {"acc": is_correct.mean()}

def compute_metrics(eval_pred):
    """Compute accuracy plus weighted- and macro-averaged precision, recall and F1.

    Args:
        eval_pred: a `(logits, labels)` pair; predictions are the arg-max of the
            logits along the last axis.

    Returns:
        Dict with the keys "accuracy", "precision", "recall", "f1" (weighted
        averages) and "macro_precision", "macro_recall", "macro_f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Metrics are loaded on every call, one scorer per metric name.
    scorers = {name: load_metric(name) for name in ("precision", "recall", "f1", "accuracy")}

    def score(name, **options):
        # Run one scorer on the shared predictions and pull out its single value.
        return scorers[name].compute(predictions=predictions, references=labels, **options)[name]

    averaged = ("precision", "recall", "f1")
    results = {"accuracy": score("accuracy")}
    for name in averaged:
        results[name] = score(name, average="weighted")
    for name in averaged:
        results["macro_" + name] = score(name, average="macro")
    return results

# Trainer wired to the adapter-enabled model; the held-out test split doubles as
# the evaluation set and compute_metrics supplies the reported scores.
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [17]:
# Run training with the arguments and datasets given to the trainer above;
# checkpoints are written under training_args.output_dir ("./training_output").
trainer.train()

***** Running training *****
  Num examples = 16554
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1554


Step,Training Loss
500,0.7392
1000,0.5656
1500,0.4998


Saving model checkpoint to ./training_output/checkpoint-500
Configuration saved in ./training_output/checkpoint-500/task_hate speech/adapter_config.json
Module weights saved in ./training_output/checkpoint-500/task_hate speech/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-500/task_hate speech/head_config.json
Module weights saved in ./training_output/checkpoint-500/task_hate speech/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-500/task_hate speech/head_config.json
Module weights saved in ./training_output/checkpoint-500/task_hate speech/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-500/task_hate speech/head_config.json
Module weights saved in ./training_output/checkpoint-500/task_hate speech/pytorch_model_head.bin
Saving model checkpoint to ./training_output/checkpoint-1000
Configuration saved in ./training_output/checkpoint-1000/task_hate speech/adapter_config.json
Module weights saved in ./training_ou

TrainOutput(global_step=1554, training_loss=0.5981188127279589, metrics={'train_runtime': 1491.0865, 'train_samples_per_second': 33.306, 'train_steps_per_second': 1.042, 'total_flos': 3323327732411904.0, 'train_loss': 0.5981188127279589, 'epoch': 3.0})

In [18]:
# Score the trained model on eval_dataset (the held-out test split) using compute_metrics.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1352
  Batch size = 32


{'epoch': 3.0,
 'eval_accuracy': 0.8838757396449705,
 'eval_f1': 0.8965210679806407,
 'eval_loss': 0.31348752975463867,
 'eval_macro_f1': 0.6150233465267023,
 'eval_macro_precision': 0.5707390518497717,
 'eval_macro_recall': 0.7364296452675686,
 'eval_precision': 0.9164654593871178,
 'eval_recall': 0.8838757396449705,
 'eval_runtime': 20.6061,
 'eval_samples_per_second': 65.612,
 'eval_steps_per_second': 2.087}

In [19]:
# torch reports no device index (None) on CPU, while the pipeline's integer
# `device` argument uses -1 for CPU — map None to -1 instead of passing it on.
device_index = training_args.device.index
classifier = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=-1 if device_index is None else device_index,
)

# Quick smoke test on a short, emoji-bearing input.
classifier("Lol😅")

[{'label': 'Not offensive', 'score': 0.7908138036727905}]

In [20]:
# Save the adapter registered as "task_<technique>" into a per-task folder on the shared drive.
model.save_adapter("/content/drive/Shareddrives/FYP-CodeStars/Implementation/TrainedAdapters/task_adapter_"+technique, "task_"+technique)

Configuration saved in /content/drive/Shareddrives/FYP-CodeStars/Implementation/TrainedAdapters/task_adapter_hate speech/adapter_config.json
Module weights saved in /content/drive/Shareddrives/FYP-CodeStars/Implementation/TrainedAdapters/task_adapter_hate speech/pytorch_adapter.bin
Configuration saved in /content/drive/Shareddrives/FYP-CodeStars/Implementation/TrainedAdapters/task_adapter_hate speech/head_config.json
Module weights saved in /content/drive/Shareddrives/FYP-CodeStars/Implementation/TrainedAdapters/task_adapter_hate speech/pytorch_model_head.bin


In [21]:
# torch reports no device index (None) on CPU, while the pipeline's integer
# `device` argument uses -1 for CPU — map None to -1 instead of passing it on.
device_index = training_args.device.index
classifier = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=-1 if device_index is None else device_index,
)

# Sanity check after saving: the in-memory model should still classify as before.
classifier("This is great!")

[{'label': 'Not offensive', 'score': 0.9991517066955566}]

## Load Adapters

In [None]:
# config = XLMRobertaConfig.from_pretrained(
#     "xlm-roberta-base",
#     num_labels=2,
# )

# model = XLMRobertaModelWithHeads.from_pretrained(
#     "xlm-roberta-base",
#     config=config,
# )

In [None]:
# model.load_adapter("/content/drive/Shareddrives/FYP-CodeStars/Implementation/TrainedAdapters/task_adapter_hate speech", with_head=False)
# model.set_active_adapters("task_"+technique)

In [None]:
# tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base", do_lower_case=True)
# classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)

In [None]:
# classifier("This is awesome!")