In [None]:
# # From paper
# !pip install farasapy==0.0.14
# !pip install transformers==4.12.2
# !git clone https://github.com/aub-mind/arabert
# !pip install pyarabic==0.6.14
# !pip install sentencepiece==0.1.96
# !pip install emoji==1.6.1

In [None]:
import torch
import pandas as pd
import numpy as np
from typing import List
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split

In [3]:
# Names of the two CSV columns used throughout the notebook:
# `tx` holds the input text, `st` holds the stance label.
tx, st = "text", "stance"

In [None]:
# model from hugging face
# trained on twitter data
model_name = 'aubmindlab/bert-base-arabertv02-twitter'

# Numeric stance codes found in the CSV files -> string class names.
# Deliberately not called `map`: that name would shadow the builtin `map`
# for every later cell of the kernel session.
stance_names = {
    1: 'POS',
    0: 'NEU',
    -1: 'NEG',
}

train = pd.read_csv("./Dataset/train.csv")
train = train[[tx, st]]

# Class balance of the raw numeric codes, shown before they are renamed.
print(train[st].value_counts())
# Direct dict indexing keeps the KeyError on an unexpected code
# (Series.map(dict) would silently produce NaN instead).
train[st] = train[st].apply(lambda code: stance_names[code])

# The dev split is bound to `test`; it is what the Trainer evaluates on.
test = pd.read_csv('./Dataset/dev.csv')
test = test[[tx, st]]
test[st] = test[st].apply(lambda code: stance_names[code])

print(test.head())

# Distinct class names in order of first appearance in the dev split;
# len(label_list) sizes the classification head in model_init().
label_list = test[st].unique()
print(label_list)

# Training

Start the training procedure

In [4]:
import numpy as np
import torch
import random
import matplotlib.pyplot as plt
import copy

from arabert.preprocess import ArabertPreprocessor
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from torch.utils.data import DataLoader, Dataset
from transformers import (AutoConfig, AutoModelForSequenceClassification,
                          AutoTokenizer, BertTokenizer, Trainer,
                          TrainingArguments)
from transformers.data.processors.utils import InputFeatures

Create and apply preprocessing using the AraBERT processor

In [None]:
# Preprocessor configured from the same checkpoint name as the model.
arabic_prep = ArabertPreprocessor(model_name)

# Overwrite the text column of both splits with its preprocessed form.
# The bound method is passed directly; no wrapper lambda is needed.
train[tx] = train[tx].apply(arabic_prep.preprocess)
test[tx] = test[tx].apply(arabic_prep.preprocess)

In [None]:
# Sanity check: show the first ten training texts after preprocessing.
print(train[tx].head(10))

Now we need to check the tokenized sentence length to decide on the maximum sentence length value

In [None]:
# Tokenizer loaded from the `model_name` checkpoint; the following cells use it
# to measure tokenized sentence lengths.
tok = AutoTokenizer.from_pretrained(model_name)

In [None]:
# One histogram of token counts per split (2-token-wide bins from 0 up to 126).
for heading, frame in (("Training Sentence Lengths: ", train),
                       ("Testing Sentence Lengths: ", test)):
    print(heading)
    token_counts = [len(tok.tokenize(sentence)) for sentence in frame[tx].to_list()]
    plt.hist(token_counts, bins=range(0,128,2))
    plt.show()

In [None]:
# Maximum tokenized length; longer sequences count as "truncated" below.
# NOTE(review): tok.tokenize() does not include special tokens, so if the
# dataset adds [CLS]/[SEP] inside the same budget the real cut-off is slightly
# lower than this diagnostic suggests -- confirm.
max_len = 90


def _token_lengths(texts):
    """Return the number of tokens `tok` produces for each text, in order."""
    return [len(tok.tokenize(sentence)) for sentence in texts]


# Tokenize each split exactly once and reuse the lengths for every statistic.
train_lengths = _token_lengths(train[tx].to_list())
test_lengths = _token_lengths(test[tx].to_list())

print("Truncated training sequences: ", sum(length > max_len for length in train_lengths))

print("Truncated testing sequences: ", sum(length > max_len for length in test_lengths))

# Per-class count of over-length training rows. Lengths and labels are paired
# positionally, so this does not depend on the frame having a 0..n-1 index.
dic = {}
for length, label in zip(train_lengths, train[st].to_list()):
    if length > max_len:
        dic[label] = dic.get(label, 0) + 1
print(dic, ':(')

Create a function that returns a pretrained model ready for classification

In [None]:
def model_init():
    """Load a fresh sequence-classification model from the `model_name` checkpoint.

    Reads the module-level `model_name` and `label_list`; the classification
    head gets one output per entry of `label_list`.

    Returns:
        The model returned by `AutoModelForSequenceClassification.from_pretrained`.
    """
    num_classes = len(label_list)
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        return_dict=True,
        num_labels=num_classes,
    )

Define whatever metric you want here

In [None]:
def compute_metrics(p): #p should be of type EvalPrediction
  """Compute macro-F1 and accuracy for one evaluation pass.

  Args:
    p: Object exposing `predictions` (per-class scores, one row per example)
       and `label_ids` (the reference class ids).

  Returns:
    Dict with keys 'macro_f1' and 'accuracy'.
  """
  y_true = p.label_ids
  # Predicted class = index of the highest score in each row.
  y_pred = np.argmax(p.predictions, axis=1)
  assert len(y_pred) == len(y_true)
  return {
      'macro_f1': f1_score(y_true, y_pred, average='macro'),
      'accuracy': accuracy_score(y_true, y_pred),
  }

In [None]:
def set_seed(seed=42):
  """Seed the Python, NumPy and PyTorch random generators and configure cuDNN.

  Args:
    seed: Integer seed handed to every generator (default 42).
  """
  # Python, NumPy, torch (CPU), then the current and all CUDA devices.
  seeders = (
      random.seed,
      np.random.seed,
      torch.manual_seed,
      torch.cuda.manual_seed,
      torch.cuda.manual_seed_all,
  )
  for apply_seed in seeders:
    apply_seed(seed)

  # Turn off cuDNN auto-tuning and request deterministic algorithms.
  torch.backends.cudnn.benchmark = False
  torch.backends.cudnn.deterministic = True

# Iterations

Ref:
https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments

In [None]:
# All fine-tuning hyper-parameters in one place, then expanded into TrainingArguments.
training_config = dict(
    output_dir="./train",
    adam_epsilon=1e-8,
    learning_rate=1e-5,
    fp16=False,  # enable this when using V100 or T4 GPU
    per_device_train_batch_size=16,  # up to 64 on 16GB with max len of 128
    per_device_eval_batch_size=128,
    gradient_accumulation_steps=2,  # use this to scale batch size without needing more memory
    num_train_epochs=5,
    warmup_ratio=0,
    do_eval=True,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,  # this allows to automatically get the best model at the end based on whatever metric we want
    metric_for_best_model='macro_f1',  # key returned by compute_metrics()
    greater_is_better=True,
    seed=1,
)
training_args = TrainingArguments(**training_config)

# Seed the RNGs with the same seed the Trainer is configured with.
set_seed(training_args.seed)

Create the trainer

In [None]:
class ClassificationDataset(Dataset):
    """Map-style dataset that tokenizes (text, label) pairs on access.

    The Trainer cannot consume the raw `train` / `test` DataFrames: its
    DataLoader indexes the dataset with integers and expects model-ready
    features. The constructor signature matches the calls made in the
    K-fold cell further down.

    Args:
        text: List of input texts.
        target: List of class names, aligned with `text`.
        model_name: Checkpoint name used to load the tokenizer.
        max_len: Every item is padded / truncated to exactly this many tokens.
        label_map: Dict mapping class name -> integer class id.
    """

    def __init__(self, text, target, model_name, max_len, label_map):
        super().__init__()
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # Collapse runs of whitespace before tokenizing.
        text = " ".join(str(self.text[item]).split())
        encoded = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',  # fixed length, so the default collator can stack items
            truncation=True,
        )
        features = dict(encoded)
        # The Trainer's default collator turns the 'label' key into the `labels` batch tensor.
        features['label'] = self.label_map[self.target[item]]
        return features


# Class name -> id, in the order of `label_list`.
label_map = {label: index for index, label in enumerate(label_list)}

train_dataset = ClassificationDataset(train[tx].to_list(), train[st].to_list(),
                                      model_name, max_len, label_map)
# `test` holds the dev split at this point; it is used for per-epoch evaluation.
val_dataset = ClassificationDataset(test[tx].to_list(), test[st].to_list(),
                                    model_name, max_len, label_map)

trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    compute_metrics = compute_metrics,
)

In [None]:
# Start the training run configured by `training_args` on the Trainer built above.
trainer.train()

### Saving The Model

In [None]:
# Class name <-> id maps stored in the model config so the saved model reports
# readable labels. Derived from `label_list`; this must be the same mapping the
# training dataset used to encode its labels.
label_map = {label: index for index, label in enumerate(label_list)}
inv_label_map = {index: label for label, index in label_map.items()}
print(inv_label_map)
trainer.model.config.label2id = label_map
trainer.model.config.id2label = inv_label_map

# Output directory for the model weights, config and tokenizer files.
fold = "arabert02_official_batch32_somesampling_658972f1"
trainer.save_model(fold)
# `tok` was loaded from the same `model_name` checkpoint, so it is the tokenizer
# that belongs with this model; saving it next to the weights lets `pipeline`
# load both from one directory.
tok.save_pretrained(fold)

##### Saving to Drive

In [None]:
# copy the model to drive
# from google.colab import drive
# drive.mount('/content/drive')

# !cp -r arabert02_official_original_652f1 /content/drive/MyDrive
# !ls '/content/drive/MyDrive'

# Prediction

#### Load Model

In [None]:
from google.colab import drive
from transformers import pipeline

# initialize pipeline
# NOTE(review): the mount point is './drive' while the model is read from
# '/content/drive/...'; the two only coincide when the working directory is
# /content -- confirm.
drive.mount('./drive')

# Directory of the fine-tuned model on Drive. Kept under its own name so that
# `model_name` (the hub checkpoint read by model_init() and the K-fold cell)
# is not silently replaced by an already fine-tuned model.
saved_model_dir = "/content/drive/MyDrive/NLP/arabert02_official_original_652f1"
# Hub checkpoint name, used below to rebuild the preprocessor and tokenizer.
hf_name = "aubmindlab/bert-base-arabertv02-twitter"
# device=0 selects the first GPU; return_all_scores=False asks for only the top label.
pipe = pipeline("sentiment-analysis", model=saved_model_dir, device=0, return_all_scores=False)

### Dev set, make sure model is saved and loaded correctly

In [None]:
test = pd.read_csv('./dev.csv')
test = test[[tx, st]]
# Numeric stance codes -> class names. Not named `map`, which would shadow the builtin.
stance_names = {
    1: 'POS',
    0: 'NEU',
    -1: 'NEG',
}
# Direct indexing keeps the KeyError on an unexpected code.
test[st] = test[st].apply(lambda code: stance_names[code])
# label_list = ['NEG', 'NEU', 'POS']
label_list = test[st].unique()
print(test.head())
print(test[st].value_counts())
print(label_list)

# Preprocessor and tokenizer rebuilt from the hub checkpoint name.
arabic_prep = ArabertPreprocessor(hf_name)
tok = AutoTokenizer.from_pretrained(hf_name)

In [None]:
# try pipe
# (pipe('sad')[0]['label'])

def predict(text):
  """Return the label of the first result the global `pipe` gives for `text`.

  Args:
    text: Input passed unchanged to `pipe` (no preprocessing is applied here).

  Returns:
    The 'label' field of the first element returned by `pipe`.
  """
  first_result = pipe(text)[0]
  return first_result['label']

# Smoke test of predict() on a short input; as the cell's last expression its result is displayed.
predict('sad')

In [None]:
from sklearn import metrics

X = test
labels = X['stance']
# One pipeline call per row, in row order, so `pred` lines up with `labels`.
pred = [predict(txi) for txi in X.text]

print(metrics.classification_report(labels, pred))

In [None]:
# From here on `test` is the test split read from ./test.csv (it replaces the dev frame).
test = pd.read_csv('./test.csv')
# The numeric-code -> name dict that used to be defined here was never read in
# this cell and shadowed the builtin `map`, so it is gone; the prediction cell
# defines the reverse mapping it needs itself.
label_list = ['NEG', 'NEU', 'POS']
print(test.head())
print(label_list)

# Preprocessor and tokenizer rebuilt from the hub checkpoint name.
arabic_prep = ArabertPreprocessor(hf_name)
tok = AutoTokenizer.from_pretrained(hf_name)

#### Test set

In [None]:
X = test

# Class name -> numeric stance code written to the output file.
mp = {
    "POS": 1,
    "NEU": 0,
    "NEG": -1
}

# (id, numeric stance) per row. The text goes to predict() as-is; the variant
# that ran arabic_prep.preprocess() on it first is intentionally not used here.
pred = [(X.id[i], mp[predict(txi)]) for i, txi in enumerate(X.text)]

print(pred[:15])

#### CSV output

In [None]:
import csv
import os

out_path = './out/test_stance_NO_preprocess.csv'
# open(..., "w") does not create missing directories; make sure ./out exists first.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# newline='' is what the csv module expects so it controls line endings itself.
with open(out_path, "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(('id','stance'))
    writer.writerows(pred)

# Read the file back as a sanity check of what was written.
dd = pd.read_csv(out_path)
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
dd.info()
print(dd.head())

print(dd.stance.value_counts())

In [None]:
# Scratch experiment: compare the pipeline output on raw vs. preprocessed text.
text = "بحمد الله تم أخذ الجرعة الأولى من #لقاح_كورونا https://t.co/cWiFmKfKhV"
# Prediction on the raw text.
print(pipe(text))

# Same text after the AraBERT preprocessor.
p = arabic_prep.preprocess(text)
print(p)

# Prediction on the preprocessed text.
print(pipe(p))

# Tokens the tokenizer produces for the preprocessed text.
tk = tok.tokenize(p)
print(tk)

# `tk` is a list of token strings, so this passes a list to the pipeline --
# presumably each token is classified as a separate input (TODO confirm).
pipe(tk)

In [None]:
# Scratch probe: run the pipeline on a one-element list holding a short string.
pipe(["كو"])

In [None]:
# !rm -rf train

In [None]:
# This cell originally held a bare undefined name, which halts "Run All" with a
# NameError -- presumably a deliberate barrier before the K-fold section.
# The stop is kept, but made explicit and self-explanatory.
raise RuntimeError(
    "Stop: the K-fold / ensemble cells below are optional and must be run manually."
)

# K-fold

This section is a bit more advanced.

We will divide the training set into K folds and train the model with cross-validation to find the best hyper-parameters before checking the performance on the test set.

Alternatively, you can combine the training and testing set if you are participating in a competition, then ensemble the output models

In [None]:
# Do k-fold on the training split only, so performance can still be checked on
# held-out data afterwards. `train` is the preprocessed training DataFrame
# loaded at the top of the notebook (`selected_dataset` is not defined in it).
# reset_index(drop=True) returns a new frame with a 0..n-1 index instead of
# mutating `train` in place.
kfold_dataset = train.reset_index(drop=True)
# Alternative (mainly for competitions): fold over all labelled data, leaving
# no data to check final performance on. Needs a frame that still has labels.
# kfold_dataset = pd.concat([train, <labelled dev frame>]).reset_index(drop=True)

In [None]:
# this is used later
# Class name -> id, in the order of `label_list`; defined here so the K-fold
# cells do not rely on a `label_map` left over from another cell.
label_map = {label: index for index, label in enumerate(label_list)}
# id -> class name, written into each fold's model config.
inv_label_map = {index: label for label, index in label_map.items()}

Define the number of stratified K-fold splits

In [None]:
from sklearn.model_selection import StratifiedKFold

# Five shuffled, class-stratified folds; the fixed random_state makes the split repeatable.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

Train using cross validation and save the best model at each fold

In [None]:
import glob
import shutil

all_results = []   # one evaluation dict per fold
fold_best_f1 = 0   # best eval macro-F1 seen so far
best_fold = None   # fold number that achieved it

# The split arrays are named *_idx so the `train` DataFrame is not overwritten.
# Stratification uses the stance column (`st`); texts come from the `tx` column.
for fold_num, (train_idx, dev_idx) in enumerate(kf.split(kfold_dataset, kfold_dataset[st])):
  print("**************************Starting Fold Num: ", fold_num," **************************")

  # kf.split() yields positional indices, hence .iloc.
  train_dataset = ClassificationDataset(kfold_dataset[tx].iloc[train_idx].to_list(),
                              kfold_dataset[st].iloc[train_idx].to_list(),
                              model_name,
                              max_len,
                              label_map)

  val_dataset = ClassificationDataset(kfold_dataset[tx].iloc[dev_idx].to_list(),
                              kfold_dataset[st].iloc[dev_idx].to_list(),
                              model_name,
                              max_len,
                              label_map)

  training_args = TrainingArguments(
    output_dir= f"./train_{fold_num}",
    adam_epsilon = 1e-8,
    learning_rate = 2e-5,
    fp16 = False,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 128,
    gradient_accumulation_steps = 2,
    num_train_epochs= 2,
    warmup_ratio = 0,
    do_eval = True,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    metric_for_best_model = 'macro_f1',
    greater_is_better = True,
    seed = 123
  )

  set_seed(training_args.seed)

  # A fresh model per fold, so no weights carry over between folds.
  trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
  )
  trainer.model.config.label2id = label_map
  trainer.model.config.id2label = inv_label_map

  trainer.train()

  results = trainer.evaluate()
  all_results.append(results)
  print(results)

  # Save the model together with the tokenizer the dataset object carries.
  trainer.save_model(f"./train_{fold_num}/best_model")
  val_dataset.tokenizer.save_pretrained(f"./train_{fold_num}/best_model")

  # delete the rest of the checkpoints
  # (the previous `!rm -rf f"..."` passed the f-prefix and quotes literally to
  # the shell, so the glob never matched and nothing was removed)
  for checkpoint_dir in glob.glob(f"./train_{fold_num}/checkpoint-*"):
    shutil.rmtree(checkpoint_dir, ignore_errors=True)

  if results['eval_macro_f1'] > fold_best_f1:
    print('**************************New Best Model Found!**************************')
    fold_best_f1 = results['eval_macro_f1']
    best_fold = fold_num

In [None]:
# Display the list of per-fold evaluation dicts collected by the K-fold loop.
all_results

In [None]:
from statistics import mean

# Average eval macro-F1 over the folds; shown as the cell's output.
mean(fold_result['eval_macro_f1'] for fold_result in all_results)

After checking for the best hyper parameters you should use the regular training section and retrain the model with the parameters that you had here.

Or Ensemble the models together.

## Ensemble all the cross validation models

In [None]:
from transformers import pipeline
import more_itertools

In [None]:
# id -> class name, the inverse of `label_map`.
# NOTE(review): `label_map` must already exist in the kernel, and none of the
# ensemble cells visible below read `inv_label_map` -- confirm whether this
# cell is still needed.
inv_label_map = { v:k for k, v in label_map.items()}

Load some file which has text that we need to run inference on. 
I will use the test set for that

In [None]:
# Alternative input: texts from another frame, run through the preprocessor first.
# pred_df = prediction['Text']
# pred_df = pred_df.apply(lambda x:   arabic_prep.preprocess(x))

# Texts the ensemble runs inference on.
# NOTE(review): `selected_dataset` and `DATA_COLUMN` are not defined in any cell
# visible in this notebook; they look like leftovers from the upstream template.
# Point this at the intended frame and its text column -- TODO confirm which.
pred_df = selected_dataset.test[DATA_COLUMN]

In [None]:
# One column per fold model; each cell holds that model's scores for one text.
texts = list(pred_df)
cross_val_df = pd.DataFrame([])
for i in range(5):
  # Note: this rebinds the global `pipe` that predict() reads.
  pipe = pipeline(
      "sentiment-analysis",
      model=f"train_{i}/best_model",
      device=0,
      return_all_scores=True,
      max_length=max_len,
      truncation=True,
  )
  preds = []
  # batching for faster inference
  for s in tqdm(more_itertools.chunked(texts, 32)):
    preds.extend(pipe(s))
  cross_val_df[f'model_{i}'] = preds

In [None]:
from collections import defaultdict

# Only the per-model columns take part in the vote. Selecting them explicitly
# keeps this cell re-runnable after the 'preds' / 'sentiment_score' columns
# have been added to cross_val_df.
model_columns = [col for col in cross_val_df.columns if str(col).startswith('model_')]

final_labels = []   # winning class name per text
final_scores = []   # its averaged score
for _, row in cross_val_df[model_columns].iterrows():
  # Sum each class's score over all models for this text.
  total_score = defaultdict(float)
  for pred in row:
    for cls in pred:
      total_score[cls['label']] += cls['score']

  # Average over however many models contributed (no hard-coded count).
  avg_score = { k: v / len(model_columns) for k, v in total_score.items()}

  best_label = max(avg_score, key=avg_score.get)
  final_labels.append(best_label)
  final_scores.append(avg_score[best_label])

In [None]:
# Attach the ensemble's winning label and its averaged score as new columns.
cross_val_df['preds'] = final_labels
cross_val_df['sentiment_score'] = final_scores

In [None]:
# Distribution of the ensemble's predicted labels.
cross_val_df['preds'].value_counts()

In [None]:
# Per-class precision / recall / F1 of the ensemble predictions against the reference labels.
# NOTE(review): `selected_dataset` and `LABEL_COLUMN` are not defined in any cell
# visible in this notebook; the labels must come from the same frame, in the
# same row order, as the texts in `pred_df` -- TODO confirm.
print(classification_report(selected_dataset.test[LABEL_COLUMN],cross_val_df['preds']))