# Installing dependencies

In [None]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB
Sat May 29 12:00:19 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    27W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------

In [None]:
# Version-pinned installs.
!pip install optuna==2.3.0
!pip install transformers==4.2.1
# Unpinned installs. NOTE(review): consider pinning these too so reruns resolve the same versions.
!pip install farasapy
!pip install pyarabic
# Clone the AraBERT repository; `arabert.preprocess` is imported from it further down.
!git clone https://github.com/aub-mind/arabert

Collecting optuna==2.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/87/10/06b58f4120f26b603d905a594650440ea1fd74476b8b360dbf01e111469b/optuna-2.3.0.tar.gz (258kB)
[K     |█▎                              | 10kB 22.7MB/s eta 0:00:01[K     |██▌                             | 20kB 18.1MB/s eta 0:00:01[K     |███▉                            | 30kB 15.2MB/s eta 0:00:01[K     |█████                           | 40kB 13.6MB/s eta 0:00:01[K     |██████▍                         | 51kB 7.7MB/s eta 0:00:01[K     |███████▋                        | 61kB 7.4MB/s eta 0:00:01[K     |████████▉                       | 71kB 8.4MB/s eta 0:00:01[K     |██████████▏                     | 81kB 8.9MB/s eta 0:00:01[K     |███████████▍                    | 92kB 9.1MB/s eta 0:00:01[K     |████████████▊                   | 102kB 7.4MB/s eta 0:00:01[K     |██████████████                  | 112kB 7.4MB/s eta 0:00:01[K     |███████████████▏                | 122kB 7.4MB/s eta 0:00

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV files and the model output folder are read/written under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Creating training datasets

In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
# Registry of the Dataset objects defined in this notebook; entries are later looked up by name.
all_datasets= []

In [None]:
class Dataset:
    """Plain container pairing a dataset name with its two splits and label names.

    The constructor arguments are stored unchanged as same-named attributes;
    nothing is copied or validated.
    """

    def __init__(self, name, train, test, label_list):
        # Identifier and training split.
        self.name, self.train = name, train
        # Evaluation split and the list of label names.
        self.test, self.label_list = test, label_list

In [None]:
# Name of the CSV column holding the input text.
DATA_COLUMN = "cleaned_text"
# Name of the CSV column holding the class label.
LABEL_COLUMN = "Class_camel"

## HARD - Balanced

In [None]:
# Read the training split and keep only the text and label columns.
df = pd.read_csv(
    '/content/drive/MyDrive/Omdena_sentiment/Dataset/train.csv'
)[[DATA_COLUMN, LABEL_COLUMN]]

# Show the number of rows per class.
label_counts = df[LABEL_COLUMN].value_counts()
print(label_counts)


positive    57096
negative    33702
neutral     20124
Name: Class_camel, dtype: int64


In [None]:
# Read the validation split, restricted to the text and label columns.
val_df = pd.read_csv(
    '/content/drive/MyDrive/Omdena_sentiment/Dataset/val.csv'
)[[DATA_COLUMN, LABEL_COLUMN]]

In [None]:
# Label names; their position in this list later becomes the integer class id.
categories = ['neutral', 'negative', 'positive'] #classes present in the data
# Register the frames under the name "HARD"; the validation frame fills the `test` slot.
data_Hard = Dataset("HARD", df, val_df, categories)
all_datasets.append(data_Hard)

# Trainer

In [None]:
from arabert.preprocess import ArabertPreprocessor
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
import logging
import torch
import optuna 

In [None]:
# Configure root logging at WARNING level.
logging.basicConfig(level=logging.WARNING)
# Logger named after this module.
logger = logging.getLogger(__name__)

In [None]:
# Print the name of every registered dataset, one per line.
for registered in all_datasets:
    print(registered.name)

HARD


Choose the model, the dataset, and the maximum sentence length here.

In [None]:
# Name of the registered dataset to train on (compared against Dataset.name).
dataset_name = 'HARD'
# Pretrained checkpoint identifier, shared by the preprocessor, tokenizer and model.
# NOTE(review): the recorded Trainer output mentions aubmindlab/bert-base-arabertv02 — confirm which checkpoint produced the saved results.
model_name = 'aubmindlab/bert-large-arabertv02'
task_name = 'classification'
# Token length every example is truncated/padded to.
max_len = 128

In [None]:
# Pick the registered dataset whose name equals `dataset_name`.
for d in all_datasets:
    if d.name == dataset_name:
        selected_dataset = d
        print('Dataset found')
        break
else:
    # No match: fail here rather than leave `selected_dataset` undefined
    # (or stale from an earlier run) for the cells that follow.
    raise ValueError(f"Dataset '{dataset_name}' not found in all_datasets")

Dataset found


In [None]:
# Build the preprocessor from the checkpoint name with the organisation prefix removed.
arabert_prep = ArabertPreprocessor(model_name.split("/")[-1])

# Run the preprocessor over the text column of both splits, overwriting the column in place.
for split_frame in (selected_dataset.train, selected_dataset.test):
    split_frame[DATA_COLUMN] = split_frame[DATA_COLUMN].apply(arabert_prep.preprocess)

In [None]:
# Inspect the preprocessed text column of the evaluation split.
selected_dataset.test[DATA_COLUMN]

0                                   حقوق المراه في الاسلام
1                       اومن حقوق المراه انك تفتحلها الباب
2        نعم المراه في الاسلام منتقصه الحقوق والوااجبات...
3        محاولات بائسه لاقصاء دور الام السعوديه ودورها ...
4                                         ناصر حقوق المراه
                               ...                        
27793    رحم الله شهداء الواجب الوطني علي حدودنا وادعو ...
27794                     باقي ايام اللهم بلغنا رمضان امين
27795                                   الله حي التاني جاي
27796    عبد المنعم ابو الفتوح في علمتني الحياه الجزء ا...
27797    عبدالمنعم عطيه عضو مجلس اداره نادي دمنهوروزراء...
Name: cleaned_text, Length: 27798, dtype: object

In [None]:
class BERTDataset(Dataset):
    """Dataset that tokenizes text on access and returns fixed-length features.

    Args:
        text: indexable collection of raw texts.
        target: indexable collection of labels, aligned with ``text``.
        model_name: checkpoint name passed to ``AutoTokenizer.from_pretrained``.
        max_len: length every example is truncated and padded to.
        label_map: mapping from label value to integer class id.

    Raises:
        ValueError: if ``text`` and ``target`` have different lengths.
    """

    def __init__(self, text, target, model_name, max_len, label_map):
        # Zero-argument super() initialises the base class for this instance;
        # the one-argument form super(BERTDataset) only builds an unbound proxy.
        super().__init__()

        # Fail fast on misaligned inputs instead of hitting an IndexError mid-epoch.
        if len(text) != len(target):
            raise ValueError(
                "text and target must have the same length "
                f"(got {len(text)} and {len(target)})"
            )

        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        """Return the number of examples."""
        return len(self.text)

    def __getitem__(self, item):
        """Return the ``InputFeatures`` for the example at index ``item``."""
        # Collapse every run of whitespace into a single space.
        text = " ".join(str(self.text[item]).split())

        input_ids = self.tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation='longest_first',
        )
        attention_mask = [1] * len(input_ids)

        # Right-pad the ids with the pad token and the mask with zeros up to max_len.
        padding_length = self.max_len - len(input_ids)
        input_ids = input_ids + [self.tokenizer.pad_token_id] * padding_length
        attention_mask = attention_mask + [0] * padding_length

        return InputFeatures(
            input_ids=input_ids,
            attention_mask=attention_mask,
            label=self.label_map[self.target[item]],
        )

In [None]:
# Give each label name an integer id equal to its position in label_list.
label_map = {label: idx for idx, label in enumerate(selected_dataset.label_list)}
print(label_map)

# Wrap the training split: text column, label column, tokenizer checkpoint, fixed length, label ids.
train_dataset = BERTDataset(
    selected_dataset.train[DATA_COLUMN].to_list(),
    selected_dataset.train[LABEL_COLUMN].to_list(),
    model_name,
    max_len,
    label_map,
)
# Same wrapping for the evaluation split.
test_dataset = BERTDataset(
    selected_dataset.test[DATA_COLUMN].to_list(),
    selected_dataset.test[LABEL_COLUMN].to_list(),
    model_name,
    max_len,
    label_map,
)

{'neutral': 0, 'negative': 1, 'positive': 2}


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=384.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=824793.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2642362.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=381.0, style=ProgressStyle(description_…




In [None]:
def model_init():
    """Load `model_name` as a sequence-classification model with one output per entry in `label_map`."""
    num_classes = len(label_map)
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        return_dict=True,
        num_labels=num_classes,
    )

In [None]:
def compute_metrics(p):
    """Compute macro-averaged classification metrics for one evaluation pass.

    Args:
        p: prediction object (expected to be an ``EvalPrediction``) exposing
            ``predictions`` (per-class scores, one row per example) and
            ``label_ids`` (true class ids).

    Returns:
        dict with ``macro_f1``, ``macro_f1_pos_neg``, ``macro_precision``,
        ``macro_recall`` and ``accuracy``.

    Raises:
        KeyError: if ``label_map`` has no 'negative' or 'positive' entry.
    """
    # Predicted class = index of the highest score in each row.
    preds = np.argmax(p.predictions, axis=1)
    assert len(preds) == len(p.label_ids)

    # Restrict the pos/neg score to the ids of those two classes. A hard-coded
    # [0, 1] selects 'neutral' and 'negative' under this notebook's label_map.
    pos_neg_ids = [label_map['negative'], label_map['positive']]

    macro_f1_pos_neg = f1_score(p.label_ids, preds, average='macro', labels=pos_neg_ids)
    macro_f1 = f1_score(p.label_ids, preds, average='macro')
    macro_precision = precision_score(p.label_ids, preds, average='macro')
    macro_recall = recall_score(p.label_ids, preds, average='macro')
    acc = accuracy_score(p.label_ids, preds)
    return {
        'macro_f1': macro_f1,
        'macro_f1_pos_neg': macro_f1_pos_neg,
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'accuracy': acc,
    }

# Regular Training

This part runs a regular training with no hyperparameter optimization.

In [None]:
# Start from the library defaults with "./train" as output directory, then override fields.
training_args = TrainingArguments("./train")
# NOTE(review): evaluation scheduling is set through evaluation_strategy below;
# confirm whether this flag is still read by the pinned transformers version.
training_args.evaluate_during_training = True
training_args.adam_epsilon = 1e-8
training_args.learning_rate = 5e-5
training_args.fp16 = True
training_args.per_device_train_batch_size = 16
training_args.per_device_eval_batch_size = 16
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 8


# Optimizer steps per epoch = examples // (batch size * accumulation steps).
steps_per_epoch = len(selected_dataset.train) // (
    training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
)
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)
#Warmup_ratio
warmup_ratio = 0.1
# warmup_steps is a step count, so truncate the product to an int instead of storing a float.
training_args.warmup_steps = int(total_steps * warmup_ratio) # or you can set the warmup steps directly

# Evaluate once at the end of every epoch.
training_args.evaluation_strategy = EvaluationStrategy.EPOCH
# training_args.logging_steps = 200
training_args.save_steps = 100000 #don't want to save any model, there is probably a better way to do this :)
training_args.seed = 42
training_args.disable_tqdm = False
training_args.lr_scheduler_type = 'cosine'

3466
27728


In [None]:
# Collect the Trainer inputs: a freshly loaded model, the configured arguments,
# the training dataset, and the evaluation dataset scored with compute_metrics.
trainer_config = dict(
    model=model_init(),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=543490667.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

In [None]:
# Run fine-tuning with the configured arguments; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Macro F1,Macro F1 Pos Neg,Macro Precision,Macro Recall,Accuracy,Runtime,Samples Per Second
0,0.457,0.421886,0.7945,0.749709,0.811993,0.782996,0.831499,253.9733,109.452
1,0.3471,0.419059,0.811983,0.769516,0.826872,0.806472,0.845888,253.735,109.555
2,0.2168,0.464089,0.820282,0.781177,0.823453,0.81744,0.849522,254.1328,109.384
3,0.127,0.558509,0.821143,0.7815,0.820236,0.823832,0.849054,254.1423,109.38
4,0.0802,0.672345,0.822831,0.78345,0.836231,0.812335,0.854054,254.1126,109.392
5,0.0426,0.872644,0.823913,0.783926,0.833597,0.81626,0.855529,254.1337,109.383
6,0.0249,0.957822,0.825621,0.786594,0.834959,0.818218,0.85668,254.135,109.383
7,0.0101,1.004895,0.82572,0.786448,0.832914,0.819526,0.856213,254.1301,109.385


TrainOutput(global_step=27728, training_loss=0.17174688206455713, metrics={'train_runtime': 25518.5519, 'train_samples_per_second': 1.087, 'total_flos': 184270884933528576, 'epoch': 8.0})

In [None]:
# Write the fine-tuned model to the Google Drive output folder.
# NOTE(review): the cell that creates this folder comes after this one — confirm the intended order.
trainer.save_model("/content/drive/MyDrive/Omdena_sentiment/Saved_models/AraBERT/")

In [None]:
import os

In [None]:
# Make sure the model output folder exists. makedirs also creates missing parent
# folders, and exist_ok=True makes the call a no-op when the folder is already there.
os.makedirs('/content/drive/MyDrive/Omdena_sentiment/Saved_models/AraBERT/', exist_ok=True)

In [None]:
# Read the held-out test split.
test_df=pd.read_csv('/content/drive/MyDrive/Omdena_sentiment/Dataset/test.csv')
# Keep the text and label columns of test_df itself; selecting from val_df here
# would silently swap the test data for the validation split.
test_df = test_df[[DATA_COLUMN, LABEL_COLUMN]]

In [None]:
# Run the AraBERT preprocessor over the text column of test_df, overwriting it in place.
test_df[DATA_COLUMN] = test_df[DATA_COLUMN].apply(arabert_prep.preprocess)



In [None]:
# Wrap test_df for evaluation. Note: this rebinds `test_dataset`, the name earlier given to the dataset passed to the Trainer as eval_dataset.
test_dataset = BERTDataset(test_df[DATA_COLUMN].to_list(),test_df[LABEL_COLUMN].to_list(),model_name,max_len,label_map)


In [None]:
# Score the trained model on `test_dataset`; the metrics dict is shown as the cell result.
trainer.evaluate(test_dataset)

{'epoch': 8.0,
 'eval_accuracy': 0.8562126771710195,
 'eval_loss': 1.004894733428955,
 'eval_macro_f1': 0.8257200735663509,
 'eval_macro_f1_pos_neg': 0.7864475386082397,
 'eval_macro_precision': 0.8329137531542009,
 'eval_macro_recall': 0.8195258109194047,
 'eval_runtime': 254.1375,
 'eval_samples_per_second': 109.382}