<a href="https://colab.research.google.com/github/santarabantoosoo/Omdena-seniment-analysis/blob/fine_tunning_models/Models/AraBERTv2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#installing dependencies

In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4
Mon Jun  7 21:34:44 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P8    10W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------

In [2]:
!pip install optuna==2.3.0
!pip install transformers==4.2.1
!pip install farasapy
!pip install pyarabic
!git clone https://github.com/aub-mind/arabert

Collecting optuna==2.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/87/10/06b58f4120f26b603d905a594650440ea1fd74476b8b360dbf01e111469b/optuna-2.3.0.tar.gz (258kB)
[K     |█▎                              | 10kB 25.8MB/s eta 0:00:01[K     |██▌                             | 20kB 33.2MB/s eta 0:00:01[K     |███▉                            | 30kB 36.7MB/s eta 0:00:01[K     |█████                           | 40kB 37.4MB/s eta 0:00:01[K     |██████▍                         | 51kB 34.6MB/s eta 0:00:01[K     |███████▋                        | 61kB 35.6MB/s eta 0:00:01[K     |████████▉                       | 71kB 32.2MB/s eta 0:00:01[K     |██████████▏                     | 81kB 32.9MB/s eta 0:00:01[K     |███████████▍                    | 92kB 34.4MB/s eta 0:00:01[K     |████████████▊                   | 102kB 34.2MB/s eta 0:00:01[K     |██████████████                  | 112kB 34.2MB/s eta 0:00:01[K     |███████████████▏                | 122kB 34.2MB/s 

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Creating training datasets

In [4]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
all_datasets= []

In [5]:
class Dataset:
    def __init__(
        self,
        name,
        train,
        test,
        label_list,
    ):
        self.name = name
        self.train = train
        self.test = test
        self.label_list = label_list

In [6]:
DATA_COLUMN = "cleaned_text"
LABEL_COLUMN = "Class_camel"

##HARD - Balanced

In [13]:
%%bash 
cd drive/MyDrive/omdena/Final_Dataset/Dataset

ls

test.csv
train.csv
val.csv


In [15]:
df = pd.read_csv('drive/MyDrive/omdena/Final_Dataset/Dataset/train.csv')

df = df[[DATA_COLUMN, LABEL_COLUMN]]  # we are interested in rating and review only
# df_HARD.columns = [DATA_COLUMN, LABEL_COLUMN]
print(df[LABEL_COLUMN].value_counts())


positive    57096
negative    33702
neutral     20124
Name: Class_camel, dtype: int64


In [16]:
val_df=pd.read_csv('drive/MyDrive/omdena/Final_Dataset/Dataset/val.csv')
val_df = val_df[[DATA_COLUMN, LABEL_COLUMN]]

In [17]:
# code rating as +ve if > 3, -ve if less, no 3s in dataset
categories = ['neutral', 'negative', 'positive'] #classes present in the data
data_Hard = Dataset("HARD", df, val_df, categories)
all_datasets.append(data_Hard)

#Trainer

In [18]:
from arabert.preprocess import ArabertPreprocessor
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
import logging
import torch
import optuna 

In [19]:
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

In [20]:
for x in all_datasets:
  print(x.name)

HARD


You can choose which model, and dataset from here along with the max sentence length

In [21]:
dataset_name = 'HARD'
model_name = 'aubmindlab/bert-base-arabertv2'
task_name = 'classification'
max_len = 256

In [22]:
for d in all_datasets:
  if d.name==dataset_name:
    selected_dataset = d
    print('Dataset found')
    break

Dataset found


In [23]:
arabert_prep = ArabertPreprocessor(model_name.split("/")[-1])

selected_dataset.train[DATA_COLUMN] = selected_dataset.train[DATA_COLUMN].apply(arabert_prep.preprocess)
selected_dataset.test[DATA_COLUMN] = selected_dataset.test[DATA_COLUMN].apply(arabert_prep.preprocess)  





100%|██████████| 241M/241M [00:20<00:00, 12.1MiB/s]



100%|██████████| 241M/241M [00:39<00:00, 12.1MiB/s]

In [24]:
selected_dataset.test[DATA_COLUMN]

0                               حقوق ال+ مراه في ال+ إسلام
1                اومن حقوق ال+ مراه +ان +ك تفتحلها ال+ باب
2        نعم ال+ مراه في ال+ إسلام منتقص +ه ال+ حقوق و+...
3        محاول +ات بائس +ه ل+ اقصاء دور ال+ أم السعوديه...
4                                       ناصر حقوق ال+ مراه
                               ...                        
27793    رحم الله شهداء ال+ واجب ال+ وطني علي حدود +نا ...
27794                   باقي أيام اللهم بلغ +نا رمضان أمين
27795                                 الله حي ال+ تاني جاي
27796    عبد ال+ منعم أبو ال+ فتوح في علمتني ال+ حياه ا...
27797    عبدالمنعم عطي +ه عضو مجلس ادار +ه نادي دمنهورو...
Name: cleaned_text, Length: 27798, dtype: object

In [26]:
class BERTDataset(Dataset):
    def __init__(self, text, target, model_name, max_len, label_map):
      super(BERTDataset).__init__()
      self.text = text
      self.target = target
      self.tokenizer_name = model_name
      self.tokenizer = AutoTokenizer.from_pretrained(model_name)
      self.max_len = max_len
      self.label_map = label_map
      

    def __len__(self):
      return len(self.text)

    def __getitem__(self,item):
      text = str(self.text[item])
      text = " ".join(text.split())


        
      input_ids = self.tokenizer.encode(
          text,
          add_special_tokens=True,
          max_length=self.max_len,
          truncation='longest_first'
      )     
    
      attention_mask = [1] * len(input_ids)

      # Zero-pad up to the sequence length.
      padding_length = self.max_len - len(input_ids)
      input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
      attention_mask = attention_mask + ([0] * padding_length)    
      
      return InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=self.label_map[self.target[item]])

In [27]:
label_map = { v:index for index, v in enumerate(selected_dataset.label_list) }
print(label_map)
train_dataset = BERTDataset(selected_dataset.train[DATA_COLUMN].to_list(),selected_dataset.train[LABEL_COLUMN].to_list(),model_name,max_len,label_map)
test_dataset = BERTDataset(selected_dataset.test[DATA_COLUMN].to_list(),selected_dataset.test[LABEL_COLUMN].to_list(),model_name,max_len,label_map)

{'neutral': 0, 'negative': 1, 'positive': 2}


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=384.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=719993.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2306039.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=611.0, style=ProgressStyle(description_…




In [28]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=len(label_map))

In [29]:
def compute_metrics(p): #p should be of type EvalPrediction
  preds = np.argmax(p.predictions, axis=1)
  assert len(preds) == len(p.label_ids)
  #print(classification_report(p.label_ids,preds))
  #print(confusion_matrix(p.label_ids,preds))

  macro_f1_pos_neg = f1_score(p.label_ids,preds,average='macro',labels=[0,1])
  macro_f1 = f1_score(p.label_ids,preds,average='macro')
  macro_precision = precision_score(p.label_ids,preds,average='macro')
  macro_recall = recall_score(p.label_ids,preds,average='macro')
  acc = accuracy_score(p.label_ids,preds)
  return {
      'macro_f1' : macro_f1,
      'macro_f1_pos_neg' : macro_f1_pos_neg,  
      'macro_precision': macro_precision,
      'macro_recall': macro_recall,
      'accuracy': acc
  }

#Regular Training

This paert allows you to do a regular training with no hyper parameter optimization

In [30]:
training_args = TrainingArguments("./train")
training_args.evaluate_during_training = True
training_args.adam_epsilon = 1e-8
training_args.learning_rate = 5e-5
training_args.fp16 = True
training_args.per_device_train_batch_size = 16
training_args.per_device_eval_batch_size = 16
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 8


steps_per_epoch = (len(selected_dataset.train)// (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps))
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)
#Warmup_ratio
warmup_ratio = 0.1
training_args.warmup_steps = total_steps*warmup_ratio # or you can set the warmup steps directly 

training_args.evaluation_strategy = EvaluationStrategy.EPOCH
# training_args.logging_steps = 200
training_args.save_steps = 100000 #don't want to save any model, there is probably a better way to do this :)
training_args.seed = 42
training_args.disable_tqdm = False
training_args.lr_scheduler_type = 'cosine'

3466
27728


In [31]:
trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=543490667.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

In [32]:
trainer.train()

Epoch,Training Loss,Validation Loss,Macro F1,Macro F1 Pos Neg,Macro Precision,Macro Recall,Accuracy,Runtime,Samples Per Second
0,0.4609,0.432618,0.786237,0.738651,0.817406,0.770362,0.828153,173.9428,159.811
1,0.351,0.385425,0.817006,0.776875,0.822494,0.813098,0.847975,173.8482,159.898


Epoch,Training Loss,Validation Loss,Macro F1,Macro F1 Pos Neg,Macro Precision,Macro Recall,Accuracy,Runtime,Samples Per Second
0,0.4609,0.432618,0.786237,0.738651,0.817406,0.770362,0.828153,173.9428,159.811
1,0.351,0.385425,0.817006,0.776875,0.822494,0.813098,0.847975,173.8482,159.898
2,0.2172,0.468155,0.819539,0.779189,0.82045,0.818765,0.84909,174.0568,159.706
3,0.1302,0.574959,0.819778,0.780156,0.824439,0.816205,0.850277,173.9572,159.798


KeyboardInterrupt: ignored

In [33]:
trainer.save_model('drive/MyDrive/omdena/Saved_models/AraBERTv2/')

In [34]:
import os

In [35]:
if not os.path.exists('drive/MyDrive/omdena/Saved_models/AraBERTv2/'):
  os.mkdir('drive/MyDrive/omdena/Saved_models/AraBERTv2/')

In [36]:
test_df=pd.read_csv('drive/MyDrive/omdena/Final_Dataset/Dataset/test.csv')
test_df = val_df[[DATA_COLUMN, LABEL_COLUMN]]

In [37]:
test_df[DATA_COLUMN] = test_df[DATA_COLUMN].apply(arabert_prep.preprocess)



[2021-06-08 00:24:38,452 - farasapy_logger - ERROR]: pipe broke! error code and message: [[Errno 32] Broken pipe]. reinitailize the process.., This may take sometime depending on the running task


In [38]:
test_dataset = BERTDataset(test_df[DATA_COLUMN].to_list(),test_df[LABEL_COLUMN].to_list(),model_name,max_len,label_map)


In [39]:
trainer.evaluate(test_dataset)

Epoch,Training Loss,Validation Loss,Macro F1,Macro F1 Pos Neg,Macro Precision,Macro Recall,Accuracy,Runtime,Samples Per Second
0,0.4609,0.432618,0.786237,0.738651,0.817406,0.770362,0.828153,173.9428,159.811
1,0.351,0.385425,0.817006,0.776875,0.822494,0.813098,0.847975,173.8482,159.898
2,0.2172,0.468155,0.819539,0.779189,0.82045,0.818765,0.84909,174.0568,159.706
3,0.1302,0.574959,0.819778,0.780156,0.824439,0.816205,0.850277,173.9572,159.798
4,0.1302,0.869043,0.793284,0.750514,0.788328,0.799241,0.823656,174.7801,159.046


{'eval_accuracy': 0.8236563781567019,
 'eval_loss': 0.8690428137779236,
 'eval_macro_f1': 0.7932839264625002,
 'eval_macro_f1_pos_neg': 0.7505136227234928,
 'eval_macro_precision': 0.7883280377213934,
 'eval_macro_recall': 0.7992410216293049,
 'eval_runtime': 174.7801,
 'eval_samples_per_second': 159.046}