# PyTorch setup: use the GPU if one is available

In [2]:
import torch

# Pick the device PyTorch should run on: CUDA when a GPU is visible, otherwise the CPU.
if not torch.cuda.is_available():
    # No CUDA device detected, so fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and the name of GPU 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 2 GPU(s) available.
We will use the GPU: Tesla T4


# Installing the necessary packages

In [4]:
! pip install arabert
! pip install transformers
! pip install farasapy
! pip install pyarabic

Collecting arabert
  Downloading arabert-1.0.1-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Collecting emoji==1.4.2
  Downloading emoji-1.4.2.tar.gz (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m185.0/185.0 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25ldone
[?25h  Created wheel for emoji: filename=emoji-1.4.2-py3-none-any.whl size=186469 sha256=5a240060c0f8b886d68142bdb17451d401ad78fa23aebb82b5b7568e01964b8d
  Stored in directory: /root/.cache/pip/wheels/e4/61/e7/2fc1ac8f306848fc66c6c013ab511f0a39ef4b1825b11363b2
Successfully built emoji
Installing collected packages: emoji, farasapy, arabert
  Attempting uninstall: e

In [5]:
import pandas as pd
import numpy as np

In [7]:
# Display the training frame (pandas truncates the rendering of large frames).
# NOTE(review): `dataset` is not created in any cell visible above, and the execution
# counts jump from In [5] to In [7] — the data-loading cell appears to be missing, so
# this cell fails on a fresh kernel. TODO: restore the loading cell.
dataset

Unnamed: 0,text,dialect
0,ياخي المدرب أختاره والمدرب دخله والمدرب بارك ا...,AE
1,شو الي قاعد يجري فالنصر يا أخوه خسر المباراة س...,AE
2,الي يبحث عن مشكلة الوصل راح يحصلها فالجولان,AE
3,انا مش معترض على تغير عامر الي دخل مكان عامر ا...,AE
4,تراجع مخيف في مستوى الحارس الكبير ماجد ناصر مش...,AE
...,...,...
363107,هل لك أن تسمح بالتقاط صورة لك معنا,MSA
363108,هل لديك سمك مقلي,MSA
363109,هل يمكني استئجار مرشدا يتحدث الغة اليابانية,MSA
363110,أيهما تعتقد أنها أفضل كهدية لصبي عمره ثماني سنوات,MSA


In [8]:
# Dialect / country code -> integer class id.
# The position of each code in the list below IS its class id, so the order must not change.
map_label = {
    code: index
    for index, code in enumerate([
        'EG', 'SY', 'PL', 'KW', 'LB', 'LY', 'JO', 'DZ', 'QA', 'AE',
        'BH', 'SA', 'OM', 'MA', 'IQ', 'TN', 'SD', 'YE', 'MSA',
    ])
}

# Integer class id -> dialect / country code.
# Derived from map_label so the two mappings can never drift apart.
label_map = {index: code for code, index in map_label.items()}

In [9]:
from arabert.preprocess import ArabertPreprocessor
# Build the AraBERT preprocessor that the text-cleaning cells below apply to every row.
model_name = "bert-base-arabert"
arabert_prep = ArabertPreprocessor(
    model_name=model_name,
)



100%|██████████| 241M/241M [00:25<00:00, 9.37MiB/s] 




In [12]:
# Give both frames the same column names: 'tweet' -> 'text' and 'country' -> 'dialect'.
dataset, test_data = (
    frame.rename(columns={'tweet': 'text', 'country': 'dialect'})
    for frame in (dataset, test_data)
)

In [18]:
# Run the AraBERT preprocessor over every training text (the bound method is passed directly).
dataset["text"] = dataset["text"].apply(arabert_prep.preprocess)

In [19]:

# Run the AraBERT preprocessor over every evaluation text (the bound method is passed directly).
test_data["text"] = test_data["text"].apply(arabert_prep.preprocess)

# Importing the necessary packages

In [20]:
from arabert.preprocess import ArabertPreprocessor
from sklearn.metrics import (accuracy_score, f1_score,recall_score)
from torch.utils.data import  Dataset
from transformers import (AutoConfig, AutoModelForSequenceClassification,
                        AutoTokenizer, BertTokenizer, Trainer,
                        TrainingArguments)
from transformers.data.processors.utils import InputFeatures

In [21]:
# Checkpoint to fine-tune. Alternatives noted by the original author:
#   asafaya/bert-base-arabic
#   UBC-NLP/ARBERT
#   UBC-NLP/MARBERT
#   bert-base-multilingual-uncased
model_name = 'aubmindlab/bert-base-arabert'

# num_labels: size of the classification head (map_label has 19 entries).
# max_length: presumably the maximum token sequence length — TODO confirm where it is consumed.
num_labels, max_length = 19, 120

## To work with PyTorch, we need a classification dataset class that loads the data

In [23]:
class ClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        text: indexable collection of raw texts; each item is passed through ``str``.
        target: indexable collection of labels, aligned with ``text`` by position.
        model_name: name or path handed to ``AutoTokenizer.from_pretrained``.
        max_len: length every encoded example is padded / truncated to.
        label_map: label mapping; stored on the instance but not used by this class.
    """

    def __init__(self, text, target, model_name, max_len, label_map):
        # Zero-argument super() initialises the Dataset base class on this instance.
        # The previous ``super(ClassificationDataset).__init__()`` only initialised an
        # unbound super object and never reached the parent initialiser.
        super().__init__()

        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        # A tokenizer is loaded for every dataset instance.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        """Return the number of examples (one per entry of ``text``)."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize example ``item`` and return it as ``InputFeatures`` with its label."""
        text = str(self.text[item])
        # Collapse every run of whitespace (including newlines) into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        return InputFeatures(**inputs, label=self.target[item])

In [24]:

# Replace each dialect code with its integer class id from map_label.
# Rows whose code has no entry in map_label are dropped by the null filter in the next cell.
dataset['dialect'] = dataset['dialect'].map(map_label)
test_data['dialect'] = test_data['dialect'].map(map_label)


In [25]:
# Keep only the rows whose dialect label is present (non-null).
test_data = test_data[test_data['dialect'].notna()]
dataset = dataset[dataset['dialect'].notna()]

In [26]:
# With the null rows removed, store the class ids as integers.
dataset['dialect'] = dataset['dialect'].astype(int)
test_data['dialect'] = test_data['dialect'].astype(int)

In [27]:
# Sanity check after label encoding: row count, non-null counts and column dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 363112 entries, 0 to 363111
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   text     363112 non-null  object
 1   dialect  363112 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 8.3+ MB


## Creating datasets

In [29]:
# Wrap the training and evaluation frames in ClassificationDataset objects.
# The sequence length now comes from `max_length` (set in the configuration cell);
# the previous `max_len` is not defined in any visible cell and raised a NameError
# on a fresh kernel.
# NOTE(review): confirm 120 matches the value the recorded run actually used.
train_dataset = ClassificationDataset(
    dataset['text'].to_list(),
    dataset['dialect'].to_list(),
    model_name,
    max_length,
    map_label
)
test_dataset = ClassificationDataset(
    test_data['text'].to_list(),
    test_data['dialect'].to_list(),
    model_name,
    max_length,
    map_label
)

Downloading:   0%|          | 0.00/637 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/700k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.15M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

## Create a function that returns a pretrained model ready for classification

In [30]:
def model_init():
    """Load a sequence-classification model from the configured checkpoint.

    Reads the module-level ``model_name`` and ``num_labels`` and returns whatever
    ``AutoModelForSequenceClassification.from_pretrained`` produces for them.
    """
    pretrained_kwargs = {'return_dict': True, 'num_labels': num_labels}
    return AutoModelForSequenceClassification.from_pretrained(model_name, **pretrained_kwargs)

## Metrics

In [31]:
def compute_metrics(p):
    """Compute macro F1, accuracy and macro recall for one evaluation pass.

    Args:
        p: prediction container (expected to be an ``EvalPrediction``) exposing
            ``predictions`` and ``label_ids``.

    Returns:
        dict with the keys ``macro_f1``, ``accuracy`` and ``recall``.
    """
    gold = p.label_ids
    # The predicted class is the index of the highest score along axis 1.
    predicted = np.argmax(p.predictions, axis=1)
    assert len(predicted) == len(gold)

    f1_macro = f1_score(gold, predicted, average='macro')
    recall_macro = recall_score(gold, predicted, average='macro')
    accuracy = accuracy_score(gold, predicted)

    return dict(macro_f1=f1_macro, accuracy=accuracy, recall=recall_macro)

## Training arguments

In [32]:
training_args = TrainingArguments(
    # Where checkpoints are written, and which reporting integrations to use (none).
    output_dir="./train",
    report_to=[],

    # Optimisation.
    learning_rate=2e-5,
    adam_epsilon=1e-8,
    warmup_ratio=0,
    num_train_epochs=15,
    fp16=True,  # author's note: enable this when using a V100 or T4 GPU

    # Batching. Author's note: up to 64 per device fits on 16GB with a max length of 128.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,  # scales the batch size without needing more memory

    # Evaluate and checkpoint once per epoch, keeping at most 10 checkpoints.
    do_eval=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=10,

    # Reload the best checkpoint when training ends; "best" means lowest eval_loss.
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)


## Creating the trainer

In [33]:
# Assemble the Trainer: a freshly loaded model, the training arguments, the
# training / evaluation datasets and the metric callback.
trainer = Trainer(
    args=training_args,
    model=model_init(),
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

Downloading:   0%|          | 0.00/518M [00:00<?, ?B/s]

Some weights of the model checkpoint at aubmindlab/bert-base-arabert were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

# Training

In [34]:
# Run fine-tuning. The recorded run below took about 39,700 s (train_runtime) for 15 epochs.
trainer.train()

***** Running training *****
  Num examples = 363112
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 42555


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy,Recall
1,1.588,1.676584,0.481332,0.485869,0.48782
2,1.3463,1.517412,0.536832,0.535255,0.538757
3,1.189,1.447096,0.569164,0.568084,0.572601
4,1.0576,1.381334,0.597243,0.604339,0.608423
5,0.9413,1.327311,0.62165,0.62632,0.630599
6,0.8366,1.283142,0.644666,0.652583,0.657747
7,0.7446,1.300745,0.654798,0.661148,0.665859
8,0.655,1.287195,0.664775,0.669997,0.674801
9,0.5854,1.315942,0.673219,0.679989,0.685364
10,0.5216,1.305158,0.684211,0.692264,0.697427


***** Running Evaluation *****
  Num examples = 3503
  Batch size = 32
Saving model checkpoint to ./train/checkpoint-2837
Configuration saved in ./train/checkpoint-2837/config.json
Model weights saved in ./train/checkpoint-2837/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3503
  Batch size = 32
Saving model checkpoint to ./train/checkpoint-5674
Configuration saved in ./train/checkpoint-5674/config.json
Model weights saved in ./train/checkpoint-5674/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3503
  Batch size = 32
Saving model checkpoint to ./train/checkpoint-8511
Configuration saved in ./train/checkpoint-8511/config.json
Model weights saved in ./train/checkpoint-8511/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3503
  Batch size = 32
Saving model checkpoint to ./train/checkpoint-11348
Configuration saved in ./train/checkpoint-11348/config.json
Model weights saved in ./train/checkpoint-11348/pytorch_model.bin
***** Running

TrainOutput(global_step=42555, training_loss=0.7771059808741164, metrics={'train_runtime': 39708.0262, 'train_samples_per_second': 137.168, 'train_steps_per_second': 1.072, 'total_flos': 3.359297960323488e+17, 'train_loss': 0.7771059808741164, 'epoch': 15.0})

## Saving the model

In [None]:

# Store the id <-> label mappings on the model config before saving the model.
# Author's note: a model from one of the saved checkpoints can be chosen instead.
trainer.model.config.id2label = label_map
trainer.model.config.label2id = map_label

# Write the model and its tokenizer to the same directory.
trainer.save_model("./model")
train_dataset.tokenizer.save_pretrained("./model")