In [None]:
# don't forget to zip the dataset folder then upload it
!unzip "/content/Dataset.zip" 

Archive:  /content/Dataset.zip
   creating: Dataset/
  inflating: Dataset/dev.csv         
  inflating: Dataset/train.csv       


In [1]:
!pip install arabert
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import re
import random
from tqdm import tqdm
import numpy as np
# Can use this library to reload a specific module if the notebook can't see changes in the imported module
import importlib
import utils
from arabert.preprocess import ArabertPreprocessor
from bert import AraBERTDataset, BertClassifier, train

Stance detection labels meaning is as follows:
1. Positive (1): means that the tweet author encourages and supports vaccination.
2. Negative (-1): means that the tweet author refuses vaccination.
3. Neutral (0): means that the tweet neither supports nor refuses vaccination.

Category labels meaning is as follows:
1. Info_News: Information about vaccination.
2. Celebrities: mentioning celebrities taking vaccinations.
3. Plan: Governmental plan or progress of vaccination.
4. Request: Requests from governments regarding the vaccination process.
5. Rumor: the tweet is a rumor.
6. Advice: Advice related to the virus or the vaccination
7. Restriction: Restrictions due to the virus e.g. traveling.
8. Personal: Personal opinion or story about vaccination.
9. Unrelated: Unrelated to vaccination.
10.Others: Vaccination related but not one of the above.

In [3]:
t = pd.read_csv('/content/Dataset/train.csv')
d = pd.read_csv('/content/Dataset/dev.csv')

t.head()

Unnamed: 0,text,category,stance
0,بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الاب...,celebrity,1
1,وزير الصحة لحد اليوم وتحديدا هلأ بمؤتمروا الصح...,info_news,1
2,قولكن رح يكونو اد المسؤولية ب لبنان لما يوصل ...,info_news,1
3,#تركيا.. وزير الصحة فخر الدين قوجة يتلقى أول ج...,celebrity,1
4,وئام وهاب يشتم الدول الخليجية في كل طلة اعلامي...,personal,0


## AraBERT model
- We will use aubmindlab/bert-base-arabertv02-twitter
- It's pre-trained on ~60M arabic tweets, and it encluded emojis in the training. 

#### Preprocess Data for Bert

In [4]:
# seperate train data from their labels
X = t['text']

Ys = t['stance']
# needed to map classes_ids from -1, 0, 1 to 2, 0, 1 since model gave error when received a negative label.
Ys = Ys.replace(utils.stance_classes_reverse)
Ys = Ys.replace(utils.stance_classes)

Yc = t['category']
Yc = Yc.replace(utils.category_classes)

# seperate validation data from their labels
X_val = d['text']

Ys_val = d['stance']
Ys_val = Ys_val.replace(utils.stance_classes_reverse)
Ys_val = Ys_val.replace(utils.stance_classes)

Yc_val = d['category']
Yc = Yc_val.replace(utils.category_classes)

#### Create Bert

In [5]:
import torch
model_name="aubmindlab/bert-base-arabertv02-twitter"

# load the preprocessing function they used to train their data on.
arabert_prep = ArabertPreprocessor(model_name=model_name)
X = X.apply(arabert_prep.preprocess)

# instantiate train and validation datasets
train_dataset = AraBERTDataset(X, Ys, model_name)
val_dataset = AraBERTDataset(X_val, Ys_val, model_name)

# create model
model = BertClassifier(model_name, n_classes=3)

# make the criterion cross entropy loss
criterion = torch.nn.CrossEntropyLoss()
# create the optimizer (Adam)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)


Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02-twitter were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dens

#### Train Bert

In [6]:
# Trianing for stance detection
train(model, train_dataset, val_dataset, criterion=criterion, optimizer=optimizer, classes_names=list(utils.stance_classes.keys()), n_classes=3, epochs=20)

Epochs: 1 | train Loss: 0.03308677300810814 | train Accuracy: 0.7959358900973097

Classification Report:               precision    recall  f1-score   support

     Neutral     0.5600    0.0830    0.1446      1012
    Positive     0.8017    0.9892    0.8856      5538
    Negative     0.0000    0.0000    0.0000       438

    accuracy                         0.7959      6988
   macro avg     0.4539    0.3574    0.3434      6988
weighted avg     0.7164    0.7959    0.7228      6988




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epochs: 1 | val Loss: 0.029323460534214973 | val Accuracy: 0.816

Classification Report:               precision    recall  f1-score   support

     Neutral     0.7619    0.1270    0.2177       126
    Positive     0.8172    0.9950    0.8974       804
    Negative     0.0000    0.0000    0.0000        70

    accuracy                         0.8160      1000
   macro avg     0.5264    0.3740    0.3717      1000
weighted avg     0.7530    0.8160    0.7489      1000




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epochs: 2 | train Loss: 0.029982255771756172 | train Accuracy: 0.7979393245563824

Classification Report:               precision    recall  f1-score   support

     Neutral     0.5122    0.1868    0.2737      1012
    Positive     0.8139    0.9727    0.8862      5538
    Negative     0.0000    0.0000    0.0000       438

    accuracy                         0.7979      6988
   macro avg     0.4420    0.3865    0.3867      6988
weighted avg     0.7192    0.7979    0.7420      6988




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epochs: 2 | val Loss: 0.028736717998981476 | val Accuracy: 0.814

Classification Report:               precision    recall  f1-score   support

     Neutral     0.7083    0.1349    0.2267       126
    Positive     0.8166    0.9913    0.8955       804
    Negative     0.0000    0.0000    0.0000        70

    accuracy                         0.8140      1000
   macro avg     0.5083    0.3754    0.3741      1000
weighted avg     0.7458    0.8140    0.7485      1000


Epochs: 3 | train Loss: 0.02892691269516945 | train Accuracy: 0.8103892386949055

Classification Report:               precision    recall  f1-score   support

     Neutral     0.6291    0.2480    0.3558      1012
    Positive     0.8213    0.9771    0.8925      5538
    Negative     1.0000    0.0023    0.0046       438

    accuracy                         0.8104      6988
   macro avg     0.8168    0.4091    0.4176      6988
weighted avg     0.8047    0.8104    0.7591      6988




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epochs: 3 | val Loss: 0.027989454567432404 | val Accuracy: 0.824

Classification Report:               precision    recall  f1-score   support

     Neutral     0.6481    0.2778    0.3889       126
    Positive     0.8340    0.9813    0.9017       804
    Negative     0.0000    0.0000    0.0000        70

    accuracy                         0.8240      1000
   macro avg     0.4941    0.4197    0.4302      1000
weighted avg     0.7522    0.8240    0.7740      1000


Epochs: 4 | train Loss: 0.028521854430437088 | train Accuracy: 0.8095306239267316

Classification Report:               precision    recall  f1-score   support

     Neutral     0.5875    0.2885    0.3870      1012
    Positive     0.8283    0.9662    0.8920      5538
    Negative     0.4516    0.0320    0.0597       438

    accuracy                         0.8095      6988
   macro avg     0.6225    0.4289    0.4462      6988
weighted avg     0.7698    0.8095    0.7667      6988




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epochs: 4 | val Loss: 0.02920948714017868 | val Accuracy: 0.819

Classification Report:               precision    recall  f1-score   support

     Neutral     0.6944    0.1984    0.3086       126
    Positive     0.8237    0.9876    0.8982       804
    Negative     0.0000    0.0000    0.0000        70

    accuracy                         0.8190      1000
   macro avg     0.5060    0.3953    0.4023      1000
weighted avg     0.7497    0.8190    0.7610      1000


Epochs: 5 | train Loss: 0.027911396697163582 | train Accuracy: 0.8129650829994276

Classification Report:               precision    recall  f1-score   support

     Neutral     0.6208    0.2895    0.3949      1012
    Positive     0.8286    0.9688    0.8932      5538
    Negative     0.5610    0.0525    0.0960       438

    accuracy                         0.8130      6988
   macro avg     0.6701    0.4369    0.4614      6988
weighted avg     0.7817    0.8130    0.7711      6988




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epochs: 5 | val Loss: 0.02930890955030918 | val Accuracy: 0.813

Classification Report:               precision    recall  f1-score   support

     Neutral     0.5424    0.2540    0.3459       126
    Positive     0.8300    0.9714    0.8951       804
    Negative     0.0000    0.0000    0.0000        70

    accuracy                         0.8130      1000
   macro avg     0.4574    0.4085    0.4137      1000
weighted avg     0.7356    0.8130    0.7633      1000


Epochs: 6 | train Loss: 0.02762594446539879 | train Accuracy: 0.8138236977676016

Classification Report:               precision    recall  f1-score   support

     Neutral     0.6347    0.2816    0.3901      1012
    Positive     0.8283    0.9698    0.8935      5538
    Negative     0.5636    0.0708    0.1258       438

    accuracy                         0.8138      6988
   macro avg     0.6756    0.4407    0.4698      6988
weighted avg     0.7837    0.8138    0.7725      6988


Epochs: 6 | val Loss: 0.029149074107408524 