In [16]:
# You'll only need this cell if you run this notebook on colab
# don't forget to zip the dataset folder then upload it
# !rm -rf Dataset
# !unzip "/content/Dataset.zip" 

Archive:  /content/Dataset.zip
   creating: Dataset/
  inflating: Dataset/classification_train_sample1.csv  
  inflating: Dataset/dev.csv         
  inflating: Dataset/stance_train_sample1.csv  
  inflating: Dataset/train.csv       


In [None]:
!pip install arabert
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import re
import random
from tqdm import tqdm
import numpy as np
# Can use this library to reload a specific module if the notebook can't see changes in the imported module
import importlib
import utils
from arabert.preprocess import ArabertPreprocessor
from bert import AraBERTDataset, BertClassifier, train

  from .autonotebook import tqdm as notebook_tqdm


Stance detection labels meaning is as follows:
1. Positive (1): means that the tweet author encourages and supports vaccination.
2. Negative (-1): means that the tweet author refuses vaccination.
3. Neutral (0): means that the tweet neither supports nor refuses vaccination.

Category labels meaning is as follows:
1. Info_News: Information about vaccination.
2. Celebrities: mentioning celebrities taking vaccinations.
3. Plan: Governmental plan or progress of vaccination.
4. Request: Requests from governments regarding the vaccination process.
5. Rumor: the tweet is a rumor.
6. Advice: Advice related to the virus or the vaccination
7. Restriction: Restrictions due to the virus e.g. traveling.
8. Personal: Personal opinion or story about vaccination.
9. Unrelated: Unrelated to vaccination.
10.Others: Vaccination related but not one of the above.

In [4]:
t = pd.read_csv('./Dataset/stance_train_sample1.csv')
d = pd.read_csv('./Dataset/dev.csv')

t.head()

Unnamed: 0,text,stance
0,بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الاب...,1
1,وزير الصحة لحد اليوم وتحديدا هلأ بمؤتمروا الصح...,1
2,قولكن رح يكونو اد المسؤولية ب لبنان لما يوصل ...,1
3,#تركيا.. وزير الصحة فخر الدين قوجة يتلقى أول ج...,1
4,وئام وهاب يشتم الدول الخليجية في كل طلة اعلامي...,0


In [5]:
# analyze dataset
print(t.info())
# d
# print(d.info())

# count for stance labels
print("####################################")
print("counts for each stance label :")
print(t['stance'].value_counts(normalize=True))
# d
print(d['stance'].value_counts(normalize=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16614 entries, 0 to 16613
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16614 non-null  object
 1   stance  16614 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 259.7+ KB
None
####################################
counts for each stance label :
 1    0.333333
 0    0.333333
-1    0.333333
Name: stance, dtype: float64
 1    0.804
 0    0.126
-1    0.070
Name: stance, dtype: float64


## AraBERT model
- We will use aubmindlab/bert-base-arabertv02-twitter
- It's pre-trained on ~60M arabic tweets, and it encluded emojis in the training. 

#### Preprocess Data for Bert

In [6]:
# seperate train data from their labels
X = t['text']

Ys = t['stance']
# needed to map classes_ids from -1, 0, 1 to 2, 0, 1 since model gave error when received a negative label.
Ys = Ys.replace(utils.stance_classes_reverse)
Ys = Ys.replace(utils.stance_classes)

# Yc = t['category']
# Yc = Yc.replace(utils.category_classes)

# seperate validation data from their labels
X_val = d['text']

Ys_val = d['stance']
Ys_val = Ys_val.replace(utils.stance_classes_reverse)
Ys_val = Ys_val.replace(utils.stance_classes)

# Yc_val = d['category']
# Yc = Yc_val.replace(utils.category_classes)

#### Create Bert

In [8]:
import torch
model_name="aubmindlab/bert-base-arabertv02-twitter"

# load the preprocessing function they used to train their data on.
arabert_prep = ArabertPreprocessor(model_name=model_name)
X = X.apply(arabert_prep.preprocess)

# instantiate train and validation datasets
train_dataset = AraBERTDataset(X, Ys, model_name)
val_dataset = AraBERTDataset(X_val, Ys_val, model_name)

# create model
model = BertClassifier(model_name, n_classes=3)

# make the criterion cross entropy loss
criterion = torch.nn.CrossEntropyLoss()

# create the optimizer (Adam)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)


Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02-twitter were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dens

#### Train Bert

In [24]:
# Trianing for stance detection
train(model, train_dataset, val_dataset, criterion=criterion, optimizer=optimizer, classes_names=list(utils.stance_classes.keys()), n_classes=3, epochs=20)

Epochs: 1 | train Loss: 0.04796833172440529 | train Accuracy: 0.6674491392801252 | train macro avg persision: {'precision': 0.6692294250348051, 'recall': 0.6674491392801252, 'f1-score': 0.667697570275341, 'support': 16614}

Classification Report:               precision    recall  f1-score   support

     Neutral     0.6245    0.6120    0.6181      5538
    Positive     0.7369    0.6900    0.7127      5538
    Negative     0.6463    0.7004    0.6723      5538

    accuracy                         0.6674     16614
   macro avg     0.6692    0.6674    0.6677     16614
weighted avg     0.6692    0.6674    0.6677     16614


Epochs: 1 | val Loss: 0.03928755223751068 | val Accuracy: 0.727 | val macro avg persision: {'precision': 0.5359176366223423, 'recall': 0.6665876964384427, 'f1-score': 0.5640528004463975, 'support': 1000}

Classification Report:               precision    recall  f1-score   support

     Neutral     0.3838    0.6032    0.4691       126
    Positive     0.9528    0.7537 