In [20]:
# !pip install camel-tools
# !pip install Arabic-Stopwords
# camel_data light

In [2]:
# built in modules
import re
import random
import numpy as np
import pandas as pd
from sklearn import metrics 

# .py files 
from preprocess import Preprocess
from feature_extraction import BOW
from feature_extraction import TFIDF
from feature_extraction import CBOW
from feature_extraction import SG

from classical_models import SVMmodel
# Can use this library to reload a specific module if the notebook can't see changes in the imported module
import importlib

Stance detection labels meaning is as follows:
1. Positive (1): means that the tweet author encourages and supports vaccination.
2. Negative (-1): means that the tweet author refuses vaccination.
3. Neutral (0): means that the tweet neither supports nor refuses vaccination.

Category labels meaning is as follows:
1. Info_News: Information about vaccination.
2. Celebrities: mentioning celebrities taking vaccinations.
3. Plan: Governmental plan or progress of vaccination.
4. Request: Requests from governments regarding the vaccination process.
5. Rumor: the tweet is a rumor.
6. Advice: Advice related to the virus or the vaccination
7. Restriction: Restrictions due to the virus e.g. traveling.
8. Personal: Personal opinion or story about vaccination.
9. Unrelated: Unrelated to vaccination.
10.Others: Vaccination related but not one of the above.

In [3]:
t = pd.read_csv('./Dataset/train.csv')
d = pd.read_csv('./Dataset/dev.csv')
preprocess = Preprocess(INCLUDE_EMOJIS=True)

Emojis:  True
Lemmatizor:  camel


In [4]:
# we have 9 categories so we have an array of len 9 for each y
def encode_category(y):
    '''
    Input: y a list of string labels for the category of each document 
    Output: a list of encoded 10 sized array for the category of each doc 
            for "other" category , it has an array =[0 0 0 0 0 0 0 0 0 1] 
    '''
    y_encoded=[]
    for element in y:
        # encoded= np.zeros(10,dtype=int)
        encoded=0
        if element=="info_news":
            # encoded[0]=1
            encoded=0
        elif element=="celebrity":
            # encoded[1]=1
            encoded=1
        elif element=="plan":
            # encoded[2]=1
            encoded=2
        elif element=="requests":
            # encoded[3]=1
            encoded=3
        elif element=="rumors":
            # encoded[4]=1
            encoded=4
        elif element=="advice":
            # encoded[5]=1
            encoded=5
        elif element=="restrictions":
            # encoded[6]=1
            encoded=6
        elif element=="personal":
            # encoded[7]=1
            encoded=7
        elif element=="unrelated":
            # encoded[8]=1
            encoded=8
        else:
            # encoded[9]=1
            encoded=9
        y_encoded.append(encoded)
    return y_encoded


### Understanding the Data
- 80% tweets are positive (class 1), the rest are neutral and negative.
- 50% tweets belongs to info_news category.

In [5]:
# analyze dataset
print(t.info())
# d
# print(d.info())

# count for each label
print("####################################")
print("counts for each lablel :")
print(t['category'].value_counts(normalize=True))
# same for d
# print(d['category'].value_counts(normalize=True))

# count for stance labels
print("####################################")
print("counts for each stance label :")
print(t['stance'].value_counts(normalize=True))
# d
print(d['stance'].value_counts(normalize=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6988 entries, 0 to 6987
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      6988 non-null   object
 1   category  6988 non-null   object
 2   stance    6988 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 163.9+ KB
None
####################################
counts for each lablel :
info_news       0.517459
personal        0.146680
celebrity       0.139525
plan            0.086720
unrelated       0.046222
others          0.023898
requests        0.016027
rumors          0.011305
advice          0.009588
restrictions    0.002576
Name: category, dtype: float64
####################################
counts for each stance label :
 1    0.792501
 0    0.144820
-1    0.062679
Name: stance, dtype: float64
 1    0.804
 0    0.126
-1    0.070
Name: stance, dtype: float64


In [6]:
X = t['text']
ys = t['stance']
yc = t['category']

In [7]:
yc[0:10]

0    celebrity
1    info_news
2    info_news
3    celebrity
4     personal
5    info_news
6    info_news
7     personal
8    unrelated
9    info_news
Name: category, dtype: object

In [8]:
yc=encode_category(yc)

In [9]:
# this is the array of 
yc[0:10]

[1, 0, 0, 1, 7, 0, 0, 7, 8, 0]

In [10]:
X_dev=d['text']
ys_dev=d['stance']
yc_dev=d['category']

In [11]:
yc_dev=encode_category(yc_dev)

In [12]:
# Preprocess the test data
X = X.apply(preprocess.do_all)

In [13]:
# Preprocess the dev data
X_dev = X_dev.apply(preprocess.do_all)

In [14]:
r = random.randint(0, len(X))
# r = 900
print(r, '\n')
print(t.text[r], '\n')
print(X[r], '\n')

6916 

السديس عن تلقي الملك سلمان لقاح كورونا: قدم للشعب السعودي والعالم درسا  <LF>https://t.co/wImO0WR13q 

['السديس', 'عَن', 'أَلْقَى', 'مَلِك', 'سَلْمان', 'لَقاح', 'كَوَّر', 'قَدَّم', 'شَعْب', 'سَعُودِيّ', 'عالَم', 'دَرْس', '<LF>', '<LINK>'] 



In [15]:
# Print Vocabulary size
st = [word for x in X for word in x]
si = set(st)
print(len(st))
print(len(si))

209605
11771


### Getting the train , test and dev features

In [35]:
train_features_bow,test_features_bow=BOW(train_documents=X,test_documents=X_dev)

In [36]:
train_features_tfidf,test_features_tfidf=TFIDF(train_documents=X,test_documents=X_dev)

In [16]:
train_features_tfidf,test_features_cbow=CBOW(train_documents=X,test_documents=X_dev)

In [None]:
train_features_tfidf,test_features_sg=SG(train_documents=X,test_documents=X_dev)

## Classical model Training 

- Classification Task Results:

|  | SVM | Random Forest | KNN |  Voting system |
| --------------- | --------------- | --------------- | --------------- | --------------- |
| TFIDF |69%  |  |  ||
| BOW | 54%  |   |   | |


- Stance Task Results:

|  | SVM | Random Forest | KNN |  Voting system |
| --------------- | --------------- | --------------- | --------------- | --------------- |
| TFIDF |81%  |  |  ||
| BOW | 80%  |   |   | |

In [37]:
yc_pred=SVMmodel(Xtrain=train_features_tfidf,y_train=yc,X_test=test_features_tfidf)

In [38]:
ys_pred=SVMmodel(Xtrain=train_features_tfidf,y_train=ys,X_test=test_features_tfidf)

In [39]:
classify_accuracy=metrics.accuracy_score(y_true=yc_dev,y_pred=yc_pred)
stance_accuracy=metrics.accuracy_score(y_true=ys_dev,y_pred=ys_pred)

print("The classification accuracy after training on svm on tfidf features : ",classify_accuracy)
print("The stance accuracy after training on svm on tfidf features : ",stance_accuracy)

The classification accuracy after training on svm on tfidf features :  0.697
The stance accuracy after training on svm on tfidf features :  0.816


In [40]:
yc_pred=SVMmodel(Xtrain=train_features_bow,y_train=yc,X_test=test_features_tfidf)

In [41]:
ys_pred=SVMmodel(Xtrain=train_features_bow,y_train=ys,X_test=test_features_tfidf)

In [42]:
classify_accuracy=metrics.accuracy_score(y_true=yc_dev,y_pred=yc_pred)
stance_accuracy=metrics.accuracy_score(y_true=ys_dev,y_pred=ys_pred)

print("The classification accuracy after training on svm on bow features : ",classify_accuracy)
print("The stance accuracy after training on svm on bow features : ",stance_accuracy)

The classification accuracy after training on svm on bow features :  0.549
The stance accuracy after training on svm on bow features :  0.807
