In [1]:
# !pip install -r requirements.txt
# !camel_data light
# import nltk
# nltk.download('stopwords')

In [2]:
# standard-library and third-party modules
import re
import random
import numpy as np
import pandas as pd
from sklearn import metrics 

# .py files 
from preprocess import Preprocess
from feature_extraction import BOW
from feature_extraction import TFIDF
from feature_extraction import CBOW
from feature_extraction import SG

from classical_models import SVMmodel
# Can use this library to reload a specific module if the notebook can't see changes in the imported module
import importlib

The stance labels have the following meanings:
1. Positive (1): means that the tweet author encourages and supports vaccination.
2. Negative (-1): means that the tweet author refuses vaccination.
3. Neutral (0): means that the tweet neither supports nor refuses vaccination.

The category labels have the following meanings:
1. Info_News: Information about vaccination.
2. Celebrities: mentioning celebrities taking vaccinations.
3. Plan: Governmental plan or progress of vaccination.
4. Request: Requests from governments regarding the vaccination process.
5. Rumor: the tweet is a rumor.
6. Advice: Advice related to the virus or the vaccination
7. Restriction: Restrictions due to the virus e.g. traveling.
8. Personal: Personal opinion or story about vaccination.
9. Unrelated: Unrelated to vaccination.
10. Others: Vaccination related but not one of the above.

In [3]:
# Load the train and dev splits from the local Dataset folder.
t, d = (
    pd.read_csv(csv_path)
    for csv_path in ('./Dataset/train.csv', './Dataset/dev.csv')
)


In [4]:
# Preprocessing pipeline applied to both the train and the dev tweets.
preprocess = Preprocess()

# Category label names; a label's position in this list is its integer class id.
categories = [
    "info_news",
    "celebrity",
    "plan",
    "requests",
    "rumors",
    "advice",
    "restrictions",
    "personal",
    "unrelated",
    "others",
]

Emojis:  True
Lemmatizor:  camel


In [5]:
# There are 10 categories; each label is encoded as its integer position in the label list.
def encode_category(y, labels=None):
    '''
    Encode string category labels as integer class ids.

    Input:  y      an iterable of string labels, one per document
            labels optional ordered list of the known labels; when omitted the
                   notebook-level `categories` list is used
    Output: a list with one int per document, namely the position of its label
            in `labels` (with the default list: "info_news" -> 0, "others" -> 9).
            Note that this is an integer id, not a one-hot vector.
    Raises: ValueError if a label does not appear in `labels`
    '''
    if labels is None:
        labels = categories

    # Build the lookup once instead of scanning the list for every document.
    # setdefault keeps the first position of a repeated label, like list.index.
    index_of = {}
    for position, label in enumerate(labels):
        index_of.setdefault(label, position)

    encoded = []
    for ele in y:
        if ele not in index_of:
            raise ValueError(f"{ele!r} is not a known category label")
        encoded.append(index_of[ele])
    return encoded

### Understanding the Data
- About 80% of the training tweets are positive (class 1); the rest are neutral or negative.
- About 50% of the training tweets belong to the info_news category.

In [6]:
# Structural summary of the training frame (columns, non-null counts, dtypes).
print(t.info())

# Relative frequency of every category label in the training set.
print("####################################", "counts for each lablel :", sep="\n")
print(t['category'].value_counts(normalize=True))

# Relative frequency of every stance label: training set first, then dev set.
print("####################################", "counts for each stance label :", sep="\n")
print(t['stance'].value_counts(normalize=True))
print(d['stance'].value_counts(normalize=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6988 entries, 0 to 6987
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      6988 non-null   object
 1   category  6988 non-null   object
 2   stance    6988 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 163.9+ KB
None
####################################
counts for each lablel :
info_news       0.517459
personal        0.146680
celebrity       0.139525
plan            0.086720
unrelated       0.046222
others          0.023898
requests        0.016027
rumors          0.011305
advice          0.009588
restrictions    0.002576
Name: category, dtype: float64
####################################
counts for each stance label :
 1    0.792501
 0    0.144820
-1    0.062679
Name: stance, dtype: float64
 1    0.804
 0    0.126
-1    0.070
Name: stance, dtype: float64


In [6]:
# Split the training frame into the raw text and the two label columns.
X, ys, yc = t['text'], t['stance'], t['category']

In [7]:
# Show the training rows that repeat an earlier row in every column.
print("####################################", "duplicated rows :", sep="\n")
print(t.loc[t.duplicated()])

####################################
duplicated rows :
                                                   text   category  stance
529   اللقاح لازم ينحفظ بدرجة حرارة ٧٠ دون الصفر وال...  info_news       1
740   هناك مصادر لقاح أثبتت نجاحها غير أمريكا ويمكن ...  info_news       1
963   لو كان اللقاح الأمريكي أمن ومفيد لما أصبحت الو...   personal      -1
977   اللقاح الأمريكي لم ينجح في الاختبارات السريرية...  info_news      -1
1684    #نريد_لقاح_آمن حتى نأمن صحة اطفالنا في المستقبل   requests       1
1878  هناك مصادر لقاح أثبتت نجاحها غير أمريكا ويمكن ...  info_news       1
2477  محمد بن زايد: الإمارات قدمت أكثر من مليون و 27...  info_news       1
2644         نريد التخلص من كورونا لكننا #نريد_لقاح_آمن   requests       1
2646  في ظل تسابق دول العالم على تطعيم شعوبها.. تَعَ...  info_news       1
2786  لو كان اللقاح الأمريكي أمن ومفيد لما أصبحت الو...   personal      -1
3231  #نريد_لقاح_آمن حتى ترجع الحياة الى طبيعتها بعد...  info_news       1
3728    #نريد_لقاح_آمن حتى نأمن صحة اطفالنا ف

In [8]:
# Training inputs (tweet text) and targets (stance and category labels).
X, ys, yc = (t['text'], t['stance'], t['category'])

In [9]:
# Encode from the raw column instead of from `yc` itself: re-running this cell
# would otherwise try to encode already-encoded integers and raise ValueError.
print(t['category'][0:10])
yc = encode_category(t['category'])
print(yc[0:10])

0    celebrity
1    info_news
2    info_news
3    celebrity
4     personal
5    info_news
6    info_news
7     personal
8    unrelated
9    info_news
Name: category, dtype: object
[1, 0, 0, 1, 7, 0, 0, 7, 8, 0]


In [10]:
# Dev inputs (tweet text) and targets (stance and category labels).
X_dev, ys_dev, yc_dev = d['text'], d['stance'], d['category']

In [11]:
# Encode from the raw dev column so the cell can be re-run safely; encoding
# `yc_dev` in place would fail on the second run because it already holds ints.
yc_dev = encode_category(d['category'])

In [12]:
# Preprocess the training data.
# Start from the raw text column so that re-running this cell never feeds
# already-preprocessed token lists back into the pipeline.
X = t['text'].apply(preprocess.do_all)

In [13]:
# Preprocess the dev data.
# Start from the raw text column so that re-running this cell never feeds
# already-preprocessed token lists back into the pipeline.
X_dev = d['text'].apply(preprocess.do_all)

In [14]:
# Show one random training tweet before and after preprocessing.
# randrange excludes its upper bound; randint(0, len(X)) is inclusive and could
# return len(X), which is not a valid row label.
r = random.randrange(len(X))
print(r, '\n')
print(t.text[r], '\n')
print(X[r], '\n')

6916 

السديس عن تلقي الملك سلمان لقاح كورونا: قدم للشعب السعودي والعالم درسا  <LF>https://t.co/wImO0WR13q 

['السديس', 'عَن', 'أَلْقَى', 'مَلِك', 'سَلْمان', 'لَقاح', 'كَوَّر', 'قَدَّم', 'شَعْب', 'سَعُودِيّ', 'عالَم', 'دَرْس', '<LF>', '<LINK>'] 



In [15]:
# Show a fixed dev tweet before and after preprocessing so the printed example
# is reproducible. To inspect a random tweet instead, use
# random.randrange(len(X_dev)); randint(0, len(X_dev)) can return an
# out-of-range index because its upper bound is inclusive.
r = 150
print(r, '\n')
print(d.text[r], '\n')
print(X_dev[r], '\n')

150 

تساؤلات آخر ليل <LF>اذا ما كفانا لقاح الكورونا ...بزيدولو مي ؟؟ 😅<LF>#هبل 

['تَساؤُل', 'آخَر', 'كَفَى', 'لَقاح', 'الكورونا', 'بزيدولو', 'مَيّ', '<LF>', '<smiling_face_with_open_mouth_&_cold_sweat>', '<LF>', 'أَهْبَل', 'أَهْبَل'] 



In [16]:
# Corpus statistics: total number of tokens, then number of distinct tokens.
lst = [token for doc_tokens in X for token in doc_tokens]
vocab = set(lst)
print(len(lst), len(vocab), sep="\n")

157990
11755


### Getting the train and dev/test features

In [17]:
# Bag-of-words features for the train and dev documents.
train_features_bow, test_features_bow = BOW(
    train_documents=X,
    test_documents=X_dev,
)

# Sanity checks: one row per training document, one column per vocabulary token.
assert len(X) == train_features_bow.shape[0]
assert len(vocab) == train_features_bow.shape[1]

In [18]:
# TF-IDF features for the train and dev documents.
train_features_tfidf, test_features_tfidf = TFIDF(
    train_documents=X,
    test_documents=X_dev,
)

# Sanity checks: one row per training document, one column per vocabulary token.
assert len(X) == train_features_tfidf.shape[0]
assert len(vocab) == train_features_tfidf.shape[1]

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The analyzer returns each document unchanged, so the items of X are used as
# given (the preprocessed documents printed earlier are lists of tokens).
vectorizer = TfidfVectorizer(analyzer=lambda x: x)
train_features = vectorizer.fit_transform(X)

# Print the 20 terms with the highest summed TF-IDF weight for each CATEGORY.
feature_names = vectorizer.get_feature_names_out()
category_ids = np.array(yc)
for i, category in enumerate(categories):
    # Column-wise sum over the documents of this category, taken directly on the
    # sparse matrix. This replaces a Python-level element-by-element sum over
    # dense rows. np.asarray(...).ravel() flattens the 1 x n_terms result.
    lst = np.asarray(train_features[category_ids == i].sum(axis=0)).ravel()
    # Indices of the 20 largest sums, largest first.
    top20 = np.argsort(lst)[::-1][:20]
    # Print names and values.
    print(category)
    print([(feature_names[j], lst[j]) for j in top20])
    print()

info_news
[('<LF>', 294.27850738746736), ('كَوَّر', 226.6223101860665), ('لَقاح', 216.47097715595743), ('<NUM>', 201.26844710071643), ('<LINK>', 177.23343555817593), ('جُرْعَة', 86.49775703928718), ('فَيْرُوس', 83.84257450743797), ('عاجِل', 82.22749840409844), ('صِحَّة', 81.42622511965635), ('تَطْعِيم', 77.1925725400968), ('فايزر', 75.3925624882043), ('كوفيد', 60.67509055645215), ('مِلْيُون', 58.68444790998813), ('عَرَبِيّ', 56.742524008525024), ('ضِدّ', 55.955724898757246), ('دَوْلَة', 52.39212299688869), ('سَعُودِيّ', 52.091068021707), ('أَمارَة', 50.663568829629476), ('يَوْم', 48.48224587671623), ('أَراد', 46.33697390658954)]

celebrity
[('تَلَقَّى', 116.90073580612544), ('<LF>', 96.44052474656165), ('كَوَّر', 92.25816784582415), ('مَلِك', 80.05493094970217), ('خادِم', 77.22553708457893), ('حَرَم', 76.63791147518536), ('شَرِيف', 70.9978018609423), ('جُرْعَة', 69.98564104042319), ('<LINK>', 69.63818048288577), ('لَقاح', 67.28260845292984), ('سَلْمان', 41.43832195114347), ('اللَّه', 3

In [16]:
# CBOW features for the train and dev documents.
# The train result gets its own name: it used to be assigned to
# `train_features_tfidf`, which silently replaced the TF-IDF training features
# while the TF-IDF test features stayed in place.
train_features_cbow, test_features_cbow = CBOW(train_documents=X, test_documents=X_dev)

In [None]:
# SG features for the train and dev documents.
# The train result gets its own name: it used to be assigned to
# `train_features_tfidf`, which silently replaced the TF-IDF training features
# while the TF-IDF test features stayed in place.
train_features_sg, test_features_sg = SG(train_documents=X, test_documents=X_dev)

## Classical model Training 

- Classification Task Results:

|  | SVM | Random Forest | KNN |  Voting system |
| --------------- | --------------- | --------------- | --------------- | --------------- |
| TFIDF |69%  |  |  ||
| BOW | 54%  |   |   | |


- Stance Task Results:

|  | SVM | Random Forest | KNN |  Voting system |
| --------------- | --------------- | --------------- | --------------- | --------------- |
| TFIDF |81%  |  |  ||
| BOW | 80%  |   |   | |

In [20]:
# Stack the BOW and TF-IDF feature matrices side by side (column-wise), for the
# training set first and then for the dev set.
train_features, test_features = (
    np.concatenate(pair, axis=1)
    for pair in (
        (train_features_bow, train_features_tfidf),
        (test_features_bow, test_features_tfidf),
    )
)

# Shape of the BOW features alone, then of the combined features.
print(train_features_bow.shape, train_features.shape, sep="\n")

(6988, 11755)
(6988, 23510)


In [21]:
# NOTE(review): LRmodel was used here without being imported anywhere in the
# visible notebook, so a fresh "Restart & Run All" raised NameError. It is
# presumably defined in classical_models next to SVMmodel - confirm.
from classical_models import LRmodel

# Models to evaluate. Each entry is called with the training features, the
# training labels and the dev features, and returns the dev predictions.
clss = [LRmodel]

for cls in clss:
    # Number of training rows to use; lower it (e.g. to 1000) for a quick run.
    N = len(train_features)
    yc_pred = cls(Xtrain=train_features[:N], y_train=yc[:N], X_test=test_features)
    # ys_pred = cls(Xtrain=train_features, y_train=ys, X_test=test_features)

    print('========= Category =========')
    print(metrics.classification_report(y_true=yc_dev,y_pred=yc_pred))
    # Stance evaluation is currently disabled, so only its header is printed.
    print('========= Stance =========')
    # print(metrics.classification_report(y_true=ys_dev,y_pred=ys_pred))

              precision    recall  f1-score   support

           0       0.69      0.78      0.73       545
           1       0.85      0.81      0.83       145
           2       0.22      0.16      0.19        82
           3       0.33      0.10      0.15        20
           4       0.00      0.00      0.00        15
           5       0.50      0.10      0.17        10
           6       0.00      0.00      0.00         2
           7       0.47      0.51      0.49       128
           8       0.50      0.42      0.45        36
           9       0.12      0.06      0.08        17

    accuracy                           0.64      1000
   macro avg       0.37      0.29      0.31      1000
weighted avg       0.61      0.64      0.62      1000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Category predictions from an SVM on BOW features. The test features must come
# from the same feature space as the training features; the previous call
# trained on BOW but predicted on the TF-IDF dev features.
yc_pred = SVMmodel(Xtrain=train_features_bow, y_train=yc, X_test=test_features_bow)

In [None]:
# Stance predictions from an SVM on BOW features. The test features must come
# from the same feature space as the training features; the previous call
# trained on BOW but predicted on the TF-IDF dev features.
ys_pred = SVMmodel(Xtrain=train_features_bow, y_train=ys, X_test=test_features_bow)

In [None]:
# Dev-set accuracy of the category and stance predictions.
classify_accuracy = metrics.accuracy_score(y_pred=yc_pred, y_true=yc_dev)
stance_accuracy = metrics.accuracy_score(y_pred=ys_pred, y_true=ys_dev)

print(
    "The classification accuracy after training on svm on bow features : ",
    classify_accuracy,
)
print(
    "The stance accuracy after training on svm on bow features : ",
    stance_accuracy,
)