# Imports


In [92]:
# For Data
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from datetime import datetime
from tqdm.notebook import tqdm

#  For Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.offline as pyo 
import plotly.graph_objects as go
import plotly.figure_factory as ff
import missingno as msno
from wordcloud import WordCloud
import random 

# For NLP
import nltk
from nltk.corpus import stopwords
from spellchecker import SpellChecker
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

from sklearn.model_selection import train_test_split

# For Styling
plt.style.use('fivethirtyeight')

# Downloading periphrals
nltk.download('vader_lexicon')
nltk.download('stopwords')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/mostafawael/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mostafawael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Import the dataset

In [94]:
file = '../Dataset/train.csv'
df = pd.read_csv(file)
devFile = '../Dataset/dev.csv'
dev_df = pd.read_csv(devFile)
print(f"Train dataset size = {df.shape}")
print(f"Dev dataset size = {dev_df.shape}")

Train dataset size = (6988, 3)
Dev dataset size = (1000, 3)


# Explore the data

# Data Preprocessing

In [95]:
def CleanTweets(df):
    data = df.copy() # Copying the dataset
    ##### Related to the tweets #####
    # Remove twitter handlers
    data.text = data.text.apply(lambda x:re.sub('@[^\s]+','',x))
    # Remove digits
    data.text = data.text.apply(lambda x:re.sub(r'\d+','',x))
    # Remove all the (special characters, punctuations, and emojis)
    data.text = data.text.apply(lambda x:' '.join(re.findall(r'\w+', x)))
    # Remove all english alphabets
    data.text = data.text.apply(lambda x:re.sub(r'[a-zA-Z]', '', x))
    # Substituting multiple spaces with single space
    data.text = data.text.apply(lambda x:re.sub(r'\s+', ' ', x, flags=re.I))
    # Remove all the empty spaces
    data.text = data.text.apply(lambda x: x.strip())
    # Remove all the stopwords
    data.text = data.text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords.words('arabic'))]))
    ##### Related to the dataset ##### 
    # Remove all the empty rows
    data = data[data.text != '']
    # Removing the duplicated rows
    data = data.drop_duplicates()
    # Removing the duplicated tweets
    data = data.drop_duplicates(subset=['text'])
    # Removing the tweets with less than 10 characters
    data = data[data.text.str.len() > 10]
    # Removing the tweets with less than 4 words
    data = data[data.text.str.split().str.len() > 3]
    # Resetting the index, why? because we removed some rows
    data = data.reset_index(drop=True)
    return data

In [96]:
def cleanData(df, name, clean = False):  # If you want to clean the data, set it to True. Default is False to save time
    if clean:
        data  = CleanTweets(df)
        data.to_csv('../out/'+name+'_cleaned_data.csv', index=False) # print the df in a csv file
        data.head() # Displaying the dataset
    else:
        # Read the cleaned data
        data = pd.read_csv('../out/'+name+'_cleaned_data.csv')
    return data
data = cleanData(df, 'train', clean = False)
print(f"Cleaned dataset size = {data.shape}")

Cleaned dataset size = (6557, 3)


# More data preprocessing
**Extracting any required fields from the data**

In [97]:
def processing(data):
    # Apply Lemmatization to the tweets
    from nltk.stem.isri import ISRIStemmer # Arabic Lemmatization
    st = ISRIStemmer()
    data['Lemmatization'] = data.text.apply(lambda x: ''.join([st.stem(word) for word in x.split()]))

    # Extract Sentiment Values for each tweet 
    data['sentiment'] = data['stance'].apply(lambda x: 
                                                'positive' if x == 1 
                                                else ('negative' if x == -1 
                                                else 'neutral' )); # Extracting the overall sentiment
    # Useful Information
    data['words'] = data.text.apply(lambda x:re.findall(r'\w+', x ))
    data['errors'] = data.words.apply(spell.unknown)
    data['errorsCount'] = data.errors.apply(len)
    data['sentenceLength'] = data.text.apply(len)

    return data
data = data.pipe(processing)    
data.head() # show the dataset
data.to_csv('../out/processed_data.csv', index=False) # print the df in a csv file
print(f"Processed dataset size = {data.shape}")

Processed dataset size = (6557, 9)


# Feature Engineering

In [102]:
# Extract word embeddings for each tweet
def extractWordEmbeddings(data):
    from gensim.models import Word2Vec
    model = Word2Vec(data, min_count=1, window=5, sg=0)
    model.save('../out/word2vec.model')
    return model
model = extractWordEmbeddings(data['Lemmatization'])

# use the model to extract word embeddings
def getWordEmbeddings(model, word):
    return model.wv[word]

# get the word embeddings for each tweet
def getTweetsEmbeddings(model, tweets):
    return tweets.apply(lambda x: np.mean([getWordEmbeddings(model, word) for word in x], axis=0))

# get the word embeddings for each tweet
data['features'] = getTweetsEmbeddings(model, data['Lemmatization'])


In [105]:
from data_balance import *

# convert features into 2d array
trainingFeatures = np.array([np.array(xi) for xi in data['features']])
stances = data['stance'].to_numpy()
columns = ["f" + str(i + 1) for i in range(len(trainingFeatures[0]))]

df = pd.DataFrame(trainingFeatures, columns=columns)
df = pd.DataFrame(trainingFeatures)
df['stance'] = stances

X, y = balance_data(df)
print("Some notes about dimensions of the data")
print(f"X_train size before cleaning = {trainingFeatures.shape}")
print(f"X_train size = {X.shape}")
print(f"y_train size = {y.shape}")

Before balancing:
Class=2, n=5207 (79.411%)
Class=1, n=954 (14.549%)
Class=0, n=396 (6.039%)
After balancing:
Class=2, n=5207 (33.333%)
Class=1, n=5207 (33.333%)
Class=0, n=5207 (33.333%)
Some notes about dimensions of the data
X_train size before cleaning = (6557, 100)
X_train size = (15621, 100)
y_train size = (15621,)


# Used Features

In [107]:
def getFeatures():
    return np.load('../out/balanced_data_stances/features.npy', allow_pickle=True)

def getStances():
    return np.load('../out/balanced_data_stances/stances.npy', allow_pickle=True)


# Prepare the dev set

In [108]:
# dev_data = cleanData(dev_df, 'dev', clean =False)
# dev_data = processing(dev_data)   
# model = extractWordEmbeddings(dev_data['Lemmatization'])
# dev_data['features'] = getTweetsEmbeddings(model, dev_data['Lemmatization'])


# Splitting the data


In [117]:
# X = getFeatures()
# y = getStances()

# Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=42)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (1562, 100)
X_test shape: (14059, 100)


# Model Building
**Build a multi-class classifier to predict the category of the tweet**

In [110]:
def trainModel(X_train, y_train, model):
    model.fit(X_train, y_train)
    return model
def testModel(X_test, y_test, model):
    y_pred = model.predict(X_test)
    return y_pred
def evaluateModel(y_test, y_pred):
    from sklearn.metrics import classification_report
    print(classification_report(y_test, y_pred))
    return classification_report(y_test, y_pred, output_dict=True)['accuracy']
def saveModel(model, model_name):
    import pickle
    pickle.dump(model, open(model_name, 'wb'))
    return model_name
def loadModel(model_name):
    import pickle
    return pickle.load(open(model_name, 'rb'))

def modelPipeline(X_train, y_train, X_test, y_test, model, model_name):
    model = trainModel(X_train, y_train, model)
    y_pred = testModel(X_test, y_test, model)
    report = evaluateModel(y_test, y_pred)
    saveModel(model, model_name)
    return model, report

In [118]:
# RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=500, max_depth=15, random_state=0)
model, report = modelPipeline(X_train, y_train, X_test, y_test, clf, '../out/clf.model')
report

              precision    recall  f1-score   support

          -1       0.74      0.83      0.78      4685
           0       0.66      0.64      0.65      4698
           1       0.68      0.61      0.64      4676

    accuracy                           0.70     14059
   macro avg       0.69      0.70      0.69     14059
weighted avg       0.69      0.70      0.69     14059



0.6959243189416032

In [119]:
# Xgboost
xgb = XGBClassifier()
_y_train = y_train + 1
_y_test = y_test + 1
model, report = modelPipeline(X_train, _y_train, X_test, _y_test, xgb, '../out/xgb.model')
report

              precision    recall  f1-score   support

           0       0.74      0.83      0.78      4685
           1       0.66      0.67      0.66      4698
           2       0.69      0.60      0.64      4676

    accuracy                           0.70     14059
   macro avg       0.70      0.70      0.69     14059
weighted avg       0.70      0.70      0.69     14059



0.6973468952272566

In [None]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB(var_smoothing=10)
model, report = modelPipeline(X_train, y_train, X_test, y_test, gnb, '../out/gnb.model')
report

              precision    recall  f1-score   support

           0       0.35      0.91      0.51       264
           1       0.00      0.00      0.00       246
           2       0.51      0.19      0.28       272

    accuracy                           0.37       782
   macro avg       0.29      0.37      0.26       782
weighted avg       0.30      0.37      0.27       782



0.37468030690537085

In [None]:
# SVM
from sklearn import svm
svm = svm.SVC()
model, report = modelPipeline(X_train, y_train, X_test, y_test, svm, 'out/svm.model')
report

              precision    recall  f1-score   support

           0       0.52      0.62      0.57       264
           1       0.51      0.52      0.51       246
           2       0.66      0.53      0.58       272

    accuracy                           0.55       782
   macro avg       0.56      0.55      0.55       782
weighted avg       0.56      0.55      0.56       782



0.5549872122762148

# Auto ML
Check out the [Auto SKlearn](https://automl.github.io/auto-sklearn/master/index.html)

In [None]:
import sklearn.datasets
import sklearn.metrics
from sklearn.utils.multiclass import type_of_target

import autosklearn.classification


In [None]:
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=1900, # overall time in seconds
    per_run_time_limit=1300, # time per model in seconds
    initial_configurations_via_metalearning=0,
    ensemble_size=10,
    n_jobs=8,
    smac_scenario_args={"runcount_limit": 1},
)
automl.fit(X_train, y_train)
y_pred = automl.predict(X_test)



In [None]:
print(automl.sprint_statistics())
print(automl.leaderboard())


auto-sklearn results:
  Dataset name: 1db8f54a-83b4-11ed-a009-09f3292e7554
  Metric: accuracy
  Best validation score: 0.918113
  Number of target algorithm runs: 1
  Number of successful target algorithm runs: 1
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 0
  Number of target algorithms that exceeded the memory limit: 0

          rank  ensemble_weight           type      cost   duration
model_id                                                           
2            1              1.0  random_forest  0.081887  52.255295


In [None]:
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_pred)) #  get the Score of the final ensemble


Accuracy score 0.9168797953964194


# Reference

1. https://www.kaggle.com/code/wonduk/text-clustering-pca-eda-on-covid19-dataset
2. https://www.kaggle.com/code/haefatim/pfizer-tweets-eda-sentiment-analysis#%23-Reference