# Imports


In [16]:
# For Data
import numpy as np
import pandas as pd
import re
from datetime import datetime
from tqdm.notebook import tqdm

#  For Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.offline as pyo 
import plotly.graph_objects as go
import plotly.figure_factory as ff
import missingno as msno
from wordcloud import WordCloud
import random

# For models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier


# For NLP
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from spellchecker import SpellChecker

import pickle
from imblearn.over_sampling import SMOTE
from collections import Counter

# For Styling
plt.style.use('fivethirtyeight')

# Downloading periphrals
nltk.download('vader_lexicon')
nltk.download('stopwords')

from preprocessing import *
from plot import *
from feature_extractor import *
from data_balance import *
from model import *

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/mostafawael/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mostafawael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Import the dataset

In [17]:
train_file = 'Dataset/train.csv'
devFile = 'Dataset/dev.csv'
train_df = pd.read_csv(train_file)
dev_df = pd.read_csv(devFile)
print(f"Training dataset size = {train_df.shape}")
print(f"Dev dataset size = {dev_df.shape}")

Training dataset size = (6988, 3)
Dev dataset size = (1000, 3)


# Data Preprocessing

In [18]:
# Data cleaning
training_data = cleanData(train_df, 'training', clean = False, clearData = False)
print(f"Cleaned Training dataset size = {training_data.shape}")
# Data cleaning
dev_data = cleanData(dev_df, 'dev', clean = False, clearData = False)
print(f"Cleaned Dev dataset size = {dev_data.shape}")

Cleaned Training dataset size = (6988, 3)
Cleaned Dev dataset size = (1000, 3)


In [19]:
# Some preprocessing steps, like extracting limmitization
training_data = training_data.pipe(processing)    
print(f"Processed Training dataset size = {training_data.shape}")
print(training_data.columns)
# Some preprocessing steps, like extracting limmitization
dev_data = processing(dev_data)   
print(f"Processed dev dataset size = {dev_data.shape}")
print(dev_data.columns)

Processed Training dataset size = (6988, 5)
Index(['text', 'category', 'stance', 'Lemmatization', 'sentiment'], dtype='object')
Processed dev dataset size = (1000, 5)
Index(['text', 'category', 'stance', 'Lemmatization', 'sentiment'], dtype='object')


In [20]:
# Save the out
training_data.to_csv('out/training_data_processed.csv', index=False) # print the df in a csv file
# Save the out
dev_data.to_csv('out/dev_data_processed.csv', index=False) # print the df in a csv file

# Feature Engineering

In [21]:
## Word2Vec embeddings
model = extractWordEmbeddings(training_data['Lemmatization']) # use word2vec to extract the word embeddings
training_data['features'] = getTweetsEmbeddings(model, training_data['Lemmatization']) # get the word embeddings for each tweet
trainingFeatures = training_data['features'].to_numpy()

## Word2Vec embeddings
model = extractWordEmbeddings(dev_data['Lemmatization']) # use word2vec to extract the word embeddings
dev_data['features'] = getTweetsEmbeddings(model, dev_data['Lemmatization']) # get the word embeddings for each tweet
devFeatures = dev_data['features'].to_numpy()


In [22]:
# XLM embeddings
# features = np.load('out/train_embeddings.npy')
# XLM embeddings
# dev_data['features'] = np.load('out/test_embeddings.npy').tolist()

# Balancing the training dataset

In [23]:
# convert features into 2d array
trainingFeatures = np.array([np.array(xi) for xi in trainingFeatures])
stances = training_data['stance'].to_numpy()
columns = ["f" + str(i + 1) for i in range(len(trainingFeatures[0]))]

df = pd.DataFrame(trainingFeatures, columns=columns)
df = pd.DataFrame(trainingFeatures)
df['stance'] = stances

X_train, y_train = balance_data(df)
print("Some notes about dimensions of the data")
print(f"X_train size before cleaning = {trainingFeatures.shape}")
print(f"X_train size = {X_train.shape}")
print(f"y_train size = {y_train.shape}")

Before balancing:
Class=2, n=5538 (79.250%)
Class=1, n=1012 (14.482%)
Class=0, n=438 (6.268%)
After balancing:
Class=2, n=5538 (33.333%)
Class=1, n=5538 (33.333%)
Class=0, n=5538 (33.333%)
Some notes about dimensions of the data
X_train size before cleaning = (6988, 100)
X_train size = (16614, 100)
y_train size = (16614,)


In [26]:
split = False
if split:
    # Split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(trainingFeatures, stances, test_size=0.2, random_state=42)
else:
    X_test = np.array([np.array(xi) for xi in dev_data['features'].to_numpy()])
    # X_test = np.load('out/test_embeddings.npy')
    y_test = dev_data['stance'].to_numpy()

print(f"X_test size = {X_test.shape}")
print(f"y_test size = {y_test.shape}")

X_test size = (1000, 100)
y_test size = (1000,)


# Model Building
**Build a multi-class classifier to predict the category of the tweet**

In [27]:
# RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=15, random_state=0)
model, report = modelPipeline(X_train, y_train, X_test, y_test, clf, 'out/models/clf.model')
report

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        70
           0       0.00      0.00      0.00       126
           1       0.79      0.81      0.80       804
           2       0.00      0.00      0.00         0

    accuracy                           0.65      1000
   macro avg       0.20      0.20      0.20      1000
weighted avg       0.63      0.65      0.64      1000



0.648