# Imports


In [1]:
# For Data
import numpy as np
import pandas as pd
import re
from datetime import datetime
from tqdm.notebook import tqdm

#  For Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.offline as pyo 
import plotly.graph_objects as go
import plotly.figure_factory as ff
import missingno as msno
from wordcloud import WordCloud
import random

# For models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier


# For NLP
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from spellchecker import SpellChecker

import pickle
from imblearn.over_sampling import SMOTE
from collections import Counter

# For Styling: apply the 'fivethirtyeight' matplotlib style to every figure in this notebook.
plt.style.use('fivethirtyeight')

# Downloading peripherals: the NLTK resources requested by this notebook
# (the VADER lexicon and the stopword list).
nltk.download('vader_lexicon')
nltk.download('stopwords')

# Project-local helper modules. The star imports place every public name of each
# module in the notebook namespace. Helpers called below but not defined in this
# notebook (cleanData, processing, get_feature_models, get_features, balance_data,
# modelPipeline) presumably come from these modules -- TODO confirm which one
# provides which, and consider explicit imports.
from preprocessing import *
from plot import *
from feature_extractor import *
from data_balance import *
from model import *

# Silence every warning category for the remainder of the session.
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/mostafawael/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mostafawael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/mostafawael/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mostafawael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Path prefix passed to modelPipeline for each classifier trained below.
MODELS_PATH = "out/models/stance/"

# Import the dataset

In [3]:
# Locations of the two CSV splits used throughout the notebook.
train_file = 'Dataset/trainDev.csv'
devFile = 'Dataset/ourTest.csv'

# Read both splits into DataFrames.
train_df, dev_df = pd.read_csv(train_file), pd.read_csv(devFile)

# Quick sanity check: (rows, columns) of each split.
print("Training dataset size = {}".format(train_df.shape))
print("Dev dataset size = {}".format(dev_df.shape))

Training dataset size = (7988, 3)
Dev dataset size = (2000, 4)


# Data Preprocessing

In [4]:
# Clean the training split (clearData is enabled for this split only).
training_data = cleanData(train_df, 'training', clean=True, clearData=True)
print("Cleaned Training dataset size = {}".format(training_data.shape))

# Clean the dev split (clearData is disabled for this split).
dev_data = cleanData(dev_df, 'dev', clean=True, clearData=False)
print("Cleaned Dev dataset size = {}".format(dev_data.shape))

Cleaned Training dataset size = (7462, 3)
Cleaned Dev dataset size = (2000, 4)


In [5]:
# Run the shared `processing` step on the training split; the recorded output
# shows it adds the 'Lemmatization' and 'sentiment' columns.
training_data = processing(training_data)
print("Processed Training dataset size = {}".format(training_data.shape))
print(training_data.columns)

# Same preprocessing step for the dev split.
dev_data = processing(dev_data)
print("Processed dev dataset size = {}".format(dev_data.shape))
print(dev_data.columns)

Processed Training dataset size = (7462, 5)
Index(['text', 'category', 'stance', 'Lemmatization', 'sentiment'], dtype='object')
Processed dev dataset size = (2000, 6)
Index(['id', 'text', 'stance', 'predictions', 'Lemmatization', 'sentiment'], dtype='object')


In [6]:
# Persist both processed splits as CSV files (row index omitted).
for processed_frame, csv_path in (
    (training_data, 'out/training_data_processed.csv'),
    (dev_data, 'out/dev_data_processed.csv'),
):
    processed_frame.to_csv(csv_path, index=False)

# Feature Engineering

In [7]:
## Feature models: fitted on the training lemmas only (the dev lemmas are
## deliberately left out of the fit).
w2v_model, bow_model, tfidf_model = get_feature_models(
    training_data['Lemmatization'].tolist()
)

## Training features: one vector per tweet. The three trailing 1s are passed
## positionally to get_features -- presumably on/off switches for the
## word2vec / BoW / TF-IDF parts; TODO confirm against feature_extractor.
training_data['features'] = get_features(
    w2v_model, bow_model, tfidf_model, training_data['Lemmatization'], 1, 1, 1
)
trainingFeatures = training_data['features'].to_numpy()

## Dev features: built with the models fitted on the training data above.
dev_data['features'] = get_features(
    w2v_model, bow_model, tfidf_model, dev_data['Lemmatization'], 1, 1, 1
)
devFeatures = dev_data['features'].to_numpy()


In [8]:
# Number of dev tweets, then the length of the feature vector stored under label 0.
print(len(dev_data['features']), len(dev_data['features'][0]), sep='\n')

2000
18272


In [9]:
# XLM embeddings
# features = np.load('out/train_embeddings.npy')
# XLM embeddings
# dev_data['features'] = np.load('out/test_embeddings.npy').tolist()

# Balancing the training dataset

In [10]:
# Stack the per-tweet feature vectors into one 2-D array (rows = tweets).
trainingFeatures = np.array([np.array(xi) for xi in trainingFeatures])
stances = training_data['stance'].to_numpy()

# Build the frame once, with default integer column labels. The previous version
# first built a frame with "f1..fN" column names and immediately overwrote it,
# which only wasted time and memory on a very wide matrix.
df = pd.DataFrame(trainingFeatures)
df['stance'] = stances

# balance_data returns the resampled features and labels.
X_train_balanced, y_train_balanced = balance_data(df)
print("Some notes about dimensions of the data")
# NOTE: the label below says "cleaning", but this is the shape before balancing.
print(f"X_train size before cleaning = {trainingFeatures.shape}")
print(f"X_train size = {X_train_balanced.shape}")
print(f"y_train size = {y_train_balanced.shape}")

Before balancing:
Class=2, n=5943 (79.644%)
Class=1, n=1063 (14.246%)
Class=0, n=456 (6.111%)
After balancing:
Class=2, n=5943 (33.333%)
Class=1, n=5943 (33.333%)
Class=0, n=5943 (33.333%)
Some notes about dimensions of the data
X_train size before cleaning = (7462, 18272)
X_train size = (17829, 18272)
y_train size = (17829,)


In [11]:
# 'y'  -> hold out half of the balanced training data as the test set.
# else -> train on all balanced data and test on the dev split.
split = 'n'
if split != 'y':
    X_train, y_train = X_train_balanced, y_train_balanced
    # Stack the per-tweet dev feature vectors into one 2-D array.
    X_test = np.array([np.array(vec) for vec in dev_data['features'].to_numpy()])
    y_test = dev_data['stance'].to_numpy()
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X_train_balanced,
        y_train_balanced,
        test_size=0.5,
        random_state=42,
    )

print("X_test size = {}".format(X_test.shape))
print("y_test size = {}".format(y_test.shape))
# Label sets of the test split, then of the training split.
print("Unique values in the dataset = {}".format(np.unique(y_test)))
print("Unique values in the dataset = {}".format(np.unique(y_train)))

X_test size = (2000, 18272)
y_test size = (2000,)
Unique values in the dataset = [-1  0  1]
Unique values in the dataset = [-1  0  1]


# Model Building
**Build a multi-class classifier to predict the stance of the tweet**

In [12]:
# Row count and feature-vector length for the dev split, then the training split.
print(len(dev_data['features']), len(dev_data['features'][0]), sep='\n')
print(len(training_data['features']), len(training_data['features'][0]), sep='\n')

2000
18272
7462
18272


In [18]:
# Random forest baseline: 500 trees, depth capped at 15, fixed seed, 6 workers.
clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=15,
    random_state=1,
    n_jobs=6,
)
model, report = modelPipeline(X_train, y_train, X_test, y_test, clf, f"{MODELS_PATH}clf.model")
report

              precision    recall  f1-score   support

          -1       0.56      0.28      0.37       619
           0       0.25      0.19      0.22       697
           1       0.43      0.74      0.55       684

    accuracy                           0.41      2000
   macro avg       0.42      0.40      0.38      2000
weighted avg       0.41      0.41      0.38      2000



0.4055

In [None]:
# XGBoost with default hyper-parameters.
xgb = XGBClassifier()

# Shift the labels up by one for this model only -- presumably because
# XGBClassifier wants class ids starting at 0 while the stance labels start
# at -1; TODO confirm.
_y_train, _y_test = y_train + 1, y_test + 1

model, report = modelPipeline(X_train, _y_train, X_test, _y_test, xgb, f"{MODELS_PATH}xgb.model")
report

              precision    recall  f1-score   support

           0       0.79      0.12      0.22       619
           1       0.23      0.10      0.14       697
           2       0.39      0.92      0.55       684

    accuracy                           0.39      2000
   macro avg       0.47      0.38      0.30      2000
weighted avg       0.46      0.39      0.30      2000



0.388

In [None]:
# Gaussian Naive Bayes with var_smoothing set to 10.
gnb = GaussianNB(var_smoothing=10)
model, report = modelPipeline(X_train, y_train, X_test, y_test, gnb, f"{MODELS_PATH}gnb.model")
report

              precision    recall  f1-score   support

          -1       0.31      0.71      0.43       619
           0       0.19      0.09      0.12       697
           1       0.36      0.12      0.19       684

    accuracy                           0.29      2000
   macro avg       0.29      0.31      0.25      2000
weighted avg       0.29      0.29      0.24      2000



0.2935

In [None]:
# SVM
# The estimator is bound to `svc`, not `svm`: assigning to `svm` would overwrite
# the imported `sklearn.svm` module, so running this cell a second time would
# fail with AttributeError (an SVC instance has no attribute `SVC`).
svc = svm.SVC()
model, report = modelPipeline(X_train, y_train, X_test, y_test, svc, MODELS_PATH + 'svm.model')
report

# Auto ML
Check out the [Auto SKlearn](https://automl.github.io/auto-sklearn/master/index.html)

In [None]:
import autosklearn.classification
import sklearn.datasets
import sklearn.metrics
from sklearn.utils.multiclass import type_of_target

In [None]:
# Auto-sklearn search: meta-learning warm start disabled, and SMAC limited to a
# single target-algorithm run via runcount_limit.
automl = autosklearn.classification.AutoSklearnClassifier(
    n_jobs=8,
    ensemble_size=10,
    initial_configurations_via_metalearning=0,
    smac_scenario_args={"runcount_limit": 1},
    per_run_time_limit=1300,       # time per model in seconds
    time_left_for_this_task=1900,  # overall time in seconds
)

# Fit on the balanced training data, then predict the held-out split.
automl.fit(X_train, y_train)
y_pred = automl.predict(X_test)



In [None]:
# Summary statistics of the search, followed by the model leaderboard.
search_summary = automl.sprint_statistics()
print(search_summary)
model_leaderboard = automl.leaderboard()
print(model_leaderboard)

auto-sklearn results:
  Dataset name: b3f45e36-8548-11ed-916f-4fa44fc3bf46
  Metric: accuracy
  Best validation score: 0.917335
  Number of target algorithm runs: 1
  Number of successful target algorithm runs: 1
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 0
  Number of target algorithms that exceeded the memory limit: 0

          rank  ensemble_weight           type      cost   duration
model_id                                                           
2            1              1.0  random_forest  0.082665  66.039781


In [None]:
# Macro-averaged F1 of the final ensemble on the test split.
macro_f1 = sklearn.metrics.f1_score(y_test, y_pred, average='macro')
print("F1 score", macro_f1)

F1 score 0.29443428870259153


In [None]:
# Accuracy of the final ensemble on the test split.
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
print("Accuracy score", test_accuracy)

Accuracy score 0.376


## LSTM Trial  
###### It's a bad one!
### The layers need modifications!


In [None]:
from tqdm import tqdm
import torch
from torch import nn

class NLPDataset(torch.utils.data.Dataset):
  """Map-style dataset pairing a feature tensor with a label tensor.

  Both `x` and `y` are converted with `torch.tensor`, so dtypes are inferred
  from the data that is passed in. Item `i` is the tuple `(x[i], y[i])`.
  """

  def __init__(self, x, y):
    # Convert once up front so that indexing later is a cheap tensor slice.
    self.x, self.y = torch.tensor(x), torch.tensor(y)

  def __len__(self):
    # Number of samples = size of the first dimension of the features.
    return self.x.size(0)

  def __getitem__(self, idx):
    features, label = self.x[idx], self.y[idx]
    return features, label

#####################################################################################################


class LSTM(nn.Module):
  """Stacked LSTM whose outputs are projected to class scores by a linear layer.

  Args:
    embedding_dim: size of each input feature vector (the LSTM `input_size`).
    hidden_size: size of the LSTM hidden state.
    num_layer: number of stacked LSTM layers.
    n_classes: number of output scores produced by the linear layer.

  The linear layer is applied to the full LSTM output, so `forward` returns
  one score vector per LSTM output step rather than one per sequence.
  """

  def __init__(self, embedding_dim=100, hidden_size=100,num_layer= 3 , n_classes=3):
    super().__init__()

    # Keep the hyper-parameters on the instance for later inspection.
    self.embedding_dim = embedding_dim
    self.hidden_size = hidden_size
    self.num_layer = num_layer
    self.n_classes = n_classes

    # batch_first=True: batched inputs are laid out as (batch, seq, feature).
    self.lstm = nn.LSTM(
        input_size=embedding_dim,
        hidden_size=hidden_size,
        num_layers=num_layer,
        batch_first=True,
    )
    self.linear = nn.Linear(hidden_size, n_classes)

  def forward(self, emmbeddings):
    # The final (hidden, cell) state is not used; only the per-step outputs are.
    step_outputs, _ = self.lstm(emmbeddings)
    return self.linear(step_outputs)

        

In [None]:
# Labels are shifted by +1 before being wrapped in datasets.
shifted_train_labels = (y_train + 1).tolist()
shifted_test_labels = (y_test + 1).tolist()
train_dataset = NLPDataset(X_train.tolist(), shifted_train_labels)
dev_dataset = NLPDataset(X_test.tolist(), shifted_test_labels)

# NOTE(review): LSTM() uses its default embedding_dim=100, while the feature
# vectors printed earlier are 18272 wide -- confirm the intended input size.
model = LSTM()
print(model)

In [None]:
def train(model, train_dataset, batch_size=500, epochs=100, learning_rate=0.01):
  """Train `model` on `train_dataset` with Adam and cross-entropy loss.

  Args:
    model: an `nn.Module` whose output has the class scores on its last axis.
    train_dataset: dataset yielding `(input, label)` pairs.
    batch_size: number of samples per optimisation step.
    epochs: number of full passes over `train_dataset`.
    learning_rate: learning rate handed to Adam.

  Returns:
    None. The model is updated in place and a progress line is printed every
    10th epoch.

  The reported loss is the mean per-label loss over the epoch and the reported
  accuracy is the fraction of correctly predicted labels.
  """
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

  # Use the GPU when one is available, otherwise stay on the CPU.
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = model.to(device)
  criterion = nn.CrossEntropyLoss().to(device)
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # Make sure layers such as dropout behave in training mode.
  model.train()

  for epoch_num in range(epochs):
    total_correct = 0
    total_loss = 0.0
    total_labels = 0

    for train_input, train_label in train_dataloader:
      train_input, train_label = train_input.to(device), train_label.to(device)

      optimizer.zero_grad()
      output = model(train_input)
      # Flatten so the criterion sees (N, n_classes) scores against (N,) labels.
      batch_loss = criterion(output.view(-1, output.shape[-1]), train_label.view(-1))
      batch_loss.backward()
      optimizer.step()

      # .item() yields a plain float: accumulating the tensor itself would keep
      # every batch's autograd graph alive for the whole epoch.
      n_labels = train_label.numel()
      # The criterion returns a per-batch mean; weight it by the batch's label
      # count so the epoch figure is a true mean even if the last batch is short.
      total_loss += batch_loss.item() * n_labels
      total_correct += (output.argmax(dim=-1) == train_label).sum().item()
      total_labels += n_labels

    # max(..., 1) guards against division by zero if no batch was produced.
    epoch_loss = total_loss / max(total_labels, 1)
    epoch_acc = total_correct / max(total_labels, 1)
    if (epoch_num + 1) % 10 == 0:
      # Built from adjacent literals so source indentation cannot leak into the message.
      print(
          f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} '
          f'| Train Accuracy: {epoch_acc}\n')

def evaluate(model, test_dataset):
  """Print a classification report for `model` on `test_dataset`.

  Args:
    model: an `nn.Module` whose output has the class scores on its last axis.
    test_dataset: dataset yielding `(input, label)` pairs. The whole dataset is
      scored as a single batch.

  Returns:
    None. The report is printed.
  """
  # One batch containing every sample.
  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=len(test_dataset))

  # Use the GPU when one is available, otherwise stay on the CPU.
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = model.to(device)

  # Score in eval mode so layers such as dropout are inactive, then restore
  # whatever mode the caller had the model in.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      test_input, test_label = next(iter(test_dataloader))
      test_input = test_input.to(device)
      test_label = test_label.to(device)
      output = model(test_input)
  finally:
    model.train(was_training)

  # classification report (computed on CPU copies of labels and predictions)
  report = classification_report(test_label.cpu(), output.argmax(dim=-1).cpu())
  print(report)





In [None]:
# Training takes a while, especially without a GPU, so it is opt-in.
# Leave the flag at False unless a GPU is available.
HAVE_A_GPU = False
if HAVE_A_GPU:
    train(
        model,
        train_dataset,
        batch_size=500,
        epochs=500,
        learning_rate=0.01,
    )

In [None]:
# Score the LSTM on the dev split. If training was skipped (HAVE_A_GPU is
# False), the model scored here still has its initial weights.
evaluate(model, test_dataset=dev_dataset)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        68
           1       0.00      0.00      0.00       121
           2       0.81      1.00      0.89       790

    accuracy                           0.81       979
   macro avg       0.27      0.33      0.30       979
weighted avg       0.65      0.81      0.72       979

