# Imports


In [1]:
# For Data
import numpy as np
import pandas as pd
import re
from datetime import datetime
from tqdm.notebook import tqdm

#  For Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.offline as pyo 
import plotly.graph_objects as go
import plotly.figure_factory as ff
import missingno as msno
from wordcloud import WordCloud
import random

# For models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier


# For NLP
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from spellchecker import SpellChecker

import pickle
from imblearn.over_sampling import SMOTE
from collections import Counter

# For Styling
plt.style.use('fivethirtyeight')

# Downloading periphrals
nltk.download('vader_lexicon')
nltk.download('stopwords')

from preprocessing import *
from plot import *
from feature_extractor import *
from data_balance import *
from model import *

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/robert/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /home/robert/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/robert/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /home/robert/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Import the dataset

In [2]:
train_file = 'Dataset/train.csv'
devFile = 'Dataset/dev.csv'
train_df = pd.read_csv(train_file)
dev_df = pd.read_csv(devFile)
print(f"Training dataset size = {train_df.shape}")
print(f"Dev dataset size = {dev_df.shape}")

Training dataset size = (6988, 3)
Dev dataset size = (1000, 3)


# Data Preprocessing

In [3]:
# Data cleaning
training_data = cleanData(train_df, 'training', clean = False, clearData = True)
print(f"Cleaned Training dataset size = {training_data.shape}")
# Data cleaning
dev_data = cleanData(dev_df, 'dev', clean = False, clearData = True)
print(f"Cleaned Dev dataset size = {dev_data.shape}")

Cleaned Training dataset size = (6557, 3)
Cleaned Dev dataset size = (979, 3)


In [4]:
# Some preprocessing steps, like extracting limmitization
training_data = training_data.pipe(processing)    
print(f"Processed Training dataset size = {training_data.shape}")
print(training_data.columns)
# Some preprocessing steps, like extracting limmitization
dev_data = processing(dev_data)   
print(f"Processed dev dataset size = {dev_data.shape}")
print(dev_data.columns)

Processed Training dataset size = (6557, 5)
Index(['text', 'category', 'stance', 'Lemmatization', 'sentiment'], dtype='object')
Processed dev dataset size = (979, 5)
Index(['text', 'category', 'stance', 'Lemmatization', 'sentiment'], dtype='object')


In [5]:
# Save the out
training_data.to_csv('out/training_data_processed.csv', index=False) # print the df in a csv file
# Save the out
dev_data.to_csv('out/dev_data_processed.csv', index=False) # print the df in a csv file

# Feature Engineering

In [6]:
## Word2Vec embeddings
model = extractWordEmbeddings(training_data['Lemmatization'].tolist()+dev_data['Lemmatization'].tolist()) # use word2vec to extract the word embeddings
training_data['features'] = getTweetsEmbeddings(model, training_data['Lemmatization']) # get the word embeddings for each tweet
trainingFeatures = training_data['features'].to_numpy()

## Word2Vec embeddings
#model = extractWordEmbeddings(dev_data['Lemmatization']) # use word2vec to extract the word embeddings
dev_data['features'] = getTweetsEmbeddings(model, dev_data['Lemmatization']) # get the word embeddings for each tweet
devFeatures = dev_data['features'].to_numpy()


In [7]:
# XLM embeddings
# features = np.load('out/train_embeddings.npy')
# XLM embeddings
# dev_data['features'] = np.load('out/test_embeddings.npy').tolist()

# Balancing the training dataset

In [8]:
# convert features into 2d array
trainingFeatures = np.array([np.array(xi) for xi in trainingFeatures])
stances = training_data['stance'].to_numpy()
columns = ["f" + str(i + 1) for i in range(len(trainingFeatures[0]))]

df = pd.DataFrame(trainingFeatures, columns=columns)
df = pd.DataFrame(trainingFeatures)
df['stance'] = stances

X_train_balanced, y_train_balanced = balance_data(df)
print("Some notes about dimensions of the data")
print(f"X_train size before cleaning = {trainingFeatures.shape}")
print(f"X_train size = {X_train_balanced.shape}")
print(f"y_train size = {y_train_balanced.shape}")

Before balancing:
Class=2, n=5207 (79.411%)
Class=1, n=954 (14.549%)
Class=0, n=396 (6.039%)
After balancing:
Class=2, n=5207 (33.333%)
Class=1, n=5207 (33.333%)
Class=0, n=5207 (33.333%)
Some notes about dimensions of the data
X_train size before cleaning = (6557, 100)
X_train size = (15621, 100)
y_train size = (15621,)


In [9]:
split = 'n'
if split == 'y':
    # Split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(X_train_balanced, y_train_balanced, test_size=0.5, random_state=42)
else:
    X_train, y_train = X_train_balanced, y_train_balanced
    X_test = np.array([np.array(xi) for xi in dev_data['features'].to_numpy()])
    # X_test = np.load('out/test_embeddings.npy')
    y_test = dev_data['stance'].to_numpy()

print(f"X_test size = {X_test.shape}")
print(f"y_test size = {y_test.shape}")
# print unique values in the dataset
print(f"Unique values in the dataset = {np.unique(y_test)}")
print(f"Unique values in the dataset = {np.unique(y_train)}")

X_test size = (979, 100)
y_test size = (979,)
Unique values in the dataset = [-1  0  1]
Unique values in the dataset = [-1  0  1]


# Model Building
**Build a multi-class classifier to predict the category of the tweet**

In [10]:
# RandomForestClassifier
clf = RandomForestClassifier(n_estimators=500, max_depth=15, random_state=0)
model, report = modelPipeline(X_train, y_train, X_test, y_test, clf, 'out/models/clf.model')
report

              precision    recall  f1-score   support

          -1       0.20      0.34      0.25        68
           0       0.36      0.54      0.43       121
           1       0.90      0.78      0.84       790

    accuracy                           0.72       979
   macro avg       0.49      0.55      0.51       979
weighted avg       0.79      0.72      0.75       979



0.7201225740551583

In [11]:
# Xgboost
xgb = XGBClassifier()
_y_train = y_train + 1
_y_test = y_test + 1
model, report = modelPipeline(X_train, _y_train, X_test, _y_test, xgb, 'out/models/xgb.model')
report

              precision    recall  f1-score   support

           0       0.23      0.29      0.26        68
           1       0.34      0.46      0.39       121
           2       0.90      0.82      0.86       790

    accuracy                           0.74       979
   macro avg       0.49      0.53      0.50       979
weighted avg       0.78      0.74      0.76       979



0.7425944841675178

In [12]:
# Naive Bayes
gnb = GaussianNB(var_smoothing=10)
model, report = modelPipeline(X_train, y_train, X_test, y_test, gnb, 'out/models/gnb.model')
report

              precision    recall  f1-score   support

          -1       0.08      0.66      0.15        68
           0       0.31      0.46      0.37       121
           1       0.95      0.31      0.46       790

    accuracy                           0.35       979
   macro avg       0.44      0.48      0.33       979
weighted avg       0.81      0.35      0.43       979



0.34933605720122574

In [13]:
# SVM
svm = svm.SVC()
model, report = modelPipeline(X_train, y_train, X_test, y_test, svm, 'out/models/svm.model')
report

              precision    recall  f1-score   support

          -1       0.12      0.51      0.19        68
           0       0.29      0.51      0.37       121
           1       0.95      0.57      0.71       790

    accuracy                           0.56       979
   macro avg       0.45      0.53      0.43       979
weighted avg       0.81      0.56      0.64       979



0.5597548518896833

# Auto ML
Check out the [Auto SKlearn](https://automl.github.io/auto-sklearn/master/index.html)

In [14]:
import autosklearn.classification
import sklearn.datasets
import sklearn.metrics
from sklearn.utils.multiclass import type_of_target

In [15]:
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=1900, # overall time in seconds
    per_run_time_limit=1300, # time per model in seconds
    initial_configurations_via_metalearning=0,
    ensemble_size=10,
    n_jobs=8,
    smac_scenario_args={"runcount_limit": 1},
)
automl.fit(X_train, y_train)
y_pred = automl.predict(X_test)



In [16]:
print(automl.sprint_statistics())
print(automl.leaderboard())

auto-sklearn results:
  Dataset name: 12b61a65-84c9-11ed-bd65-3fd8cc76a64f
  Metric: accuracy
  Best validation score: 0.904947
  Number of target algorithm runs: 1
  Number of successful target algorithm runs: 1
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 0
  Number of target algorithms that exceeded the memory limit: 0

          rank  ensemble_weight           type      cost   duration
model_id                                                           
2            1              1.0  random_forest  0.095053  50.582338


In [17]:
print("F1 score", sklearn.metrics.f1_score(y_test, y_pred, average='macro')) #  get the Score of the final ensemble

F1 score 0.48782095768694655


In [18]:
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_pred)) #  get the Score of the final ensemble

Accuracy score 0.7088866189989785


## LSTM Trial  
###### it's bad one!
### layers need modifications!


In [19]:
from tqdm import tqdm
import torch
from torch import nn

class NLPDataset(torch.utils.data.Dataset):

  def __init__(self, x, y):
    self.x = torch.tensor(x)
    self.y = torch.tensor(y)

  def __len__(self):
    return self.x.shape[0]

  def __getitem__(self, idx):
    return self.x[idx], self.y[idx]

#####################################################################################################


class LSTM(nn.Module):
  def __init__(self, embedding_dim=100, hidden_size=100,num_layer= 3 , n_classes=3):

    super(LSTM, self).__init__()
    self.hidden_size = hidden_size
    self.num_layer = num_layer
    self.n_classes = n_classes
    self.embedding_dim = embedding_dim

    self.lstm = nn.LSTM(input_size =embedding_dim, hidden_size = hidden_size,num_layers = num_layer ,batch_first=True)
    self.linear = nn.Linear(hidden_size, n_classes)

  def forward(self, emmbeddings):

    lstm_out, state = self.lstm(emmbeddings)
    final_output = self.linear(lstm_out)
    
    return final_output

        

In [20]:
train_dataset = NLPDataset(X_train.tolist(), (y_train+1).tolist())
dev_dataset = NLPDataset(X_test.tolist(), (y_test+1).tolist())
model = LSTM()
print(model)

LSTM(
  (lstm): LSTM(100, 100, num_layers=3, batch_first=True)
  (linear): Linear(in_features=100, out_features=3, bias=True)
)


In [24]:
def train(model, train_dataset, batch_size=500, epochs=100, learning_rate=0.01):

  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in (train_dataloader):

      train_input, train_label = train_input.to(device), train_label.to(device)

      output = model(train_input)
      
      batch_loss = criterion(output.view(-1, output.shape[-1]), train_label.view(-1))

      total_loss_train += batch_loss

      acc = (output.argmax(dim=-1) == train_label).sum().item()
      total_acc_train += acc

      optimizer.zero_grad()

      batch_loss.backward()

      optimizer.step()
      
    epoch_loss = total_loss_train / (len(train_dataset))

    epoch_acc = total_acc_train / (len(train_dataset))
    if (epoch_num+1) % 10 == 0:
      print(
          f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
          | Train Accuracy: {epoch_acc}\n')

def evaluate(model, test_dataset):

  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=len(test_dataset))

  # GPU Configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()

  total_acc_test = 0
  
  with torch.no_grad():

      test_input, test_label = next(iter(test_dataloader))

      test_input = test_input.to(device)
      test_label = test_label.to(device)

      output = model(test_input)

      # classification report
      report = (classification_report(test_label.cpu(), output.argmax(dim=-1).cpu()))
      print(report)





In [25]:
# it takes a while to train especially if you dont have a GPU 
# Advice: if you dont have a GPU, dont run this cell
train(model, train_dataset, batch_size=500, epochs=500, learning_rate=0.01)

Epochs: 10 | Train Loss: 0.0017871001036837697           | Train Accuracy: 0.5895269188912362

Epochs: 20 | Train Loss: 0.0017014176119118929           | Train Accuracy: 0.6112284744894693

Epochs: 30 | Train Loss: 0.001687617041170597           | Train Accuracy: 0.6180142116381794

Epochs: 40 | Train Loss: 0.001610406907275319           | Train Accuracy: 0.6409320786121248

Epochs: 50 | Train Loss: 0.001586757367476821           | Train Accuracy: 0.6561039626144293

Epochs: 60 | Train Loss: 0.0015397637616842985           | Train Accuracy: 0.6597528967415658

Epochs: 70 | Train Loss: 0.0015123015036806464           | Train Accuracy: 0.6660265027847129

Epochs: 80 | Train Loss: 0.0014970707707107067           | Train Accuracy: 0.6682670763715511

Epochs: 90 | Train Loss: 0.0014487055595964193           | Train Accuracy: 0.6862556814544524

Epochs: 100 | Train Loss: 0.001393694314174354           | Train Accuracy: 0.6996351065872863

Epochs: 110 | Train Loss: 0.0013372763060033321      

In [26]:
evaluate(model, dev_dataset)

              precision    recall  f1-score   support

           0       0.19      0.32      0.24        68
           1       0.26      0.40      0.32       121
           2       0.88      0.75      0.81       790

    accuracy                           0.68       979
   macro avg       0.44      0.49      0.46       979
weighted avg       0.75      0.68      0.71       979

