# Imports


In [218]:
# For Data
import numpy as np
import pandas as pd
import re
from datetime import datetime
from tqdm.notebook import tqdm


# For models
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Input, Dropout, Embedding, SpatialDropout1D
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# For NLP
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

import pickle
from imblearn.over_sampling import SMOTE
from collections import Counter


from preprocessing import *
from plot import *
# from feature_extractor import *
from data_balance import *
from model import *

# Import the dataset

In [219]:
# Locations of the raw CSV splits, relative to the notebook's working directory.
train_file, devFile = 'Dataset/train.csv', 'Dataset/dev.csv'

# Read each split into its own DataFrame (training first, then dev).
train_df, dev_df = (pd.read_csv(csv_path) for csv_path in (train_file, devFile))

# Quick sanity check: (rows, columns) of each split.
print(f"Training dataset size = {train_df.shape}")
print(f"Dev dataset size = {dev_df.shape}")

Training dataset size = (6988, 3)
Dev dataset size = (1000, 3)


# Data Preprocessing

In [220]:
# Clean the training split with the project helper `cleanData`. The helper is
# not defined in this notebook; it comes from one of the star-imported project
# modules (likely `preprocessing` -- TODO confirm).
# NOTE(review): the meaning of the `clean` and `clearData` flags is not visible
# here; check the helper before changing them. In the recorded output the
# training frame shrinks from 6988 to 6557 rows with clearData=True.
training_data = cleanData(train_df, 'training', clean = False, clearData = True)
print(f"Cleaned Training dataset size = {training_data.shape}")
# Same helper on the dev split, but with clearData=False; the recorded output
# shows all 1000 dev rows being kept.
dev_data = cleanData(dev_df, 'dev', clean = False, clearData = False)
print(f"Cleaned Dev dataset size = {dev_data.shape}")

Cleaned Training dataset size = (6557, 3)
Cleaned Dev dataset size = (1000, 3)


In [221]:
# Apply the shared `processing` step to both splits, using the same call form
# for each. The recorded output lists 'Lemmatization' and 'sentiment' among the
# resulting columns.
training_data = processing(training_data)
print(f"Processed Training dataset size = {training_data.shape}")
print(training_data.columns)

dev_data = processing(dev_data)
print(f"Processed dev dataset size = {dev_data.shape}")
print(dev_data.columns)

Processed Training dataset size = (6557, 5)
Index(['text', 'category', 'stance', 'Lemmatization', 'sentiment'], dtype='object')
Processed dev dataset size = (1000, 5)
Index(['text', 'category', 'stance', 'Lemmatization', 'sentiment'], dtype='object')


In [222]:
# Fixed length of every padded sequence fed to the model.
max_length = 30

# Learn the word -> integer index mapping from the training text only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(training_data['text'])
# One extra slot on top of the number of known words (Keras leaves index 0
# unassigned, so it is free to act as the padding value).
vocab_size = 1 + len(tokenizer.word_index)

# Integer-encode each training document, then pad at the end to `max_length`.
encoded_docs = tokenizer.texts_to_sequences(training_data['text'])
X_train = pad_sequences(encoded_docs, padding='post', maxlen=max_length)

print(f"X_train size = {X_train.shape}")


X_train size = (6557, 30)


In [223]:
# Encode the dev text with the tokenizer fitted on the training split and pad
# it to the same fixed length as the training sequences.
encoded_docs = tokenizer.texts_to_sequences(dev_data['text'])
X_test = pad_sequences(encoded_docs, padding='post', maxlen=max_length)

# One-hot encode the dev stance labels (one column per distinct label).
dev_stance_one_hot = pd.get_dummies(dev_data['stance'])
y_test = dev_stance_one_hot.values

# Feature Engineering

In [224]:
# Pretrained embedding settings: vector width, on-disk location, and the
# (initially empty) word -> vector lookup used when building the weight matrix.
TOTAL_EMBEDDING_DIM = 100
embeddings_file = 'embeddings/full_grams_sg_100_twitter.mdl'
w2v_embeddings_index = dict()

# Load the whole saved gensim model into memory.
from gensim.models import KeyedVectors
w2v_model = KeyedVectors.load(embeddings_file)

In [225]:
# Copy every pretrained vector into a plain dict keyed by word.
# Both the vocabulary and the vectors are read through `w2v_model.wv`: indexing
# the model object directly (`w2v_model[word]`) is deprecated in gensim 3.x and
# removed in gensim 4.
word_vectors = w2v_model.wv
# gensim >= 4 exposes the vocabulary as `key_to_index`; older releases use `vocab`.
if hasattr(word_vectors, 'key_to_index'):
    known_words = word_vectors.key_to_index
else:
    known_words = word_vectors.vocab
for word in known_words:
    w2v_embeddings_index[word] = word_vectors[word]
print('Loaded %s word vectors.'% len(w2v_embeddings_index))

Loaded 1476715 word vectors.


In [226]:
# Build the weight matrix for the Embedding layer: the row at a word's
# tokenizer index holds that word's pretrained vector. Rows for words that have
# no pretrained vector are left as zeros.
embedding_matrix = np.zeros((vocab_size, TOTAL_EMBEDDING_DIM))
for token, row in tokenizer.word_index.items():
    pretrained = w2v_embeddings_index.get(token)
    if pretrained is None:
        # Word not covered by the pretrained embeddings: keep the zero row.
        continue
    embedding_matrix[row] = pretrained
print("Embedding Matrix shape:", embedding_matrix.shape)

Embedding Matrix shape: (27600, 100)


# Balancing the training dataset

In [227]:
# Extract the training stance labels as a NumPy array.
# NOTE(review): despite the section title, no class balancing is applied here --
# the SMOTE oversampling code below is entirely commented out, so `X_train` and
# `y_train` keep their original (imbalanced) distribution.
y_train = training_data['stance'].to_numpy()
# y_train = LabelEncoder().fit_transform(y_train)

# # transform the dataset
# oversample = SMOTE()
# X_train, y_train = oversample.fit_resample(X_train, y_train)
# y_train = y_train - 1

# # summarize distribution
# counter = Counter(y_train)
# print("After balancing:")
# for k,v in counter.items():
#     per = v / len(y_train) * 100
#     print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

# print(f"X_train size = {X_train.shape}")
# Only the label vector's shape is reported while the resampling is disabled.
print(f"y_train size = {y_train.shape}")

y_train size = (6557,)


In [228]:
# One-hot encode the training stance labels (one column per distinct label).
y_train = pd.get_dummies(y_train).values
# The printed label previously said "y_test" although the value shown is the
# training target's shape.
print('y_train size = ', y_train.shape)

y_test size =  (6557, 3)


In [229]:
# Number of training samples in each one-hot column, printed from the last
# column (index 2) down to the first (index 0).
for column in (2, 1, 0):
    print(np.sum(y_train[:, column] == 1))

5207
954
396


In [230]:
# Report the shapes of the evaluation inputs and targets, one per line.
print(
    f"X_test size = {X_test.shape}",
    f"y_test size = {y_test.shape}",
    sep="\n",
)

X_test size = (1000, 30)
y_test size = (1000, 3)


## Model Building

In [231]:
# Training hyper-parameters.
epochs = 8
batch_size = 16

# Architecture: embedding layer initialised from the pretrained matrix ->
# spatial dropout -> single LSTM -> 3-way softmax (one unit per one-hot column).
model = tf.keras.Sequential([
    Embedding(
        vocab_size,
        TOTAL_EMBEDDING_DIM,
        input_length=X_train.shape[1],
        weights=[embedding_matrix],
    ),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training data, with a 0.1 fraction held out for validation.
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    verbose=1,
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [232]:
# Predict per-class scores for the dev set and keep the index of the highest
# score in each row. Subtracting 1 maps column indices 0..2 onto the stance
# labels -1..1 shown in the report (assumes the one-hot columns are ordered
# -1, 0, 1 -- TODO confirm if the label set ever changes).
y_pred = model.predict(X_test).argmax(axis=1) - 1

# Evaluate against the raw (not one-hot) dev stance labels.
y_test = dev_data['stance'].to_numpy()

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.56      0.20      0.29        70
           0       0.37      0.33      0.34       126
           1       0.86      0.93      0.90       804

    accuracy                           0.80      1000
   macro avg       0.60      0.48      0.51      1000
weighted avg       0.78      0.80      0.78      1000

