In [1]:
import pandas as pd
import numpy as np
import json
import os
import sys
import pickle
from tqdm.notebook import tqdm as tqdm_n
from nltk.tokenize import word_tokenize 
from sklearn.model_selection import train_test_split
import gc
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import accuracy_score
from tensorflow.python.keras import backend as K

In [2]:
# directory (relative to the working directory) that holds the .npy input files loaded below
data_path = 'data'

In [3]:
# loading the train/test vector arrays and their label arrays
# NOTE: allow_pickle=True lets np.load unpickle object arrays, and unpickling can
# execute arbitrary code -- only load .npy files that come from a trusted source.
train_data = np.load(os.path.join(data_path, 'twitter_train_vectors.npy'), allow_pickle=True)
test_data = np.load(os.path.join(data_path, 'twitter_test_vectors.npy'), allow_pickle=True)
train_labels = np.load(os.path.join(data_path, 'twitter_train_labels.npy'), allow_pickle=True)
test_labels = np.load(os.path.join(data_path, 'twitter_test_labels.npy'), allow_pickle=True)

## Using word2vec as an input

The sentences in this dataset have different lengths, which means we can't feed them directly into a neural network. As a solution, I split every sentence into a fixed number of consecutive windows and take the mean of the word vectors inside each window, so every sample ends up with the same number of vectors. Sentences that have fewer words than the requested number of windows do not meet the minimum length criterion and are dropped. The function below does exactly that.


In [4]:
def average_on_window(data, label, size):
    """Reduce every variable-length sample to exactly ``size`` averaged vectors.

    Each sample is cut into ``size`` consecutive windows. The first
    ``size - 1`` windows contain ``len(sample) // size`` rows each; the last
    window contains all remaining rows, so no trailing row is discarded.
    Every window is replaced by the mean of its rows (``mean(axis=0)``).
    Samples with fewer than ``size`` rows are dropped together with their label.

    Parameters
    ----------
    data : sequence of array-like
        Samples to process; each sample must support slicing and
        ``.mean(axis=0)`` on the slice (e.g. a 2-D ``numpy.ndarray``).
    label : sequence
        Labels aligned with ``data`` (``label[i]`` belongs to ``data[i]``).
    size : int
        Number of windows (output rows) per sample; must be at least 1.

    Returns
    -------
    tuple of numpy.ndarray
        ``(new_data, new_labels)`` holding the averaged samples that were
        kept and their corresponding labels, in the original order.

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive integer")

    new_data = []
    new_labels = []

    for i in tqdm_n(range(len(data))):
        sample = data[i]

        # samples that are too short to fill every window are skipped
        if len(sample) < size:
            continue

        # number of rows in each of the first size-1 windows
        count = len(sample) // size

        # mean of every full-width window
        new_sample = [
            np.array(sample[j * count:(j + 1) * count].mean(axis=0))
            for j in range(size - 1)
        ]

        # the last window absorbs the remainder: slice to the end of the sample
        # so that rows beyond size*count are averaged in instead of being lost
        new_sample.append(np.array(sample[(size - 1) * count:].mean(axis=0)))

        new_data.append(np.array(new_sample))
        new_labels.append(label[i])

    return np.array(new_data), np.array(new_labels)

In [5]:
# reduce every training sample to 2 window-averaged vectors; samples with fewer than 2 rows are dropped
X_train, y_train = average_on_window(train_data, train_labels, 2)

HBox(children=(FloatProgress(value=0.0, max=119999.0), HTML(value='')))




In [6]:
# same windowing as for the training set (2 windows), so train and test inputs share one shape
X_test, y_test = average_on_window(test_data, test_labels, 2)

HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))




In [7]:
# fixing the random seeds before any layer is created
tf.random.set_seed(42)
np.random.seed(42)

# stacked LSTM classifier, declared as a single layer list;
# every recurrent layer is followed by dropout as regularization
model = Sequential([
    LSTM(5, return_sequences=True, activation='selu'),
    Dropout(0.2),
    LSTM(200, return_sequences=True, activation='selu'),
    Dropout(0.2),
    # last recurrent layer: return_sequences=False, so the dense head gets one vector per sample
    LSTM(100, return_sequences=False, activation='selu'),
    Dropout(0.2),
    Dense(100, activation='selu'),
    Dense(50, activation='selu'),
    # single sigmoid unit, matching the binary cross-entropy loss
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])



In [8]:
# Disabled built-in Keras training call, kept for reference (presumably superseded by the custom TrainingLoop).
# NOTE(review): `callbacks` is not defined in the visible part of this notebook -- define it before re-enabling.
#model.fit(X_train, y_train, epochs=50, validation_split=0.1, batch_size=1024, callbacks=callbacks)

In [9]:
from loop import TrainingLoop

In [10]:
# optimizer and loss objects that are passed to TrainingLoop
optimizer = tf.keras.optimizers.Adam()
loss_function = tf.keras.losses.BinaryCrossentropy()

# large batch size to test if our algorithm will be able to handle this
batch_size = 1024
epochs = 50

# path passed to TrainingLoop as its log_file argument
log_path = 'logs/original/sentiment.csv'

In [11]:
# defining metrics: two separate BinaryAccuracy instances, one for the training
# data and one for the validation data, both handed to TrainingLoop
train_metrics = tf.keras.metrics.BinaryAccuracy()
val_metrics = tf.keras.metrics.BinaryAccuracy()

In [12]:
# building the custom training loop around the model, the windowed training data,
# the loss/optimizer pair and the two accuracy metrics
training = TrainingLoop(
    model,
    X_train,
    y_train,
    loss_function,
    optimizer,
    train_metrics,
    val_metrics,
    validation_split=0.1,
    batch_size=batch_size,
    log_file=log_path,
)

In [13]:
# training: run the custom loop with `epochs` (set to 50 above) as its argument
training.train(epochs)

Epoch 1/50	Loss: 0.5043	Metrics: 0.6983: 	Validation metrics: 0.7473: 	100% | 104/104 [00:02<00:00, 41.39it/s]
Epoch 2/50	Loss: 0.4802	Metrics: 0.7557: 	Validation metrics: 0.7632: 	100% | 104/104 [00:00<00:00, 155.87it/s]
Epoch 3/50	Loss: 0.4763	Metrics: 0.7676: 	Validation metrics: 0.7715: 	100% | 104/104 [00:00<00:00, 158.01it/s]
Epoch 4/50	Loss: 0.4678	Metrics: 0.7712: 	Validation metrics: 0.7763: 	100% | 104/104 [00:00<00:00, 154.47it/s]
Epoch 5/50	Loss: 0.4661	Metrics: 0.7743: 	Validation metrics: 0.7783: 	100% | 104/104 [00:00<00:00, 158.19it/s]
Epoch 6/50	Loss: 0.4577	Metrics: 0.7766: 	Validation metrics: 0.7800: 	100% | 104/104 [00:00<00:00, 157.13it/s]
Epoch 7/50	Loss: 0.4593	Metrics: 0.7782: 	Validation metrics: 0.7836: 	100% | 104/104 [00:00<00:00, 154.64it/s]
Epoch 8/50	Loss: 0.4565	Metrics: 0.7796: 	Validation metrics: 0.7820: 	100% | 104/104 [00:00<00:00, 158.28it/s]
Epoch 9/50	Loss: 0.4499	Metrics: 0.7807: 	Validation metrics: 0.7825: 	100% | 104/104 [00:00<00:00, 157.2

In [14]:
# quick evaluation on the test set; the displayed list is [loss, accuracy],
# i.e. the binary cross-entropy loss and the 'acc' metric configured in model.compile
model.evaluate(X_test, y_test, batch_size=batch_size)



[0.45750078558921814, 0.7874637842178345]

Reported results on this dataset are usually around 83% accuracy. I am using only a tenth of the original dataset and reached about 79% test accuracy (0.787) with an LSTM network. This could be improved with more tuning, but in this project we are focusing on our batch selection algorithm.