In [1]:
import pandas as pd
import numpy as np
import json
import os
import sys
import pickle
from tqdm.notebook import tqdm as tqdm_n
from nltk.tokenize import word_tokenize 
from sklearn.model_selection import train_test_split
import gc
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import accuracy_score
from tensorflow.python.keras import backend as K

In [2]:
# Directory (relative to the notebook) that holds the pre-computed .npy arrays.
data_path = "data"

In [3]:
# Load the four pre-computed arrays in one pass, in the order they are unpacked.
# NOTE(review): allow_pickle=True unpickles arbitrary objects, so these files
# must come from a trusted source.
train_data, test_data, train_labels, test_labels = (
    np.load(os.path.join(data_path, file_name), allow_pickle=True)
    for file_name in (
        'twitter_train_vectors.npy',
        'twitter_test_vectors.npy',
        'twitter_train_labels.npy',
        'twitter_test_labels.npy',
    )
)

In [4]:
def average_on_window(data, label, size):
    """Reduce every sample to exactly ``size`` rows by averaging consecutive windows.

    Each sample is cut into ``size`` contiguous windows along its first axis.
    The first ``size - 1`` windows each hold ``len(sample) // size`` rows; the
    last window additionally absorbs the ``len(sample) % size`` leftover rows,
    so every row of the sample contributes to exactly one window mean.
    Samples with fewer than ``size`` rows are dropped together with their label.

    Parameters
    ----------
    data : sequence of array-like
        Samples indexable by position. Each sample must support slicing and
        ``.mean(axis=0)`` (presumably a 2-D ``(tokens, embedding_dim)`` NumPy
        array -- TODO confirm against the saved vectors).
    label : sequence
        Labels aligned with ``data`` (``label[i]`` belongs to ``data[i]``).
    size : int
        Number of windows (output rows) per sample; must be >= 1.

    Returns
    -------
    tuple of numpy.ndarray
        ``(new_data, new_labels)`` where ``new_data[k]`` has ``size`` rows and
        ``new_labels[k]`` is the label of the sample it was built from.

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive integer")

    new_data = []
    new_labels = []
    for i in tqdm_n(range(len(data))):
        sample = data[i]
        # Too short to fill `size` non-empty windows: skip sample and label.
        if len(sample) < size:
            continue

        count = len(sample) // size  # rows per regular window
        new_sample = [
            np.array(sample[j * count:(j + 1) * count].mean(axis=0))
            for j in range(size - 1)
        ]
        # The final window runs to the end of the sample so the remainder rows
        # (len(sample) % size) are included. The previous code used the
        # quotient as the remainder and could silently drop trailing rows.
        new_sample.append(np.array(sample[(size - 1) * count:].mean(axis=0)))

        new_data.append(np.array(new_sample))
        new_labels.append(label[i])
    return np.array(new_data), np.array(new_labels)

In [5]:
# Collapse each training sample into 2 averaged windows; samples with fewer
# than 2 rows are dropped along with their labels.
X_train, y_train = average_on_window(data=train_data, label=train_labels, size=2)

HBox(children=(FloatProgress(value=0.0, max=119999.0), HTML(value='')))




In [6]:
# Apply the same 2-window averaging to the test split.
X_test, y_test = average_on_window(data=test_data, label=test_labels, size=2)

HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))




In [7]:
# Fix the NumPy and TensorFlow seeds before any layer is created.
np.random.seed(42)
tf.random.set_seed(42)

# Three stacked LSTM layers (each followed by 20% dropout) feeding a small
# dense head that ends in a single sigmoid unit for binary classification.
model = Sequential([
    LSTM(5, return_sequences=True, activation='selu'),
    Dropout(0.2),
    LSTM(200, return_sequences=True, activation='selu'),
    Dropout(0.2),
    # Last recurrent layer returns only its final output, not the full sequence.
    LSTM(100, return_sequences=False, activation='selu'),
    Dropout(0.2),
    Dense(100, activation='selu'),
    Dense(50, activation='selu'),
    Dense(1, activation='sigmoid'),
])

# Compiled so that model.evaluate() can report loss and accuracy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)



In [8]:
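# Built-in Keras training call, kept for reference only; the cells below train with the custom TrainingLoop instead.
# NOTE: `callbacks` is not defined in any visible cell, so this line would fail if uncommented as-is.
#model.fit(X_train, y_train, epochs=50, validation_split=0.1, batch_size=1024, callbacks=callbacks)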

In [9]:
from loop import TrainingLoop

In [10]:
# Hyper-parameters for the custom training loop.
batch_size, epochs = 1024, 50

# Binary cross-entropy objective, optimised with Adam at its default settings.
loss_function = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam()


In [11]:
# Two independent accuracy trackers: one for training, one for validation.
train_metrics, val_metrics = (
    tf.keras.metrics.BinaryAccuracy() for _ in range(2)
)

In [12]:
# Wire the model, data, loss, optimizer and metric trackers into the custom
# loop; positional order is kept exactly as TrainingLoop receives it.
training = TrainingLoop(
    model,
    X_train,
    y_train,
    loss_function,
    optimizer,
    train_metrics,
    val_metrics,
    validation_split=0.1,
    batch_size=batch_size,
)

In [13]:
# Run the custom training loop; `epochs` is the epoch count (the log below shows "Epoch k/50").
training.train(epochs)

Epoch 1/50	Loss: 0.5443	Metrics: 0.6929: 	100% | 94/94 [00:02<00:00, 43.38it/s]
Epoch 2/50	Loss: 0.5131	Metrics: 0.7543: 	100% | 94/94 [00:00<00:00, 190.58it/s]
Epoch 3/50	Loss: 0.4963	Metrics: 0.7658: 	100% | 94/94 [00:00<00:00, 182.17it/s]
Epoch 4/50	Loss: 0.4976	Metrics: 0.7696: 	100% | 94/94 [00:00<00:00, 185.43it/s]
Epoch 5/50	Loss: 0.4915	Metrics: 0.7735: 	100% | 94/94 [00:00<00:00, 189.05it/s]
Epoch 6/50	Loss: 0.4872	Metrics: 0.7762: 	100% | 94/94 [00:00<00:00, 191.88it/s]
Epoch 7/50	Loss: 0.4862	Metrics: 0.7765: 	100% | 94/94 [00:00<00:00, 186.93it/s]
Epoch 8/50	Loss: 0.4832	Metrics: 0.7780: 	100% | 94/94 [00:00<00:00, 191.28it/s]
Epoch 9/50	Loss: 0.4813	Metrics: 0.7796: 	100% | 94/94 [00:00<00:00, 187.42it/s]
Epoch 10/50	Loss: 0.4772	Metrics: 0.7802: 	100% | 94/94 [00:00<00:00, 182.00it/s]
Epoch 11/50	Loss: 0.4736	Metrics: 0.7816: 	100% | 94/94 [00:00<00:00, 190.42it/s]
Epoch 12/50	Loss: 0.4705	Metrics: 0.7820: 	100% | 94/94 [00:00<00:00, 185.25it/s]
Epoch 13/50	Loss: 0.4713	M

In [14]:
# Score the trained model on the held-out test windows; the result is
# [loss, accuracy] as configured in model.compile().
model.evaluate(x=X_test, y=y_test, batch_size=batch_size)



[0.45297864079475403, 0.7865575551986694]

Reported results on this dataset are usually around 83% accuracy. Using only a tenth of the original dataset, this LSTM network reaches about 78% test accuracy (0.7866 above). More tuning would likely improve this, but this project focuses on our batch selection algorithm rather than on the model itself.