In [1]:
import pandas as pd
import numpy as np
import json
import os
import sys
import pickle
from tqdm.notebook import tqdm as tqdm_n
from nltk.tokenize import word_tokenize 
from sklearn.model_selection import train_test_split
import gc
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import accuracy_score
from tensorflow.python.keras import backend as K
from loop import TrainingLoop

In [2]:
data_path = 'data'

# Load the four pre-computed arrays in one pass (train/test vectors, then train/test labels).
# NOTE: allow_pickle=True lets np.load unpickle object arrays, which can run
# arbitrary code -- only point this at trusted files.
train_data, test_data, train_labels, test_labels = (
    np.load(os.path.join(data_path, file_name), allow_pickle=True)
    for file_name in (
        'twitter_train_vectors.npy',
        'twitter_test_vectors.npy',
        'twitter_train_labels.npy',
        'twitter_test_labels.npy',
    )
)

In [3]:
def average_on_window(data, label, size):
    """Compress every sample in ``data`` to exactly ``size`` averaged windows.

    Each sample is cut along its first axis into ``size`` consecutive windows
    of ``len(sample) // size`` rows. The final window additionally absorbs all
    leftover rows, so no row of a kept sample is discarded. Every window is
    replaced by its mean over axis 0.

    Samples with fewer than ``size`` rows cannot fill every window and are
    dropped together with their label.

    Args:
        data: Indexable collection of samples; each sample must support
            ``len``, slicing and ``.mean(axis=0)`` (e.g. a 2-D NumPy array).
        label: Indexable collection aligned with ``data``.
        size: Number of windows to produce per sample; must be >= 1.

    Returns:
        Tuple ``(new_data, new_labels)`` of NumPy arrays holding the windowed
        samples that were kept and their labels.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be at least 1")

    new_data = []
    new_labels = []
    for i in tqdm_n(range(len(data))):
        sample = data[i]
        if len(sample) < size:
            # Not enough rows to give every window at least one element.
            continue

        count = len(sample) // size
        # The first size-1 windows each hold exactly `count` rows.
        new_sample = [
            np.array(sample[j * count:(j + 1) * count].mean(axis=0))
            for j in range(size - 1)
        ]
        # The last window runs to the end of the sample so the remainder rows
        # (len(sample) % size of them) are averaged in rather than lost.
        new_sample.append(np.array(sample[(size - 1) * count:].mean(axis=0)))

        new_data.append(np.array(new_sample))
        new_labels.append(label[i])
    return np.array(new_data), np.array(new_labels)

In [4]:
# Collapse every sample to two averaged windows; samples too short for that
# are dropped together with their labels by average_on_window.
X_train, y_train = average_on_window(data=train_data, label=train_labels, size=2)
X_test, y_test = average_on_window(data=test_data, label=test_labels, size=2)

HBox(children=(FloatProgress(value=0.0, max=119999.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))




In [5]:
# Display the dimensions of the windowed test array as a quick sanity check.
X_test.shape

(39725, 2, 250)

In [6]:
# Seed TensorFlow and NumPy before any layer is created.
tf.random.set_seed(42)
np.random.seed(42)

# Three stacked LSTM layers (5 -> 200 -> 100 units), each followed by dropout,
# then a dense head ending in a single sigmoid unit for binary classification.
model = Sequential([
    LSTM(5, return_sequences=True, activation='selu'),
    Dropout(0.2),
    LSTM(200, return_sequences=True, activation='selu'),
    Dropout(0.2),
    LSTM(100, return_sequences=False, activation='selu'),
    Dropout(0.2),
    Dense(100, activation='selu'),
    Dense(50, activation='selu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)



In [7]:
# Training hyperparameters.
batch_size = 1024
epochs = 50

# Loss and optimizer objects handed to the custom training loop.
loss_function = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam()

In [8]:
# Independent accuracy trackers for the training and validation passes.
train_metrics, val_metrics = (
    tf.keras.metrics.BinaryAccuracy() for _ in range(2)
)

In [9]:
@tf.function
def calc_loss(x_train, y_train, model, loss_function):
    """Return the loss of ``model`` on one batch using an inference-mode pass.

    Args:
        x_train: Batch of inputs fed to ``model``.
        y_train: Targets passed to ``loss_function`` as the first argument.
        model: Callable model, invoked with ``training=False``.
        loss_function: Callable taking ``(targets, predictions)`` and
            returning the loss value.

    Returns:
        The value produced by ``loss_function`` for this batch.
    """
    # No gradient is ever taken from this value, so the forward pass is not
    # recorded on a GradientTape.
    predictions = model(x_train, training=False)
    return loss_function(y_train, predictions)


# Number of consecutive batches scanned by windowed_batch_selector.
length = 10


def windowed_batch_selector(data, idx, model, loss_function):
    """Pick the batch with the largest loss among the next ``length`` batches.

    Starting at ``idx``, the batches ``idx .. idx + length - 1`` are scored
    with ``calc_loss`` and the index of the one with the largest loss is
    returned (the earliest such index on ties). When fewer than ``length + 1``
    batches remain from ``idx`` on, ``idx`` itself is returned unscored.

    Args:
        data: Indexable collection of batches; ``data[i][0]`` is passed to
            ``calc_loss`` as the inputs and ``data[i][1]`` as the targets.
        idx: Position of the batch the caller is about to process.
        model: Model forwarded to ``calc_loss``.
        loss_function: Loss callable forwarded to ``calc_loss``.

    Returns:
        Index into ``data`` of the selected batch.
    """
    # Too close to the end for a full window: hand back the requested batch
    # directly instead of running a forward pass whose result is never used.
    # NOTE(review): the strict bound also skips the window that would end
    # exactly on the last batch -- confirm whether that is intended.
    if idx >= len(data) - length:
        return idx

    largest_loss = 0
    largest_loss_idx = idx
    for i in range(idx, idx + length):
        x_batch_train = data[i][0]
        y_batch_train = data[i][1]
        loss = calc_loss(x_batch_train, y_batch_train, model, loss_function)
        if loss > largest_loss:
            largest_loss = loss
            largest_loss_idx = i
    return largest_loss_idx


# Per-epoch ranking of [batch_index, loss] pairs, largest loss first.
losses = []


def sorting_batch_selector(data, idx, model, loss_function):
    """Return the batch index holding rank ``idx`` when sorted by loss.

    On the call with ``idx == 0`` every batch in ``data`` is scored with
    ``calc_loss`` and the module-level ``losses`` ranking is rebuilt from
    scratch, ordered by descending loss. Later calls read position ``idx`` of
    that ranking. After the last position (``idx == len(data) - 1``) has been
    served, the ranking is emptied.

    Args:
        data: Indexable collection of batches; ``data[i][0]`` is passed to
            ``calc_loss`` as the inputs and ``data[i][1]`` as the targets.
        idx: Rank to look up; a call with ``0`` must precede higher values.
        model: Model forwarded to ``calc_loss``.
        loss_function: Loss callable forwarded to ``calc_loss``.

    Returns:
        Index into ``data`` of the batch at rank ``idx``.

    Raises:
        IndexError: If ``idx`` lies outside the current ranking, e.g. when no
            call with ``idx == 0`` has built it yet.
    """
    if idx == 0:
        scored = [
            [i, float(calc_loss(data[i][0], data[i][1], model, loss_function))]
            for i in range(len(data))
        ]
        # Replace the contents instead of appending: leftovers from an epoch
        # that never reached its last index would otherwise stay in the
        # ranking and cause batches to be repeated or skipped.
        losses[:] = sorted(scored, key=lambda pair: pair[1], reverse=True)

    return_idx = losses[idx][0]
    if idx == len(data) - 1:
        losses.clear()

    return return_idx

In [10]:
# CSV file path passed to TrainingLoop as its log_file argument.
log_path = 'logs/sorting/sentiment.csv'

In [11]:
# Keyword options for the custom loop: hold out 10% for validation and let
# sorting_batch_selector decide which batch is served at each step.
loop_options = dict(
    validation_split=0.1,
    batch_size=batch_size,
    batch_selection=sorting_batch_selector,
    log_file=log_path,
)

training = TrainingLoop(
    model, X_train, y_train,
    loss_function, optimizer,
    train_metrics, val_metrics,
    **loop_options
)
training.train(epochs)

Epoch 1/50	Loss: 0.4905	Metrics: 0.6984: 	Validation metrics: 0.7514: 	100% | 104/104 [00:02<00:00, 35.36it/s]
Epoch 2/50	Loss: 0.4572	Metrics: 0.7541: 	Validation metrics: 0.7638: 	100% | 104/104 [00:00<00:00, 123.04it/s]
Epoch 3/50	Loss: 0.4504	Metrics: 0.7658: 	Validation metrics: 0.7721: 	100% | 104/104 [00:00<00:00, 125.08it/s]
Epoch 4/50	Loss: 0.4446	Metrics: 0.7702: 	Validation metrics: 0.7743: 	100% | 104/104 [00:00<00:00, 126.08it/s]
Epoch 5/50	Loss: 0.4335	Metrics: 0.7732: 	Validation metrics: 0.7739: 	100% | 104/104 [00:00<00:00, 118.51it/s]
Epoch 6/50	Loss: 0.4292	Metrics: 0.7760: 	Validation metrics: 0.7742: 	100% | 104/104 [00:00<00:00, 120.78it/s]
Epoch 7/50	Loss: 0.4290	Metrics: 0.7782: 	Validation metrics: 0.7761: 	100% | 104/104 [00:00<00:00, 125.76it/s]
Epoch 8/50	Loss: 0.4264	Metrics: 0.7794: 	Validation metrics: 0.7747: 	100% | 104/104 [00:00<00:00, 122.60it/s]
Epoch 9/50	Loss: 0.4291	Metrics: 0.7813: 	Validation metrics: 0.7772: 	100% | 104/104 [00:00<00:00, 118.6

In [12]:
# Score the model on the windowed test set; with the compile() settings used
# for this model the result is [binary cross-entropy loss, accuracy].
model.evaluate(X_test, y_test, batch_size=batch_size)



[0.456454873085022, 0.7865575551986694]