## Import

In [91]:
import os 
import tools

import tools.baseline as bt
import tools.data as data_tools
import tools.baseline as baseline_tools
import numpy as np
from sklearn.metrics import classification_report

from gensim.models import KeyedVectors
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding
from keras.utils import to_categorical
from keras.models import Model, Input
from collections import defaultdict

from keras.layers import *


In [11]:
dt_path = "./data_files/train_conll_spanglish.txt"
embed_path = "GoogleNews-vectors-negative300.bin"

data = data_tools.Data(dt_path, shuffle=True, split=0.8)

Xtrain, Ytrain, Xtest, Ytest = data.output_data()

#print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)

print(Xtrain[:5])

[['ASI', 'de', 'sencillas', '#laroca', '#diamantes', '#love', '#boda', '#tantan', '💍💎👰👰', '#lasnovias', '#soon', '#pronto', '@chiquibabyla', '…', 'http://t.co/l7PmfXgLgG'], ['INFAMOUS', 'SECOND', 'SON', '|', 'KRONNO', 'ZOMBER', '|', '¿', 'HEROE', 'O', 'VILLANO', '?', '(', 'Prod', '.', 'por', 'Sa', '..', '.', 'https://t.co/3p0B0FdM5w', 'vía', '@YouTube'], ['Apartamento', '317', '.', 'Shit', "'s", 'gonna', 'go', 'doooooown', '@lorraine_otero', '✨✨✨'], ['Slippery', 'slope', ':', 'Algo', 'así', 'como', 'bola', 'de', 'nieve', '(', 'que', 'lleva', 'a', 'algo', 'peor', ')', '.', 'Some', 'people', 'believe', 'that', 'euthanasia', 'is', 'the', 'slippery', 'slope', 'to', 'murder', '.'], ['La', 'kid', '👶🏽👶🏽👶🏽👶🏽', 'https://t.co/HbmdM3NOZw']]


In [30]:
toki = Tokenizer(oov_token = 'UNK')
toki.fit_on_texts(Xtrain)

In [43]:
Xtrain_seq = toki.texts_to_sequences(Xtrain)
Xtest_seq = toki.texts_to_sequences(Xtest)

In [32]:
print(Xtrain_seq[:5])

[[311, 3, 6438, 6439, 9937, 1122, 2246, 9938, 9939, 9940, 4840, 9941, 1243, 28, 9942], [1801, 1171, 124, 49, 107, 125, 49, 127, 3317, 81, 3318, 24, 51, 1425, 2, 32, 6440, 18, 2, 6441, 151, 93], [6442, 6443, 2, 205, 90, 499, 162, 9943, 2247, 9944], [3911, 6444, 4, 117, 173, 45, 6445, 3, 4841, 51, 8, 1327, 6, 117, 854, 69, 2, 363, 628, 1538, 65, 9945, 53, 26, 3911, 6444, 31, 4842, 2], [13, 1996, 9946, 9947]]


In [47]:
word2index = toki.word_index
word2index['PAD'] = 0

1


In [48]:
index2word = toki.index_word
index2word[0] = 'PAD'
print(index2word[1])

UNK


In [41]:
#get max length of words

lens = [len(word) for word in word2index.keys()]
max_len = max(lens)

print(max_len)

69


In [45]:
#pre-pad the tweets with value 0
Xtrain_pad = pad_sequences(Xtrain_seq, maxlen = max_len)
Xtest_pad = pad_sequences(Xtest_seq, maxlen = max_len)

In [54]:
Ytrain[:5]

['positive', 'neutral', 'positive', 'negative', 'positive']

In [56]:
label_dict = {
    'neutral':0,
    'positive':1,
    'negative':2
    }

In [57]:
Ytrain = [label_dict[label] for label in Ytrain]
Ytest = [label_dict[label] for label in Ytest] 

In [58]:
Ytrain_cat = np.asarray([to_categorical(label, num_classes = 3) for label in Ytrain])
Ytest_cat = np.asarray([to_categorical(label, num_classes = 3) for label in Ytest])

In [60]:
Ytrain_cat[:5]

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]], dtype=float32)

In [62]:
print('Shape of data tensor:', Xtrain_pad.shape)
print('Shape of label tensor:', Ytrain_cat.shape)

Shape of data tensor: (12000, 69)
Shape of label tensor: (12000, 3)


### Prepare the Embedding Layer

In [29]:
embeddings = KeyedVectors.load_word2vec_format(embed_path, binary=True)

In [64]:
#make an index2embedding dict

index2emb = dict()

for i, w in index2word.items():
    try:
        embed = embeddings[w]
    except KeyError:
        embed = embeddings['UNK']
    index2emb[i] = embed


In [65]:
#compute embedding matrix

embedding_matrix = np.zeros((len(word2index) + 1, 300))
for word, i in word2index.items():
    embedding_vector = index2emb[i]
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [66]:
#load embedding matrix into embedding layer


embedding_layer = Embedding(len(word2index) + 1,
                            300,
                            weights=[embedding_matrix],
                            input_length=max_len,
                            trainable=False)

### Building the classifier - FFNN

In [95]:
epochs = 100
lr = 0.05
batch = 512
activation = 'softmax'
loss_function = 'categorical_crossentropy'

In [84]:
sequence_input = Input(shape=(max_len,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

#flat = Flatten()(embedded_sequences)

output_1 = LSTM(64, activation='relu')(embedded_sequences)
#output_2 = Dense(64, activation='relu')(output_1)
predictions = Dense(3, activation='softmax')(output_1)

model = Model(inputs=sequence_input, outputs=predictions)

model.compile(loss=loss_function, optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 69)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 69, 300)           8363700   
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dense_13 (Dense)             (None, 3)                 195       
Total params: 8,457,335
Trainable params: 93,635
Non-trainable params: 8,363,700
_________________________________________________________________


In [96]:
model.fit(Xtrain_pad, Ytrain_cat, batch_size = batch, epochs=epochs, verbose = 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100


Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x7fcc8c29e780>

In [86]:
predictions = model.predict(Xtest_pad)

In [92]:
pred = np.argmax(predictions, axis=1)
Ytest_converted = np.argmax(Ytest_cat, axis=1)

print(classification_report(Ytest_converted, pred))

              precision    recall  f1-score   support

           0       0.40      0.28      0.33      1009
           1       0.55      0.74      0.63      1489
           2       0.42      0.23      0.29       502

    accuracy                           0.50      3000
   macro avg       0.45      0.42      0.42      3000
weighted avg       0.48      0.50      0.47      3000

