In [39]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from keras_preprocessing.sequence import pad_sequences

df = pd.read_pickle('final_df.pkl')
X= df['tokenized']
y= df['sentiment']

##0.2 so that training data is 80% and test data 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69, stratify = y)

## Changing it to a list of lists as input to Word2Vec
corpus = list(X_train)

##Defining the size and running w2v
size = 300
w2v_model = Word2Vec(sentences=corpus, vector_size = size)
word_vectors = w2v_model.wv

##Tokenizing
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(corpus)
dic_vocabulary = tokenizer.word_index

maxlen = 100
##Padding the sequences
train_token_seq = tokenizer.texts_to_sequences(corpus)
X_train_pad = pad_sequences(train_token_seq, maxlen=maxlen, padding="post", truncating="post")

## start the matrix (length of vocabulary x vector size) with all 0s
embeddings = np.zeros((len(dic_vocabulary)+1, size))
for word,idx in dic_vocabulary.items():
    ## update the row with vector
    try:
        embeddings[idx] =  word_vectors[word]
    ## if word not in model then skip and the row stays all 0s
    except:
        pass

##Trying to see if it works
word = "nice"
print("dic[word]:", dic_vocabulary[word], "|idx")
print("embeddings[idx]:", embeddings[dic_vocabulary[word]].shape,
      "|vector")

KeyboardInterrupt: 

In [None]:
import keras.backend as K

def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [45]:
##Making the framework for the neural network
import keras
from keras import Sequential
from keras.layers import Embedding, LSTM, Dropout, Dense

model = Sequential()
model.add(Embedding(input_dim=embeddings.shape[0], output_dim=embeddings.shape[1],
                    weights=[embeddings], input_length=X_train_pad.shape[1], trainable=False, input_shape=(maxlen,)))
model.add(LSTM(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=[get_f1])
model.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 100, 300)          53060100  
                                                                 
 lstm_16 (LSTM)              (None, 16)                20288     
                                                                 
 dense_14 (Dense)            (None, 1)                 17        
                                                                 
Total params: 53,080,405
Trainable params: 20,305
Non-trainable params: 53,060,100
_________________________________________________________________


In [46]:
##Defining class weight as positive class is twice as big as the negative class
class_weight = {0: 2.,
                1: 1.}
lstm_model = model.fit(X_train_pad, y_train, batch_size=64, epochs=5, class_weight=class_weight)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


import matplotlib.pyplot as plt

plt.plot(lstm_model.history['get_f1'])
plt.plot(lstm_model.history['val_get_f1'])
plt.title('model F1 score')
plt.ylabel('F1 score')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(lstm_model.history['loss'])
plt.plot(lstm_model.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [47]:
##Preprocessin the test set
corpus2 = list(X_test)
tokenizer.fit_on_texts(corpus)
X_test_seq = tokenizer.texts_to_sequences(corpus2)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding="post", truncating="post")

In [48]:
predictions = model.predict(X_test_pad)
lstm_predictions = list(map(lambda x: 0 if x<0.5 else 1, predictions))



In [49]:
from sklearn.metrics import f1_score
f1_score(y_test, lstm_predictions)

0.9205916805143153

In [50]:
from sklearn.metrics import precision_score, recall_score
precision_score(y_test, lstm_predictions), recall_score(y_test, lstm_predictions)

(0.9577169415632262, 0.886237281023812)

In [51]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, lstm_predictions)

array([[24560,  2238],
       [ 6507, 50691]], dtype=int64)