In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from keras_preprocessing.sequence import pad_sequences

## Load the preprocessed frame; 'tokenized' is the model input, 'sentiment' the label.
## NOTE(review): read_pickle can execute arbitrary code, so only load trusted files.
df = pd.read_pickle('final_df.pkl')
X = df['tokenized']
y = df['sentiment']

## Stratified split with a fixed seed: 80% training data, 20% test data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=69,
)

## Word2Vec takes a list of lists, so materialise the training texts as one
corpus = [tokens for tokens in X_train]

## Embedding dimensionality, then fit a skip-gram (sg=1) Word2Vec on the training texts only
size = 300
w2v_model = Word2Vec(
    sentences=corpus,
    vector_size=size,
    sg=1,
)
## Keep a handle on the trained word vectors for the lookups below
word_vectors = w2v_model.wv

##Fit a Keras tokenizer on the training texts and keep its word -> index mapping
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(corpus)
dic_vocabulary = tokenizer.word_index

##Fixed sequence length used for every padded input
maxlen = 100

##Encode the training texts as index sequences, then pad/truncate at the end to maxlen
train_token_seq = tokenizer.texts_to_sequences(corpus)
X_train_pad = pad_sequences(
    train_token_seq,
    maxlen=maxlen,
    padding="post",
    truncating="post",
)

## Start the matrix (length of vocabulary + 1 x vector size) with all 0s.
## NOTE(review): the extra row presumably covers index 0, which the tokenizer
## does not assign to any word - confirm against the Tokenizer documentation.
embeddings = np.zeros((len(dic_vocabulary) + 1, size))
for word, idx in dic_vocabulary.items():
    ## Copy the vector only for words the Word2Vec model knows; rows for unknown
    ## words stay all 0s. An explicit membership test is used instead of a bare
    ## `except: pass` so that unrelated errors (e.g. a shape mismatch) surface.
    if word in word_vectors:
        embeddings[idx] = word_vectors[word]

##Sanity check: look up one word's vocabulary index and the shape of its embedding row
word = "nice"
word_idx = dic_vocabulary[word]
print("dic[word]:", word_idx, "|idx")
print("embeddings[idx]:", embeddings[word_idx].shape, "|vector")

dic[word]: 209 |idx
embeddings[idx]: (300,) |vector


In [2]:
##Making the framework for the neural network
from keras import Sequential
from keras.layers import Embedding, GRU, Dropout, Dense

model = Sequential()
## Embedding layer initialised from the Word2Vec matrix and frozen (trainable=False)
model.add(Embedding(input_dim=embeddings.shape[0], output_dim=embeddings.shape[1],
                    weights=[embeddings], input_length=X_train_pad.shape[1], trainable=False, input_shape=(maxlen,)))
## Two stacked GRU layers; the first returns the full sequence so the second can consume it
model.add(GRU(64, activation='relu', return_sequences=True))
model.add(GRU(32, activation='relu'))
## Single sigmoid unit for the binary sentiment label
model.add(Dense(1, activation='sigmoid'))
## metrics is passed as a list (the documented form); a bare string is not
## accepted by every Keras version. The tracked metric is still 'accuracy'.
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 300)          53060100  
                                                                 
 gru (GRU)                   (None, 100, 64)           70272     
                                                                 
 gru_1 (GRU)                 (None, 32)                9408      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 53,139,813
Trainable params: 79,713
Non-trainable params: 53,060,100
_________________________________________________________________


In [3]:
##Train the network for 5 epochs in mini-batches of 64.
##NOTE(review): the positive class is reportedly about twice as big as the negative
##class, but no class_weight is passed to fit() here - confirm whether that is intended.
gru_model = model.fit(
    x=X_train_pad,
    y=y_train,
    epochs=5,
    batch_size=64,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


import matplotlib.pyplot as plt

## Plot one figure per metric recorded during training.
## The keys are read from the history itself rather than hard-coded: the model
## is compiled with loss + accuracy only and fitted without validation data, so
## looking up 'get_f1' / 'val_get_f1' / 'val_loss' directly raises KeyError.
history = gru_model.history
for metric in [key for key in history if not key.startswith('val_')]:
    fig, ax = plt.subplots()
    ax.plot(history[metric], label='train')
    ## Add the validation curve only when fit() actually produced one
    val_key = 'val_' + metric
    if val_key in history:
        ax.plot(history[val_key], label='validation')
    ax.set_title('model ' + metric)
    ax.set_ylabel(metric)
    ax.set_xlabel('epoch')
    ax.legend(loc='upper left')
    plt.show()

In [4]:
##Preprocessing the test set.
##The tokenizer was already fitted on the training corpus, so it is reused as-is:
##fitting it again on the same texts is redundant and double-counts its word statistics.
corpus2 = list(X_test)
X_test_seq = tokenizer.texts_to_sequences(corpus2)
##Use the same maxlen as the training sequences so the input shape matches the model
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding="post", truncating="post")

In [5]:
## Predicted probabilities for the padded test sequences
predictions = model.predict(X_test_pad)
## Threshold at 0.5: anything below becomes class 0, everything else class 1
gru_predictions = np.where(predictions.ravel() < 0.5, 0, 1).tolist()



In [6]:
from sklearn.metrics import f1_score
## F1 score of the thresholded predictions against the test labels
f1_score(y_true=y_test, y_pred=gru_predictions)

0.9463133300327453

In [7]:
from sklearn.metrics import precision_score, recall_score
## (precision, recall) of the thresholded predictions against the test labels
(
    precision_score(y_true=y_test, y_pred=gru_predictions),
    recall_score(y_true=y_test, y_pred=gru_predictions),
)

(0.9403103585175979, 0.9523934403300814)

In [8]:
from sklearn.metrics import confusion_matrix
## Confusion matrix of the test labels versus the thresholded predictions
confusion_matrix(y_true=y_test, y_pred=gru_predictions)

array([[23340,  3458],
       [ 2723, 54475]], dtype=int64)