In [0]:
import keras as k 
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense , Embedding , LSTM , GRU
from keras import Model , Input
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [54]:
file = open("glove.6B.100d.txt")
num_vocab = 0
word_to_vec = {}
for line in file.readlines():
  
  row = line.strip().split(' ')
  word = row[0]
  vec = row[1:] 
  num_vocab += 1
  word_to_vec[word] = vec
  
print("Glove preprocessed")

file.close()

Glove preprocessed


In [55]:
train = pd.read_csv("train.tsv" , delimiter = '\t')
test = pd.read_csv("test.tsv" , delimiter = '\t')
submission = pd.read_csv("../content/.kaggle/competitions/movie-review-sentiment-analysis-kernels-only/sampleSubmission.csv")

In [55]:
train

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [56]:
X_train = train['Phrase'].fillna('na').values
y_train = train['Sentiment'].values
X_test = test['Phrase'].fillna('na').values

In [57]:
y_train = to_categorical(y_train)

In [58]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(X_train) + list(X_test))

In [59]:
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [60]:
maxlen = 50

X_train = pad_sequences(X_train , maxlen)
X_test = pad_sequences(X_test , maxlen)

In [61]:
word_index = tokenizer.word_index
min_words = min(len(word_index) , num_vocab)
print(min_words)

17780


In [62]:
embedding_matrix = np.zeros((min_words , 100))

In [63]:
added_words = 0
for word , i in word_index.items():
    if i >= min_words:
        continue
    vector = word_to_vec.get(word)
    if vector:
        added_words+=1
        embedding_matrix[i] = vector

In [64]:
del word_to_vec
del tokenizer

In [65]:
X_train , X_val , y_train , y_val = train_test_split(X_train , y_train , train_size = 0.95 , random_state = 0)



In [18]:
def model(input_shape):
    input = Input(input_shape)
    X = Embedding(input_dim = min_words , output_dim = 100 , weights = [embedding_matrix] , trainable = False)(input)
    X = GRU(70 , return_sequences = False , recurrent_dropout = 0.25 , dropout = 0.25)(X)
    X = Dense(5 , activation = 'softmax')(X)
  
    return Model(inputs = input , outputs = X)

In [19]:
movie_model = model((50,))

In [20]:
movie_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 50, 100)           1528800   
_________________________________________________________________
gru_2 (GRU)                  (None, 70)                35910     
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 355       
Total params: 1,565,065
Trainable params: 36,265
Non-trainable params: 1,528,800
_________________________________________________________________


In [21]:
model_saver = ModelCheckpoint("my_model.h5" , monitor = "val_loss" , save_best_only = True)

In [22]:
movie_model.compile(loss = 'categorical_crossentropy' , optimizer = 'rmsprop' , metrics = ['accuracy'])

In [26]:
movie_model.fit(X_train, y_train, batch_size=1024, epochs=10, validation_data=(X_val, y_val),
                 callbacks=[model_saver])

Train on 148257 samples, validate on 7803 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa35904ccf8>

In [66]:
prediction = movie_model.predict(X_test , batch_size = 1024 , verbose = 1)



In [67]:
submission['Sentiment'] = prediction.argmax(axis = 1)

In [68]:
submission.to_csv("submission.csv")