In [None]:
# Install the pinned dlmslib release; the model-building cells below import
# dlmslib.keras_models from it.
!pip install dlmslib==0.4

# Import Data

In [2]:
import pandas as pd
import numpy as np
import os

import gc

In [3]:
# Every dataset folder used by this notebook is resolved against DATA_ROOT.
DATA_ROOT = '../input/'

# Folder holding the competition files, and a second folder for temporary data.
ORIGINAL_DATA_FOLDER, TMP_DATA_FOLDER = (
    os.path.join(DATA_ROOT, folder_name)
    for folder_name in (
        'movie-review-sentiment-analysis-kernels-only',
        'kaggle_review_sentiment_tmp_data',
    )
)

In [4]:
# Resolve the three competition files inside ORIGINAL_DATA_FOLDER.
train_data_path, test_data_path, sub_data_path = (
    os.path.join(ORIGINAL_DATA_FOLDER, file_name)
    for file_name in ('train.tsv', 'test.tsv', 'sampleSubmission.csv')
)

# train/test are tab-separated; the sample submission is a regular
# comma-separated file, which is read_csv's default separator.
train_df = pd.read_csv(train_data_path, sep="\t")
test_df = pd.read_csv(test_data_path, sep="\t")
sub_df = pd.read_csv(sub_data_path)

# EDA

In [5]:
# Preview the first five rows of the training frame (phrases with their Sentiment label).
train_df.head(n=5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [6]:
# Preview the first five rows of the test frame (same columns, no Sentiment).
test_df.head(n=5)

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [7]:
# Preview the first five rows of the sample submission (PhraseId, Sentiment).
sub_df.head(n=5)

Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,2


# Data Preprocessing

In [8]:
from keras.preprocessing import text
from keras.preprocessing import sequence
import gensim
from sklearn import preprocessing as skp

In [9]:
# Preprocessing hyperparameters used by the tokenisation and embedding cells below.
max_len = 50          # every token sequence is padded/truncated to this length
embed_size = 300      # number of columns in the embedding matrix
max_features = 30000  # cap on the vocabulary rows of the embedding matrix

# Binary word2vec file that the embedding matrix is filled from.
pretrained_w2v_path = os.path.join(
    DATA_ROOT,
    "nlpword2vecembeddingspretrained/GoogleNews-vectors-negative300.bin",
)

### Tokenize Text

In [10]:
# Fit a single tokenizer on train + test phrases so both splits share one word index.
full_text = list(train_df['Phrase'].values) + list(test_df['Phrase'].values)

# num_words=max_features makes texts_to_sequences drop every word whose index is
# >= max_features. Without it, a vocabulary larger than max_features would emit
# indices past the last row of the embedding matrix (which is sized from
# min(max_features, vocabulary size)) and break the Embedding lookup.
# filters='' disables character filtering; lower=True lower-cases the text.
tk = text.Tokenizer(num_words=max_features, lower=True, filters='')
tk.fit_on_texts(full_text)
train_tokenized = tk.texts_to_sequences(train_df['Phrase'])
test_tokenized = tk.texts_to_sequences(test_df['Phrase'])

# Bring every phrase to exactly max_len token ids.
X_train = sequence.pad_sequences(train_tokenized, maxlen=max_len)
X_test = sequence.pad_sequences(test_tokenized, maxlen=max_len)

### Build embedding matrix

In [10]:
# Load the pretrained word2vec vectors. load_word2vec_format already returns the
# KeyedVectors object itself, so the deprecated `.wv` indirection (which triggered
# a deprecation warning when this cell ran) is not needed.
w2v = gensim.models.KeyedVectors.load_word2vec_format(pretrained_w2v_path, binary=True)

word_index = tk.word_index
nb_words = min(max_features, len(word_index))

# Row i holds the pretrained vector of the word whose tokenizer index is i.
# Rows for words missing from the pretrained vocabulary stay all-zero.
embedding_matrix = np.zeros((nb_words + 1, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        # Index falls outside the capped vocabulary; no row is reserved for it.
        continue
    if word in w2v:
        embedding_matrix[i] = w2v[word]

# The pretrained vectors are not used again; release them.
del w2v
gc.collect()

  """Entry point for launching an IPython kernel.


### Encode labels

In [11]:
# Encode the raw Sentiment values as integer class ids. The fitted encoder is kept
# in `led` because later cells read led.classes_ to size the model output.
led = skp.LabelEncoder()
y_train = led.fit_transform(train_df['Sentiment'].values)

# Define Keras Model

In [12]:
from dlmslib.keras_models import nlp_models
import tensorflow as tf

from keras import callbacks as kc
from keras import optimizers as ko

## CNN Model

In [None]:
# Dimensions taken from the preprocessing results.
voca_dim, time_steps = embedding_matrix.shape[0], max_len
output_dim = len(led.classes_)
item_embedding = embedding_matrix

# Convolution settings.
num_filters = 32
filter_sizes = [2, 4, 8]
cnn_drop_out = 0.1

# MLP settings.
mlp_dim = 50
mlp_depth = 2

In [None]:
# Build the CNN classifier from the hyperparameters defined in the previous cell.
# mlp_depth is forwarded from the variable rather than a hard-coded 2, so editing
# the hyperparameter cell actually changes the model that gets built.
# cnn_cl receives the second value returned by the builder (presumably the custom
# layers requested via return_customized_layers=True; confirm against dlmslib).
model, cnn_cl = nlp_models.build_cnn_model(
    voca_dim, time_steps, output_dim, mlp_dim, num_filters, filter_sizes,
    item_embedding=item_embedding, mlp_depth=mlp_depth, cnn_drop_out=cnn_drop_out,
    return_customized_layers=True
)

model.summary()

In [None]:
# Compile with a default Adam optimiser; labels are integer class ids, hence the
# sparse loss/metric variants.
adam = ko.Adam()
model.compile(
    optimizer=adam,
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)

# Checkpoint only when val_loss improves, and stop after 5 epochs without improvement.
file_path = "best_cnn_model.hdf5"
check_point = kc.ModelCheckpoint(
    filepath=file_path,
    monitor="val_loss",
    verbose=1,
    save_best_only=True,
    mode="min",
)
early_stop = kc.EarlyStopping(monitor="val_loss", mode="min", patience=5)

model.fit(
    x=X_train,
    y=y_train,
    batch_size=500,
    epochs=20,
    validation_split=0.1,
    callbacks=[check_point, early_stop],
)

# The best checkpoint is on disk; drop the in-memory model before the next section.
del model
gc.collect()

## Attention RNN Model

In [13]:
# Dimensions taken from the preprocessing results.
voca_dim, time_steps = embedding_matrix.shape[0], max_len
output_dim = len(led.classes_)
item_embedding = embedding_matrix

# RNN settings.
rnn_depth, rnn_dim = 1, 100
rnn_drop_out = rnn_state_drop_out = 0.5

# MLP settings.
mlp_depth, mlp_dim = 2, 100

# Forwarded to the model builder's `gpu` argument.
gpu = True

In [14]:
# Build the bidirectional-RNN attention classifier from the hyperparameters in the
# previous cell and print its layer summary.
# rnn_cl receives the second value returned by the builder (presumably the custom
# layers; confirm against dlmslib).
# NOTE(review): this call spells the flag `return_customized_layer` (singular) while
# the other two builder calls in this notebook use `return_customized_layers`.
# Confirm the spelling against the dlmslib 0.4 signature of build_birnn_attention_model.
model, rnn_cl = nlp_models.build_birnn_attention_model(
    voca_dim, time_steps, output_dim, rnn_dim, mlp_dim, 
    item_embedding=item_embedding, rnn_depth=rnn_depth, mlp_depth=mlp_depth, 
    rnn_drop_out=rnn_drop_out, rnn_state_drop_out=rnn_state_drop_out,
    gpu=gpu, return_customized_layer=True
)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input0 (InputLayer)             (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_layer0 (Embedding)    (None, 50, 300)      5843700     input0[0][0]                     
__________________________________________________________________________________________________
bi_lstm_layer0 (Bidirectional)  (None, 50, 200)      320800      embedding_layer0[0][0]           
__________________________________________________________________________________________________
rnn_batch_norm_layer0 (BatchNor (None, 50, 200)      800         bi_lstm_layer0[0][0]             
__________________________________________________________________________________________________
permuted_a

In [15]:
# Compile with Adam and gradient-norm clipping at 2.0; labels are integer class ids,
# hence the sparse loss/metric variants.
adam = ko.Adam(clipnorm=2.0)
model.compile(
    optimizer=adam,
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)

# Checkpoint only when val_loss improves, and stop after 5 epochs without improvement.
file_path = "best_birnn_attention_model.hdf5"
check_point = kc.ModelCheckpoint(
    filepath=file_path,
    monitor="val_loss",
    verbose=1,
    save_best_only=True,
    mode="min",
)
early_stop = kc.EarlyStopping(monitor="val_loss", mode="min", patience=5)

model.fit(
    x=X_train,
    y=y_train,
    batch_size=500,
    epochs=20,
    validation_split=0.1,
    callbacks=[check_point, early_stop],
)

# The best checkpoint is on disk; drop the in-memory model before the next section.
del model
gc.collect()

Train on 140454 samples, validate on 15606 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 1.23072, saving model to best_birnn_attention_model.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 1.23072 to 1.02961, saving model to best_birnn_attention_model.hdf5
Epoch 3/20

Epoch 00003: val_loss did not improve from 1.02961
Epoch 4/20
  6500/140454 [>.............................] - ETA: 5:21 - loss: 0.8029 - sparse_categorical_accuracy: 0.6660

KeyboardInterrupt: 

## RNN-CNN Model

In [17]:
# Dimensions taken from the preprocessing results.
voca_dim, time_steps = embedding_matrix.shape[0], max_len
output_dim = len(led.classes_)
item_embedding = embedding_matrix

# RNN settings.
rnn_depth, rnn_dim = 1, 100
rnn_drop_out = rnn_state_drop_out = 0.5

# Convolution settings.
num_filters = 32
filter_sizes = [2, 4, 8]
cnn_drop_out = 0.1

# MLP settings.
mlp_depth, mlp_dim = 2, 60

# Forwarded to the model builder's `gpu` argument.
gpu = True

In [19]:
# Build the combined bidirectional-RNN + CNN classifier from the hyperparameters in
# the previous cell and print its layer summary.
# rc_cl receives the second value returned by the builder (presumably the custom
# layers requested via return_customized_layers=True; confirm against dlmslib).
model, rc_cl = nlp_models.build_birnn_cnn_model(
    voca_dim, time_steps, output_dim, rnn_dim, mlp_dim, num_filters, filter_sizes, 
    item_embedding=item_embedding, rnn_depth=rnn_depth, mlp_depth=mlp_depth,
    rnn_drop_out=rnn_drop_out, rnn_state_drop_out=rnn_state_drop_out, cnn_drop_out=cnn_drop_out,
    gpu=gpu, return_customized_layers=True
)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input0 (InputLayer)             (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_layer0 (Embedding)    (None, 50, 300)      5843700     input0[0][0]                     
__________________________________________________________________________________________________
bi_lstm_layer0 (Bidirectional)  (None, 50, 200)      320800      embedding_layer0[0][0]           
__________________________________________________________________________________________________
rnn_batch_norm_layer0 (BatchNor (None, 50, 200)      800         bi_lstm_layer0[0][0]             
__________________________________________________________________________________________________
conv1d_1 (

In [20]:
# Compile with Adam and gradient-norm clipping at 2.0; labels are integer class ids,
# hence the sparse loss/metric variants.
adam = ko.Adam(clipnorm=2.0)
model.compile(
    optimizer=adam,
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)

# Checkpoint only when val_loss improves, and stop after 5 epochs without improvement.
file_path = "best_birnn_cnn_model.hdf5"
check_point = kc.ModelCheckpoint(
    filepath=file_path,
    monitor="val_loss",
    verbose=1,
    save_best_only=True,
    mode="min",
)
early_stop = kc.EarlyStopping(monitor="val_loss", mode="min", patience=5)

model.fit(
    x=X_train,
    y=y_train,
    batch_size=500,
    epochs=20,
    validation_split=0.1,
    callbacks=[check_point, early_stop],
)

# The best checkpoint is on disk; drop the in-memory model before the next section.
del model
gc.collect()

Train on 140454 samples, validate on 15606 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 1.15736, saving model to best_birnn_cnn_model.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 1.15736 to 1.11199, saving model to best_birnn_cnn_model.hdf5
Epoch 3/20

Epoch 00003: val_loss did not improve from 1.11199
Epoch 4/20

KeyboardInterrupt: 

# Make Prediction

In [13]:
from keras import models
from dlmslib.keras_models import layers

In [15]:
# Reload the best checkpoint of each architecture. Only the attention model is
# given custom_objects (its AttentionWeight layer).
cnn_model = models.load_model("best_cnn_model.hdf5")
rnn_model = models.load_model(
    "best_birnn_attention_model.hdf5",
    custom_objects={'AttentionWeight': layers.AttentionWeight},
)
rnn_cnn_model = models.load_model("best_birnn_cnn_model.hdf5")

# Ensemble by summing the per-class outputs of the three models on the test set:
# start from the CNN scores, then add the two recurrent models in turn.
pred = cnn_model.predict(X_test, batch_size=1024, verbose=1)
for ensemble_member in (rnn_model, rnn_cnn_model):
    pred_tmp = ensemble_member.predict(X_test, batch_size=1024, verbose=1)
    pred = pred + pred_tmp



In [17]:
# The arg-max column of the summed scores is an *encoded* class id, so translate it
# back to the original Sentiment value with the fitted LabelEncoder before writing.
# (argmax already returns integers, so the former np.round call was a no-op.)
predictions = led.inverse_transform(np.argmax(pred, axis=1)).astype(int)

# Fill the sample-submission frame (one row per test PhraseId) and write it out.
sub_df['Sentiment'] = predictions
sub_df.to_csv("submission.csv", index=False)