# Import Data

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Base directory that holds every input dataset referenced by this notebook.
DATA_ROOT = '../input/'

# Two sub-folders of DATA_ROOT: the original competition files and a folder
# named for temporary data.
ORIGINAL_DATA_FOLDER, TMP_DATA_FOLDER = (
    os.path.join(DATA_ROOT, folder_name)
    for folder_name in (
        'movie-review-sentiment-analysis-kernels-only',
        'kaggle_review_sentiment_tmp_data',
    )
)

In [3]:
# Resolve the three competition files inside the original data folder.
train_data_path, test_data_path, sub_data_path = (
    os.path.join(ORIGINAL_DATA_FOLDER, file_name)
    for file_name in ('train.tsv', 'test.tsv', 'sampleSubmission.csv')
)

# The train/test files are tab-separated; the sample submission is a regular CSV.
train_df = pd.read_csv(train_data_path, sep="\t")
test_df = pd.read_csv(test_data_path, sep="\t")
sub_df = pd.read_csv(sub_data_path, sep=",")

# EDA

In [4]:
# Preview the first five labelled training phrases.
train_df.head(n=5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
# Preview the first five test phrases (no Sentiment column).
test_df.head(n=5)

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [6]:
# Preview the first five rows of the sample submission (PhraseId, Sentiment).
sub_df.head(n=5)

Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,2


# Data Preprocessing

In [7]:
import nltk
from keras.preprocessing import text
from keras.preprocessing import sequence
import gensim
from sklearn import preprocessing as skp

Using TensorFlow backend.


In [8]:
# Hyper-parameters shared by the tokenisation and embedding cells.
max_features = 30000  # cap on the vocabulary indices that get an embedding row
embed_size = 300      # width of each embedding vector
max_len = 50          # length every token sequence is padded/truncated to

# Location of the pretrained word2vec binary under DATA_ROOT.
pretrained_w2v_path = os.path.join(
    DATA_ROOT,
    "nlpword2vecembeddingspretrained/GoogleNews-vectors-negative300.bin",
)

### Tokenize Text

In [9]:
# Fit the vocabulary on train + test phrases so both splits share one word index.
full_text = list(train_df['Phrase'].values) + list(test_df['Phrase'].values)

# `num_words=max_features` makes texts_to_sequences drop every token whose index
# is >= max_features. Without this cap, a vocabulary larger than max_features
# would emit indices beyond the rows of the embedding matrix built from the same
# limit, and the embedding lookup would go out of range.
# `filters=''` disables character filtering; `lower=True` lower-cases the text.
tk = text.Tokenizer(num_words=max_features, lower=True, filters='')
tk.fit_on_texts(full_text)
train_tokenized = tk.texts_to_sequences(train_df['Phrase'])
test_tokenized = tk.texts_to_sequences(test_df['Phrase'])

# Bring every sequence to exactly max_len token ids.
X_train = sequence.pad_sequences(train_tokenized, maxlen=max_len)
X_test = sequence.pad_sequences(test_tokenized, maxlen=max_len)

### Build embedding matrix

In [10]:
# Load the pretrained word2vec vectors. load_word2vec_format already returns a
# KeyedVectors object, so the extra `.wv` hop is dropped: it only triggered a
# deprecation warning on gensim 3.x and no longer exists on gensim 4.
w2v = gensim.models.KeyedVectors.load_word2vec_format(pretrained_w2v_path, binary=True)

word_index = tk.word_index
nb_words = min(max_features, len(word_index))

# One row per word index plus one extra row, all initialised to zero.
embedding_matrix = np.zeros((nb_words + 1, embed_size))
for word, i in word_index.items():
    # Indices at or beyond the vocabulary cap get no embedding row filled in.
    if i >= max_features:
        continue
    # Words missing from the pretrained vectors keep their all-zero row.
    if word in w2v:
        embedding_matrix[i] = w2v[word]

  """Entry point for launching an IPython kernel.


### Encode labels

In [11]:
# Fit a label encoder on the raw Sentiment values and encode them in one step.
# The fitted encoder is kept as `led`; its classes_ attribute is read later to
# size the model's output layer.
led = skp.LabelEncoder()
y_train = led.fit_transform(train_df['Sentiment'].values)

# Define Keras Model

In [35]:
from dlmslib.keras_models import nlp_models
import tensorflow as tf

from keras import callbacks as kc
from keras import optimizers as ko

## CNN Model

In [36]:
# Dimensions derived from the preprocessing cells.
voca_dim = embedding_matrix.shape[0]  # number of rows in the embedding matrix
time_steps = max_len                  # padded sequence length
output_dim = led.classes_.shape[0]    # number of distinct sentiment classes
item_embedding = embedding_matrix     # pretrained weights handed to the model builder

# Architecture hyper-parameters for the CNN.
filter_sizes = [2, 3, 5]
num_filters = 5
mlp_dim = 50
mlp_depth = 2

In [37]:
# Build the CNN text classifier from the hyper-parameters in the config cell.
# mlp_depth is taken from that cell instead of a repeated literal, so that
# changing the configuration actually changes the model.
model = nlp_models.build_cnn_model(
    voca_dim, time_steps, output_dim, mlp_dim, num_filters, filter_sizes,
    item_embedding=item_embedding, mlp_depth=mlp_depth
)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input0 (InputLayer)             (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_layer0 (Embedding)    (None, 50, 300)      5843700     input0[0][0]                     
__________________________________________________________________________________________________
conv1d_16 (Conv1D)              (None, 49, 5)        3005        embedding_layer0[0][0]           
__________________________________________________________________________________________________
conv1d_17 (Conv1D)              (None, 48, 5)        4505        embedding_layer0[0][0]           
__________________________________________________________________________________________________
conv1d_18 

In [39]:
# Adam optimiser with the library's default settings.
adam = ko.Adam()
model.compile(
    adam,
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)

# Keep only the weights with the lowest validation loss. Saving to this HDF5
# file needs h5py: the recorded run of this cell failed with
# "ImportError: `save_model` requires h5py".
file_path = "best_cnn_model.hdf5"
check_point = kc.ModelCheckpoint(
    file_path,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)

# Stop once validation loss has not improved for 3 consecutive epochs.
early_stop = kc.EarlyStopping(monitor="val_loss", patience=3, mode="min")

# The last 10% of the training data is held out for validation.
model.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=20,
    batch_size=50,
    callbacks=[check_point, early_stop],
)

Train on 140454 samples, validate on 15606 samples
Epoch 1/20


ImportError: `save_model` requires h5py.

## Attention RNN Model

In [None]:
# NOTE(review): this cell repeats the CNN configuration verbatim under the
# "Attention RNN Model" heading; confirm which settings the RNN really needs.

# Dimensions derived from the preprocessing cells.
voca_dim = embedding_matrix.shape[0]  # number of rows in the embedding matrix
time_steps = max_len                  # padded sequence length
output_dim = led.classes_.shape[0]    # number of distinct sentiment classes
item_embedding = embedding_matrix     # pretrained weights handed to the model builder

# Architecture hyper-parameters (identical to the CNN section).
filter_sizes = [2, 3, 5]
num_filters = 5
mlp_dim = 50
mlp_depth = 2

In [37]:
# NOTE(review): although this section is titled "Attention RNN Model", the cell
# still calls build_cnn_model. Swap in the intended builder once it is known.
# mlp_depth is taken from the config cell instead of a repeated literal, so that
# changing the configuration actually changes the model.
model = nlp_models.build_cnn_model(
    voca_dim, time_steps, output_dim, mlp_dim, num_filters, filter_sizes,
    item_embedding=item_embedding, mlp_depth=mlp_depth
)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input0 (InputLayer)             (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_layer0 (Embedding)    (None, 50, 300)      5843700     input0[0][0]                     
__________________________________________________________________________________________________
conv1d_16 (Conv1D)              (None, 49, 5)        3005        embedding_layer0[0][0]           
__________________________________________________________________________________________________
conv1d_17 (Conv1D)              (None, 48, 5)        4505        embedding_layer0[0][0]           
__________________________________________________________________________________________________
conv1d_18 

In [None]:
# Adam optimiser with the library's default settings.
adam = ko.Adam()
model.compile(
    adam,
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)

# NOTE(review): this is the same checkpoint path as the CNN section, so running
# both sections overwrites the earlier checkpoint. Confirm the intended name.
file_path = "best_cnn_model.hdf5"
check_point = kc.ModelCheckpoint(
    file_path,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)

# Stop once validation loss has not improved for 3 consecutive epochs.
early_stop = kc.EarlyStopping(monitor="val_loss", patience=3, mode="min")

# The last 10% of the training data is held out for validation.
model.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=20,
    batch_size=50,
    callbacks=[check_point, early_stop],
)

Train on 140454 samples, validate on 15606 samples
Epoch 1/20

## RNN-CNN Model