# IMDb Sentiment Analysis

In [124]:
import numpy as np
import pandas as pd
import nltk
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")
import csv

# Tokenize, stem and remove stopwords from the combined data
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from string import punctuation

# Creating MLP
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM
from keras.utils import np_utils
from keras.preprocessing import sequence

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jeanb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [125]:
max_features = 200000
maxlen = 80
batch_size = 32

train = pd.read_csv("C:/Users/jeanb/train.tsv",sep="\t")
test = pd.read_csv("C:/Users/jeanb/test.tsv",sep="\t")
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [126]:
# combining the data into a single unit for processing of features
features_train = train['Phrase']
labels_train = train['Sentiment']
features_test = test['Phrase']
combined = features_train.append(features_test).values

train_length = len(features_train.values)
test_length = len(features_test.values)

combined_features=[]

In [127]:
# stopwords
nltk.download('stopwords')
stop_words = list(set(stopwords.words('english')))
punc=list(set(punctuation))
stop_words.extend(punc)
stop_words.extend(["'s", "'d", "'m"])
#print(stop_words)

for x in combined:
    x=word_tokenize(x)
    stemmer=SnowballStemmer('english')
    x=[(stemmer.stem(i)).lower() for i in x]
    x=[i for i in x if x not in stop_words]
    combined_features.append(x)


# mapping frequencies with words
from gensim import corpora
dictionary = corpora.Dictionary(combined_features)
#print(dictionary)

id=[]
for x in combined_features:
    temp = [dictionary.token2id[j] for j in x]
    id.append(temp)

# using gpu to increase computation speed
with tf.device('/gpu:0'):
    model=Sequential()
    model.add(Embedding(max_features, 128))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(5, activation='softmax'))

    # padding the input to ensure a fixed size input to the network
    x_train=sequence.pad_sequences(np.array(id[:train_length]))
    x_test=sequence.pad_sequences(np.array(id[train_length:]))

    # one hot encoding
    y_train=np_utils.to_categorical(labels_train)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(x_train, y_train,batch_size=batch_size,epochs=10,validation_split=0.1)

    preds = model.predict_classes(x_test, verbose=0)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jeanb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Train on 140454 samples, validate on 15606 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [128]:
def write_preds(preds, fname):
    pd.DataFrame({"PhraseID": test['PhraseId'],"Sentiment": preds}).to_csv(fname, index=False, header=True)
        
write_preds(preds, "C:/Users/jeanb/result-7.csv")

My model scores around **0.630** when submitted on the link of the competition (https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews).