In [1]:
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.stem import SnowballStemmer,WordNetLemmatizer
import os
import gc
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense,Dropout,Embedding,LSTM
from keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
% matplotlib inline

Using TensorFlow backend.


In [2]:
data_train = pd.read_csv('train.tsv',sep = '\t')
data_test = pd.read_csv('test.tsv',sep = '\t')


In [3]:
data_train.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [4]:
print(data_train.shape,data_test.shape)

(156060, 4) (66292, 3)


In [5]:
#merging train and test for cleaning purpose

data_test['Sentiment'] = -1
toclean = pd.concat([data_train,data_test],ignore_index = True,sort = False)
del data_train,data_test
gc.collect()

18

In [6]:
#cleaning
import re
stemmer = SnowballStemmer('english')
lemma = WordNetLemmatizer()


def clean_review(reviews):
    corpus = []
    for i in range(0,len(reviews)):
        review = str(reviews[i]) 
        review = re.sub('[^a-zA-Z]',' ',review)
        review = [lemma.lemmatize(w,'v') for w in word_tokenize(review) if(len(w)>1)]
        review = ' '.join(review)
        corpus.append(review)
        
    return(corpus)


In [7]:
toclean['clean_review'] = clean_review(toclean.Phrase.values)
toclean.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,clean_review
0,1,1,A series of escapades demonstrating the adage ...,1,series of escapades demonstrate the adage that...
1,2,1,A series of escapades demonstrating the adage ...,2,series of escapades demonstrate the adage that...
2,3,1,A series,2,series
3,4,1,A,2,
4,5,1,series,2,series
5,6,1,of escapades demonstrating the adage that what...,2,of escapades demonstrate the adage that what b...
6,7,1,of,2,of
7,8,1,escapades demonstrating the adage that what is...,2,escapades demonstrate the adage that what be g...
8,9,1,escapades,2,escapades
9,10,1,demonstrating the adage that what is good for ...,2,demonstrate the adage that what be good for th...


In [8]:
train = toclean[toclean['Sentiment']!=-1]
test = toclean[toclean['Sentiment']==-1]
y = train['Sentiment'].values
print(train.shape,test.shape,y.shape)

(156060, 5) (66292, 5) (156060,)


In [9]:
train_text = train['clean_review'].values
test_text = test['clean_review'].values
y = to_categorical(y)

In [10]:
X_train_text,X_val_text,y_train,y_val=train_test_split(train_text,y,test_size=0.1,stratify=y,random_state=123)
print(X_train_text.shape,y_train.shape)
print(X_val_text.shape,y_val.shape)

(140454,) (140454, 5)
(15606,) (15606, 5)


In [11]:
#number of unique words
words = ' '.join(X_train_text)
words = word_tokenize(words)
freq = FreqDist(words)
unique_words = len(freq)
print(unique_words)

14273


In [12]:
#review with maximum words
length = []
for review in X_train_text:
    word = word_tokenize(review)
    l = len(word)
    length.append(l)
m = np.max(length)
print(m)
    

47


In [13]:
#model parameters
batch_size = 128
max_features = unique_words
max_length = m+2
epochs = 4
num_classes =5

In [14]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train_text))
X_train = tokenizer.texts_to_sequences(X_train_text)
X_val = tokenizer.texts_to_sequences(X_val_text)
X_test = tokenizer.texts_to_sequences(test_text)

X_train = sequence.pad_sequences(X_train, maxlen=max_length)
X_val = sequence.pad_sequences(X_val, maxlen=max_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_length)
print(X_train.shape,X_val.shape,X_test.shape)

(140454, 49) (15606, 49) (66292, 49)


In [15]:
model = Sequential()
model.add(Embedding(max_features,250,mask_zero=True))
model.add(LSTM(128,return_sequences=True))
model.add(LSTM(64,return_sequences=False))
model.add(Dense(num_classes,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.001),metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 250)         3568250   
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 128)         194048    
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 325       
Total params: 3,812,031
Trainable params: 3,812,031
Non-trainable params: 0
_________________________________________________________________


In [16]:
log = model.fit(X_train,y_train,validation_data=(X_val,y_val),epochs = epochs,verbose=2,batch_size=batch_size)

Train on 140454 samples, validate on 15606 samples
Epoch 1/4
 - 283s - loss: 0.9657 - acc: 0.6159 - val_loss: 0.8399 - val_acc: 0.6519
Epoch 2/4
 - 270s - loss: 0.7770 - acc: 0.6788 - val_loss: 0.8017 - val_acc: 0.6722
Epoch 3/4
 - 269s - loss: 0.6996 - acc: 0.7071 - val_loss: 0.8064 - val_acc: 0.6741
Epoch 4/4
 - 269s - loss: 0.6431 - acc: 0.7264 - val_loss: 0.8198 - val_acc: 0.6768
