In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip
/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip
/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv
/kaggle/input/movie-review-dataset/train.tsv


In [2]:
#Importing all the required packages

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from keras.utils import to_categorical
import nltk
from  tensorflow.keras.preprocessing.text import Tokenizer
import nltk
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense,Dropout,Embedding,LSTM
from keras.optimizers import Adam

In [3]:
#Read the tab-separated file

train=pd.read_csv('../input/movie-review-dataset/train.tsv', index_col='PhraseId', sep="\t")

In [4]:
#Drop the dependent variable sentiment

train=train.drop(['SentenceId'], axis=1)

In [5]:
#Initialize the count vectorizer

cv= CountVectorizer()

In [6]:
#Function for Preprocessing the reviews by removing unwanted words

def review_clean(df):
    
    reviews=[]
    lemmatizer = WordNetLemmatizer() 
    
    for text in df['Phrase']:
        text = re.sub("[^a-zA-Z]"," ", text)
        token_text= word_tokenize(text.lower())
        clean_text= [ lemmatizer.lemmatize(i)  for i in token_text]
        reviews.append(clean_text)
    return reviews


All words in the review are turned into lower case and special characters are removed.

Then the words are lemmatized.

**Lemmatization** usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma

In [7]:
#Cleaning and splitting of dataset

y=train.Sentiment.values
X=train.drop(['Sentiment'], axis=1)
nltk.download('punkt')
nltk.download('wordnet')
train_tweets=review_clean(X)

y_target=to_categorical(y)
num_classes=y_target.shape[1]

X_train, X_test, y_train, y_test = train_test_split(train_tweets, y_target, test_size = 0.2, stratify=y_target)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
#Finding Max length and Total Words

unique_words = set()
len_max = 0

for sent in (X_train):
    
    unique_words.update(sent)
    
    if(len_max<len(sent)):
        len_max = len(sent)
        
print(len(list(unique_words)))
print(len_max)

13736
48


In [9]:
#Tokenization the words in the reviews

tokenizer = Tokenizer(num_words=len(list(unique_words)))
tokenizer.fit_on_texts(list(X_train))

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [10]:
#Padding the tokenized words

X_train = pad_sequences(X_train, maxlen=len_max)
X_test = pad_sequences(X_test, maxlen=len_max)

In [11]:
#Early Stopping

early_stopping = EarlyStopping(min_delta = 0.001, mode = 'max', monitor='val_acc', patience = 2)
callback = [early_stopping]

In [12]:
#Creating the model

model=Sequential()
model.add(Embedding(len(list(unique_words)),300,input_length=len_max))
model.add(LSTM(128,dropout=0.5, recurrent_dropout=0.5,return_sequences=True))
model.add(LSTM(64,dropout=0.5, recurrent_dropout=0.5,return_sequences=False))
model.add(Dense(100,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.005),metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 48, 300)           4120800   
_________________________________________________________________
lstm (LSTM)                  (None, 48, 128)           219648    
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense (Dense)                (None, 100)               6500      
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 505       
Total params: 4,396,861
Trainable params: 4,396,861
Non-trainable params: 0
______________________________________________

In [13]:
#Fitting the model

history=model.fit(X_train, y_train, validation_data=(X_test, y_test),epochs=6, batch_size=256, verbose=1, callbacks=callback)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [14]:
#Predicting the sentiment

y_pred=model.predict_classes(X_test)

In [16]:
print(y_pred)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices