In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords 

from numpy import array 
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
# Load the labelled reviews; later cells read the 'reviews' and 'sentiment' columns.
df = pd.read_csv('reviews.csv')
# Only the last expression of a cell is displayed, so the null check is printed
# explicitly instead of having its result silently discarded.
print('Any null values:', df.isnull().values.any())
df.head()

Unnamed: 0,reviews,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [3]:
def preprocess(text):
    """Clean one raw review string before tokenization.

    Steps:
      1. Strip HTML-like tags (anything between ``<`` and ``>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P``.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with the ``-`` "nose" removed),
         separated from the cleaned text by a single space.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text, followed by any emoticons that were found.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Search the text *before* lowercasing: the pattern lists uppercase D and P,
    # which can never match once the text has been lowercased.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if not emoticons:
        return cleaned
    # Avoid gluing the first emoticon onto the last word ("movie:)") when the
    # cleaned text does not already end with a space.
    separator = '' if cleaned.endswith(' ') else ' '
    return cleaned + separator + ' '.join(emoticons).replace('-', '')


In [4]:
# Clean every review text; the labels come straight from the 'sentiment' column.
sentences = list(df['reviews'])
X = [preprocess(review) for review in sentences]
y = df['sentiment']

# Train (80%) / Test (20%) Split

In [5]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Embedding Layer (converts textual data into numeric data)

* The first layer of the deep learning model in Keras

In [6]:
# Fit the vocabulary on the training reviews only, then map both splits to
# lists of integer word indices.
# Note: X_train / X_test are overwritten here, so re-running this cell alone
# requires re-running the split cell first.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train, X_test = [
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
]
# Token count of the second review in each split.
print(len(X_train[1]), len(X_test[1]), sep='\n')

158
233


In [7]:
# Padding: bring every sequence to a fixed length of `maxlen` tokens.
maxlen = 100

# One slot per word known to the tokenizer, plus one extra index
# (presumably index 0, the fill value used by padding — TODO confirm).
# NOTE(review): the tokenizer was created with num_words=5000, yet this counts
# the full word_index; confirm the larger embedding table is intended.
vocab_size = 1 + len(tokenizer.word_index)

X_train, X_test = [
    pad_sequences(seqs, padding='post', maxlen=maxlen)
    for seqs in (X_train, X_test)
]

In [8]:
# After padding, the second sequence of each split should report the same length.
for padded in (X_train, X_test):
    print(len(padded[1]))

100
100


In [9]:
# vocab_size is len(tokenizer.word_index) + 1: the number of distinct words the
# tokenizer saw in the training reviews, plus one extra index.
print(vocab_size)

94320


In [10]:
# using GloVe for creating feature matrix
from numpy import array
from numpy import asarray
from numpy import zeros

# Maps each GloVe token to its float32 vector (100-dimensional per the file name).
embeddings_dictionary = dict()

# The context manager closes the file even if parsing a line raises.
with open('glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            continue  # tolerate blank lines (e.g. a trailing newline)
        # First field is the token, the remaining fields are its vector components.
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [11]:
# Row i holds the GloVe vector of the word whose tokenizer index is i;
# words without a GloVe entry keep their all-zero row.
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[index] = embedding_vector

In [12]:
# Number of rows in the embedding matrix (one per vocabulary index).
embedding_matrix.shape[0]

94320

# Text Classification with LSTM

In [13]:
from keras.layers import LSTM

# Architecture: frozen pre-trained embeddings -> one LSTM layer -> a single
# sigmoid unit for binary sentiment.
model = Sequential()
model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False))
# Keras 2 argument names: `dropout` applies to the inputs (formerly dropout_W)
# and `recurrent_dropout` to the recurrent state (formerly dropout_U). The old
# names only work through a deprecation shim that emits a warning.
model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

  after removing the cwd from sys.path.


In [14]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          9432000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               240800    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 201       
Total params: 9,673,001
Trainable params: 241,001
Non-trainable params: 9,432,000
_________________________________________________________________


In [15]:
# Train on the padded training sequences for 5 epochs in mini-batches of 64.
# NOTE(review): no validation data is passed here, and the visible cells never
# evaluate on X_test / y_test — confirm whether held-out evaluation happens elsewhere.
model.fit(X_train, y_train, epochs=5, batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x26eade90be0>

In [16]:
# Persist the trained model to disk.
# NOTE(review): ".h5py" is not a standard Keras file extension; the on-disk format
# chosen for it may depend on the installed Keras version — confirm before upgrading.
model.save("Senti.h5py")

In [17]:
# Sample review used to sanity-check the trained model.
x = (
    'is the rare movie that succeeds on almost every level, '
    'where each character, scene, costume, and joke firing on all cylinders '
    'to make a film worth repeated viewings.'
)

In [18]:
# Run the sample through the same cleaning step as the training reviews.
# Note: this rebinds X, which previously held the full list of cleaned reviews.
X = [preprocess(x)]

In [19]:
# Map the cleaned sample to word indices using the tokenizer fitted on the training reviews.
X = tokenizer.texts_to_sequences(X)

In [20]:
# Pad the sample to `maxlen` with the same settings used for the training data,
# so its shape matches the model's expected input length.
X=pad_sequences(X, padding='post', maxlen=maxlen)

In [21]:
# Probability of the positive class. With a single sigmoid output unit,
# `predict` already returns the probability; `Sequential.predict_proba` was a
# thin alias that has been removed from later Keras/TensorFlow releases.
model.predict(X)

array([[0.5952437]], dtype=float32)

In [22]:
# Threshold the sigmoid output at 0.5 to obtain the class label (1 = positive).
# This is what `Sequential.predict_classes` did for a single-unit output; that
# method has been removed from later Keras/TensorFlow releases.
(model.predict(X) > 0.5).astype('int32')

array([[1]])