In [1]:
import pandas as pd

# Read data from review files 

In [56]:
reviewDataPath = {'yelp': 'data/yelp_labelled.txt',
                 'amazon': 'data/amazon_cells_labelled.txt',
                 'imdb': 'data/imdb_labelled.txt'}
reviewList = []

for source, filepath in reviewDataPath.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    # Add another column filled with the source name
    df['source'] = source 
    reviewList.append(df)

df = pd.concat(reviewList)

In [57]:
review_imdb = df[df['source'] == 'amazon']

In [58]:
review_imdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 3 columns):
sentence    1000 non-null object
label       1000 non-null int64
source      1000 non-null object
dtypes: int64(1), object(2)
memory usage: 31.2+ KB


In [59]:
# Just a look at data
print(df.iloc[:10])

                                            sentence  label source
0                           Wow... Loved this place.      1   yelp
1                                 Crust is not good.      0   yelp
2          Not tasty and the texture was just nasty.      0   yelp
3  Stopped by during the late May bank holiday of...      1   yelp
4  The selection on the menu was great and so wer...      1   yelp
5     Now I am getting angry and I want my damn pho.      0   yelp
6              Honeslty it didn't taste THAT fresh.)      0   yelp
7  The potatoes were like rubber and you could te...      0   yelp
8                          The fries were great too.      1   yelp
9                                     A great touch.      1   yelp


# Split data in train and test

In [60]:
from sklearn.model_selection import train_test_split

In [61]:
review_yelp = df[df['source'] == 'yelp']

sentences = review_yelp['sentence'].values

y = review_yelp['label'].values

sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.25, random_state=1000)

In [62]:
sentences_train.size

750

# Tokenize data

In [63]:
from keras.preprocessing.text import Tokenizer

In [64]:
tokenizer = Tokenizer(num_words=5000)

tokenizer.fit_on_texts(sentences_train)

X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)


In [65]:
# Adding 1 because of reserved 0 index
# The indexing is ordered after the most common words in the text, 
# which you can see by the word the having the index 1. 
# It is important to note that the index 0 is reserved 
# and is not assigned to any word. This zero index is used for padding,
# because every statement is not of same size

vocab_size = len(tokenizer.word_index) + 1 

In [66]:
print(vocab_size)

1747


In [67]:
vocab_size = len(tokenizer.word_index) + 1

In [68]:
print(sentences_train[1:6])

['Sorry, I will not be getting food from here anytime soon :('
 'Of all the dishes, the salmon was the best, but all were great.'
 'The fries were not hot, and neither was my burger.'
 "In fact I'm going to round up to 4 stars, just because she was so awesome."
 'Will go back next trip out.']


In [69]:
print(X_train[1])
print(X_train[2])
print(X_train[3])
print(X_train[4])

[740, 4, 46, 12, 20, 160, 10, 72, 35, 355, 232]
[11, 43, 1, 171, 1, 283, 3, 1, 47, 26, 43, 24, 22]
[1, 233, 24, 12, 209, 2, 741, 3, 23, 125]
[14, 356, 83, 126, 5, 742, 59, 5, 357, 96, 41, 127, 234, 3, 25, 161]


# PAD Sequance

In [70]:
from keras.preprocessing.sequence import pad_sequences

In [71]:
# maxlen parameter to specify how long the sequences should be. 
#This cuts sequences that exceed that number.

maxlen = 100

In [72]:
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)


In [73]:
print(X_train[1])

[740   4  46  12  20 160  10  72  35 355 232   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]


In [74]:
print(X_train[4])

[ 14 356  83 126   5 742  59   5 357  96  41 127 234   3  25 161   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]


# Model Creation

In [75]:
from keras.models import Sequential
from keras import layers

In [76]:
model = Sequential()

In [77]:
# vocab size is 1750 
# input_length is size of review text after tokenization and pad sequance
embedding_dim = 50


model.add(layers.Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           input_length=maxlen))

model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))


model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 50)           87350     
_________________________________________________________________
flatten_2 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                50010     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 11        
Total params: 137,371
Trainable params: 137,371
Non-trainable params: 0
_________________________________________________________________


In [78]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])



In [79]:
history = model.fit(X_train, y_train,
                    epochs=20,verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)



Train on 750 samples, validate on 250 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [80]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))


Training Accuracy: 1.0000


In [81]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))


Testing Accuracy: 0.6800


# Let's do the prediction

In [92]:
import numpy as np
phrase = "good food ,will come again"
#phrase = "bad service"

tokens = tokenizer.texts_to_sequences([phrase])
pad_tokens = pad_sequences(tokens, padding='post', maxlen=maxlen)

print(tokens)
print(pad_tokens)

[[16, 10, 46, 191, 76]]
[[ 16  10  46 191  76   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]]


In [93]:
val = model.predict_classes(pad_tokens)   

In [94]:
def predictSentiments ( indexvalue):
    
    reviewSentiment = ''
    
    if (val[0][0] == 0):
        reviewSentiment = 'Customer is gone forever,'
    else:
        reviewSentiment = 'you got back your customer'

    return reviewSentiment;

In [95]:
print(predictSentiments(val[0][0]))

you got back your customer


# Save the model to re-use later

In [98]:
from keras.models import load_model
import pickle

# Creates a HDF5 file 'my_model.h5'
model.save('my_model.h5')

# Deletes the existing model
#del model  


# saving tokenizer 
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)



# Load Model and tokenizer

In [100]:
# loading
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer_saved = pickle.load(handle)

# Returns a compiled model identical to the previous one
model_saved = load_model('my_model.h5')

In [104]:
#review_sen = "good food ,will come again"
review_sen = "bad service"

tokens_sen = tokenizer_saved.texts_to_sequences([review_sen])
pad_tokens_sen = pad_sequences(tokens_sen, padding='post', maxlen=maxlen)

print(tokens_sen)
print(pad_tokens_sen)

[[90, 19]]
[[90 19  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]]


In [105]:
val = model.predict_classes(pad_tokens_sen)
print(predictSentiments(val[0][0]))

Customer is gone forever,


# Another model

In [None]:
model2 = Sequential()

model2.add(layers.Embedding(input_dim=vocab_size,output_dim=embedding_dim,input_length=maxlen))

model2.add(layers.GlobalMaxPool1D())

model2.add(layers.Dense(10, activation='relu'))
model2.add(layers.Dense(1, activation='sigmoid'))
model2.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model2.summary()

In [None]:
history2 = model2.fit(X_train, y_train,
                    epochs=20,verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)

In [None]:
loss, accuracy = model2.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))



In [None]:
loss, accuracy = model2.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))



# Using Pre-Trained GloVe vector

In [None]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    
    vocab_size = len(word_index) + 1 
    # Adding again 1 because of reserved 0 index
    
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as file:
        for line in file:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                print("{} {} ".format(word,idx))
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix

In [None]:
embedding_dim = 50

filePath = 'GloVe_PreTrained/glove.6B.50d.txt'

embedding_matrix = create_embedding_matrix(filePath,
                                           tokenizer.word_index, 
                                           embedding_dim)

In [None]:
print(embedding_matrix[0:2])

In [None]:
model3 = Sequential()

model3.add(layers.Embedding(vocab_size, 
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=True)) # Make it False
#model3.add(layers.Conv1D(128, 5, activation='relu'))
model3.add(layers.GlobalMaxPool1D())

model3.add(layers.Dense(10, activation='relu'))
model3.add(layers.Dense(1, activation='sigmoid'))
model3.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model3.summary()

In [None]:
history3 = model3.fit(X_train, y_train,
                    epochs=20,verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)

In [None]:
loss, accuracy = model3.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

In [None]:
loss, accuracy = model3.evaluate(X_test, y_test, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
