In [1]:
import pandas as pd

# Read data from review files 

In [2]:
# Map each review source to its tab-separated "sentence<TAB>label" file.
reviewDataPath = dict(
    yelp='data/yelp_labelled.txt',
    amazon='data/amazon_cells_labelled.txt',
    imdb='data/imdb_labelled.txt',
)

# Load every file into its own frame and tag each row with its source name.
reviewList = []
for source, filepath in reviewDataPath.items():
    frame = pd.read_csv(filepath, sep='\t', names=['sentence', 'label'])
    reviewList.append(frame.assign(source=source))

# Stack the per-source frames; the row index of each frame is kept as-is.
df = pd.concat(reviewList)

In [3]:
review_imdb = df[df['source'] == 'amazon']

In [4]:
review_imdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 3 columns):
sentence    1000 non-null object
label       1000 non-null int64
source      1000 non-null object
dtypes: int64(1), object(2)
memory usage: 31.2+ KB


In [5]:
# Preview the first ten rows of the combined frame
print(df.head(10))

                                            sentence  label source
0                           Wow... Loved this place.      1   yelp
1                                 Crust is not good.      0   yelp
2          Not tasty and the texture was just nasty.      0   yelp
3  Stopped by during the late May bank holiday of...      1   yelp
4  The selection on the menu was great and so wer...      1   yelp
5     Now I am getting angry and I want my damn pho.      0   yelp
6              Honeslty it didn't taste THAT fresh.)      0   yelp
7  The potatoes were like rubber and you could te...      0   yelp
8                          The fries were great too.      1   yelp
9                                     A great touch.      1   yelp


# Split data into train and test sets

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Work with the Yelp reviews only.
review_yelp = df.loc[df['source'] == 'yelp']

# Inputs are the raw sentences, targets are the labels, both as NumPy arrays.
sentences, y = (review_yelp[column].values for column in ('sentence', 'label'))

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
(sentences_train, sentences_test,
 y_train, y_test) = train_test_split(sentences, y,
                                     random_state=1000, test_size=0.25)

In [8]:
# Number of elements in the training split
sentences_train.size

750

# Tokenize data

In [9]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [10]:
# Learn the word index from the training sentences only; num_words=5000
# caps how many of the most frequent words are used when encoding.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

# Encode both splits as lists of integer word indices.
X_train, X_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (sentences_train, sentences_test)
)


In [11]:
# The tokenizer numbers words by frequency, so the most common word
# ("the") gets index 1. Index 0 is reserved: it is never assigned to a
# word and is used for padding, because sentences differ in length.
# The vocabulary size is therefore the number of indexed words plus one.

vocab_size = len(tokenizer.word_index) + 1 

In [12]:
# Show the vocabulary size (indexed words plus the reserved 0 index)
print(vocab_size)

1747


In [13]:
# Show the raw training sentences at positions 1 through 5
print(sentences_train[1:6])

['Sorry, I will not be getting food from here anytime soon :('
 'Of all the dishes, the salmon was the best, but all were great.'
 'The fries were not hot, and neither was my burger.'
 "In fact I'm going to round up to 4 stars, just because she was so awesome."
 'Will go back next trip out.']


In [14]:
# Show the integer encodings of training sentences 1 through 4
for encoded in X_train[1:5]:
    print(encoded)

[740, 4, 46, 12, 20, 160, 10, 72, 35, 355, 232]
[11, 43, 1, 171, 1, 283, 3, 1, 47, 26, 43, 24, 22]
[1, 233, 24, 12, 209, 2, 741, 3, 23, 125]
[14, 356, 83, 126, 5, 742, 59, 5, 357, 96, 41, 127, 234, 3, 25, 161]


# Pad Sequences

In [15]:
from keras.preprocessing.sequence import pad_sequences

In [16]:
# maxlen is the length every sequence is brought to:
# sequences that exceed it are cut, shorter ones are padded.

maxlen = 100

In [17]:
# Bring both splits to `maxlen` entries per row, padding at the end ('post')
X_train, X_test = (
    pad_sequences(encoded, maxlen=maxlen, padding='post')
    for encoded in (X_train, X_test)
)


In [18]:
# Training sentence 1 after padding to maxlen
print(X_train[1])

[740   4  46  12  20 160  10  72  35 355 232   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]


In [19]:
# Training sentence 4 after padding to maxlen
print(X_train[4])

[ 14 356  83 126   5 742  59   5 357  96  41 127 234   3  25 161   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]


# Model Creation

In [20]:
from keras.models import Sequential
from keras import layers

In [21]:
# Start the first model as an empty layer stack; layers are added in the next cell
model = Sequential()

In [22]:
# Embedding -> Flatten -> Dense(10) -> Dense(1) classifier.
# input_dim is the vocabulary size (vocab_size, including the reserved 0 index);
# input_length is the padded length of every encoded review (maxlen).
embedding_dim = 50

for layer in (
    layers.Embedding(input_dim=vocab_size,
                     output_dim=embedding_dim,
                     input_length=maxlen),
    layers.Flatten(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
):
    model.add(layer)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 50)           87350     
_________________________________________________________________
flatten_1 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                50010     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 137,371
Trainable params: 137,371
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Single sigmoid output, so train with binary cross-entropy and report accuracy
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [24]:
# Train for 20 epochs in batches of 10; the test split is passed as validation data
history = model.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=20,
    validation_data=(X_test, y_test),
    verbose=True,
)



Train on 750 samples, validate on 250 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [25]:
# Score the first model on the data it was trained on
loss, accuracy = model.evaluate(x=X_train, y=y_train, verbose=False)
print(
    "Training Accuracy: {:.4f}".format(accuracy)
)


Training Accuracy: 1.0000


In [26]:
# Score the first model on the held-out test split
loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=False)
print(
    "Testing Accuracy: {:.4f}".format(accuracy)
)


Testing Accuracy: 0.7840


# Let's do the prediction

In [85]:
import numpy as np

# Alternative input to try: "good food ,will come again"
phrase = "food made me angry "

# Encode the phrase the same way as the training data: word indices first,
# then padding at the end up to `maxlen`.
tokens = tokenizer.texts_to_sequences([phrase])
pad_tokens = pad_sequences(tokens, maxlen=maxlen, padding='post')

for encoded in (tokens, pad_tokens):
    print(encoded)

[[10, 100, 75, 1635]]
[[  10  100   75 1635    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]]


In [86]:
 # Threshold the sigmoid probability at 0.5 to get the 0/1 class.
 # Sequential.predict_classes is deprecated and absent from newer Keras
 # releases; for a single sigmoid output this expression is its documented
 # replacement and gives the same int32 array, one row per input.
 val = (model.predict(pad_tokens) > 0.5).astype("int32")

In [87]:
# Turn the predicted class of the single input phrase into a message.
# NOTE(review): predictSentiments is defined in a later cell (In [84]), so on
# a fresh kernel this cell raises NameError unless that cell is run first.
print(predictSentiments(val[0][0]))

Customer is gone forever,


In [84]:
def predictSentiments(indexvalue):
    """Map a predicted class to a customer-retention message.

    Parameters
    ----------
    indexvalue : int
        Predicted class for one review; 0 selects the "gone" message,
        any other value selects the "got back" message.

    Returns
    -------
    str
        'Customer is gone forever,' when ``indexvalue`` is 0, otherwise
        'you got back your customer'.
    """
    # Decide from the argument itself. The previous version ignored it and
    # read the notebook-global `val`, so the result did not depend on the call.
    if indexvalue == 0:
        return 'Customer is gone forever,'
    return 'you got back your customer'

# Another model

In [85]:
# Second model: same embedding front-end, but the sequence axis is collapsed
# with global max pooling instead of being flattened.
model2 = Sequential()

for layer in (
    layers.Embedding(input_dim=vocab_size,
                     output_dim=embedding_dim,
                     input_length=maxlen),
    layers.GlobalMaxPool1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
):
    model2.add(layer)

model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 100, 50)           87350     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 50)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 10)                510       
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 11        
Total params: 87,871
Trainable params: 87,871
Non-trainable params: 0
_________________________________________________________________


In [86]:
# Same training schedule as the first model: 20 epochs, batches of 10
history2 = model2.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=20,
    validation_data=(X_test, y_test),
    verbose=True,
)

Train on 750 samples, validate on 250 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [87]:
# Score the second model on the data it was trained on
loss, accuracy = model2.evaluate(x=X_train, y=y_train, verbose=False)
print(
    "Training Accuracy: {:.4f}".format(accuracy)
)



Training Accuracy: 1.0000


In [88]:
# Score the second model on the held-out test split
loss, accuracy = model2.evaluate(x=X_test, y=y_test, verbose=False)
print(
    "Testing Accuracy: {:.4f}".format(accuracy)
)



Testing Accuracy: 0.8160


# Using Pre-Trained GloVe vectors

In [94]:
def create_embedding_matrix(filepath, word_index, embedding_dim, verbose=True):
    """Build an embedding matrix for ``word_index`` from a word-vector text file.

    Each non-empty line of the file is expected to hold a word followed by
    its vector components, separated by whitespace.

    Parameters
    ----------
    filepath : str
        Path to the pre-trained vectors file; it is read as UTF-8.
    word_index : dict
        Maps each vocabulary word to the integer row it occupies in the matrix.
    embedding_dim : int
        Number of leading vector components to keep per word.
    verbose : bool, optional
        When true (the default, which matches the previous behaviour) print
        every matched word together with its index.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Rows for
        which no word is found in the file stay all-zero.
    """
    # One extra row because index 0 is reserved and not assigned to a word.
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Explicit encoding: vector files can contain non-ASCII tokens, so relying
    # on the platform default encoding may fail or mis-decode them.
    with open(filepath, encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            if not parts:
                # Skip blank lines instead of failing while unpacking them.
                continue
            word, vector = parts[0], parts[1:]
            idx = word_index.get(word)
            if idx is None:
                # Word is not part of our vocabulary.
                continue
            if verbose:
                print("{} {} ".format(word, idx))
            embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix

In [95]:
# Look up 50-dimensional pre-trained vectors for the words the tokenizer indexed
embedding_dim = 50
filePath = 'GloVe_PreTrained/glove.6B.50d.txt'

embedding_matrix = create_embedding_matrix(
    filepath=filePath,
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

the 1 
of 11 
to 5 
and 2 
in 14 
a 6 
for 13 
that 27 
on 34 
is 7 
was 3 
said 176 
with 21 
he 68 
as 42 
it 8 
by 71 
at 37 
from 72 
his 268 
an 44 
be 20 
has 94 
are 31 
have 30 
but 26 
were 24 
not 12 
this 9 
who 286 
they 33 
had 28 
i 4 
which 70 
will 46 
their 56 
or 81 
its 1448 
one 50 
after 138 
new 313 
been 64 
also 53 
we 17 
would 66 
two 240 
more 123 
first 144 
about 93 
up 59 
when 78 
year 694 
there 48 
all 43 
out 58 
she 234 
other 107 
people 170 
her 263 
than 106 
over 121 
into 837 
last 305 
some 101 
time 40 
you 29 
if 51 
no 62 
can 163 
three 650 
do 204 
only 52 
could 112 
us 67 
so 25 
them 148 
what 63 
him 1566 
during 301 
before 157 
may 401 
since 190 
many 213 
while 217 
where 316 
because 127 
now 136 
made 100 
like 36 
between 1131 
did 69 
just 41 
day 173 
under 441 
such 714 
second 667 
then 159 
company 881 
group 592 
any 134 
four 1113 
being 114 
down 153 
back 32 
off 218 
well 89 
week 1030 
still 167 
both 236 
even 88 
hig

diverse 1057 
subway 451 
touched 1147 
wound 1366 
bread 615 
madison 855 
pepper 1228 
proven 1003 
literally 423 
indoor 1039 
hearts 1163 
joy 1076 
99 1666 
breakfast 165 
forth 989 
waited 207 
impressed 227 
brick 672 
mom 816 
honest 669 
receives 750 
absolute 1198 
somehow 1326 
fails 921 
folks 692 
chocolate 1247 
dirty 292 
avoided 1313 
presentation 540 
whenever 1017 
witnessed 955 
sorry 740 
overhaul 953 
thirty 1638 
butter 1223 
cow 533 
finger 801 
soundtrack 1489 
flower 640 
theft 833 
letting 1116 
egg 366 
duo 1200 
disappointment 324 
array 1192 
loves 1681 
suggestions 745 
fare 587 
item 802 
han 776 
rings 1290 
poorly 1476 
teeth 787 
meal 172 
edinburgh 999 
blown 1604 
sauce 152 
vegetables 354 
highlighted 1207 
cafe 1521 
checked 1255 
priced 374 
juice 1226 
leather 1337 
perfectly 396 
greens 1162 
ingredients 1318 
greeted 565 
recommendation 542 
ignore 1344 
filling 734 
dining 229 
tables 328 
dish 187 
luke 982 
chef 437 
ladies 1661 
meals 390 


nobu 1694 
godfathers 1652 
panna 1461 
trippy 1387 
thats 756 
lox 1106 
ians 954 
overcooked 511 
crema 1299 
omelets 1434 
skimp 1684 
yellowtail 1540 
gringos 1170 
tartare 1049 
moz 1674 
sangria 1393 
refried 753 
gristle 1745 
boba 1308 
gyros 589 
bruschetta 813 
5lb 1743 
tiramisu 747 
doughy 966 
carpaccio 1541 
callings 1187 
baklava 1711 
tater 1068 
yucky 773 
flavorless 967 
anyways 1656 
spicier 1351 
huevos 1670 
cannoli 748 
brushfire 1419 
outshining 984 
rancheros 1671 
soooo 785 
bouchon 1493 
lordy 1278 
choux 806 
fav 1444 
mmmm 1589 
bachi 634 
ohhh 1044 
wagyu 1053 
ribeye 944 
rge 1591 
wontons 1583 
sooooo 1182 
blandest 795 
crêpe 1563 
heimer 1303 
unexperienced 1024 
relleno 1593 
40min 1130 
steiners 1601 
kiddos 1082 
eew 951 
definately 693 
untoasted 1011 
delicioso 1016 
sause 1180 
sals 1345 


In [100]:
# Show the first two rows (indices 0 and 1) of the embedding matrix
print(embedding_matrix[0:2])

[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 4.18000013e-01  2.49679998e-01 -4.12420005e-01  1.21699996e-01
   3.45270008e-01 -4.44569997e-02 -4.96879995e-01 -1.78619996e-01
  -6.60229998e-04 -6.56599998e-01  2.7843

In [113]:
# Third model: the embedding layer starts from the pre-trained matrix and is
# still updated during training (trainable=True; the original note suggests
# switching it to False).
model3 = Sequential()

for layer in (
    layers.Embedding(vocab_size,
                     embedding_dim,
                     weights=[embedding_matrix],
                     input_length=maxlen,
                     trainable=True),
    # Optional convolution stage, currently disabled:
    # layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPool1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
):
    model3.add(layer)

model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 100, 50)           87350     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 96, 128)           32128     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 128)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 10)                1290      
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 11        
Total params: 120,779
Trainable params: 120,779
Non-trainable params: 0
_________________________________________________________________


In [114]:
# Same training schedule as the other models: 20 epochs, batches of 10
history3 = model3.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=20,
    validation_data=(X_test, y_test),
    verbose=True,
)

Train on 750 samples, validate on 250 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [115]:
# Score the third model on the data it was trained on
loss, accuracy = model3.evaluate(x=X_train, y=y_train, verbose=False)
print(
    "Training Accuracy: {:.4f}".format(accuracy)
)

Training Accuracy: 1.0000


In [116]:
# Score the third model on the held-out test split. The label previously
# said "Training Accuracy" even though the test data is evaluated here.
loss, accuracy = model3.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))


Training Accuracy: 0.7840
