# Sentiment classifier - Karis Gwet

# Import data

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("amazon-reviews.csv", encoding="latin-1")

In [3]:
df.head()

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tr...,__label__2
1,The best soundtrack ever to anything.: I'm re...,__label__2
2,Amazing!: This soundtrack is my favorite musi...,__label__2
3,Excellent Soundtrack: I truly like this sound...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After H...",__label__2


In [4]:
X = df.text
y = df.label

In [5]:
X.shape

(10000,)

In [6]:
y.shape

(10000,)

On va changer nos labels en 0 et 1, pour que ce soit plus facilement utilisable dans le réseau de neurones

In [7]:
for i in range(0,len(df["label"])):
    if df["label"][i] == "__label__1 ":
        df["label"][i] = 0
    else:
        df["label"][i] = 1
        
#df.loc[df.label=="__label__2 ","label"]=1

In [8]:
df.shape

(10000, 2)

In [9]:
df.head(10)

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tr...,1
1,The best soundtrack ever to anything.: I'm re...,1
2,Amazing!: This soundtrack is my favorite musi...,1
3,Excellent Soundtrack: I truly like this sound...,1
4,"Remember, Pull Your Jaw Off The Floor After H...",1
5,an absolute masterpiece: I am quite sure any ...,1
6,"Buyer beware: This is a self-published book, ...",0
7,Glorious story: I loved Whisper of the wicked...,1
8,A FIVE STAR BOOK: I just finished reading Whi...,1
9,Whispers of the Wicked Saints: This was a eas...,1


# Import neural network libraries

In [10]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

# Preprocessing

In [11]:
import nltk
import re
from nltk.corpus import stopwords
import string
from nltk.stem.porter import PorterStemmer

In [12]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Karis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
ps = PorterStemmer()
corpus = []
for i in range(0,len(df["text"])):
    sentence = re.sub('[^a-zA-Z]', ' ', df["text"][i])
    sentence = sentence.lower()
    sentence = sentence.split()
    
    sentence = [ps.stem(word) for word in sentence if not word in stopwords.words("english")]
    sentence = ' '.join(sentence)
    corpus.append(sentence)

In [14]:
re.sub('[^a-zA-Z]', ' ', "I am run4ning")

'I am run ning'

In [15]:
len(corpus)

10000

### One hot representation part 2

In [16]:
# Vocabulary size
voc_size = 5000 #number of words in my dictionary
onehot_rep = [one_hot(words,voc_size) for words in corpus]
onehot_rep

[[4269,
  2050,
  3218,
  792,
  2850,
  413,
  4086,
  4561,
  664,
  4828,
  3091,
  2238,
  3511,
  2050,
  1100,
  1811,
  2042,
  316,
  3018,
  3187,
  316,
  1739,
  4769,
  316,
  4536,
  3187,
  2475,
  3018,
  2059,
  4288,
  4143,
  2807,
  2539,
  587,
  2306,
  430,
  3292,
  4542,
  159,
  2238,
  4581,
  4411,
  1418,
  4207],
 [2475,
  1325,
  4536,
  2206,
  3949,
  3131,
  2696,
  114,
  2475,
  316,
  1325,
  3681,
  2855,
  2696,
  3631,
  2982,
  2984,
  1463,
  1170,
  1185,
  4538,
  3018,
  3880,
  4207,
  1136,
  4086,
  4862,
  3219,
  1856,
  713,
  1382,
  4076,
  565,
  133,
  114,
  332,
  902,
  193,
  2741,
  2375,
  2538,
  1535,
  2238,
  4050,
  989,
  1033],
 [3151,
  1325,
  1591,
  3018,
  3014,
  3778,
  848,
  111,
  845,
  4052,
  727,
  3187,
  316,
  2498,
  3437,
  2108,
  2007,
  1976,
  3396,
  3255,
  833,
  3382,
  860,
  1145,
  1136,
  4929,
  3741,
  413,
  2468,
  1739,
  4769,
  3014,
  3711,
  3014,
  4303,
  75,
  3138,
  2967,
  1

In [17]:
len(onehot_rep)

10000

### Embedding representation

In [18]:
sent_length = 90 #to have our sequence with the same length
embedded_docs = pad_sequences(onehot_rep, padding='pre', maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 4411 1418 4207]
 [   0    0    0 ... 4050  989 1033]
 [   0    0    0 ... 4695 2440 3396]
 ...
 [   0    0    0 ... 3505 2553 1680]
 [   0    0    0 ... 1611 1776  902]
 [   0    0    0 ... 4623 4033 2111]]


In [19]:
len(embedded_docs)

10000

In [20]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0, 4269, 2050, 3218,  792, 2850,  413, 4086, 4561,  664,
       4828, 3091, 2238, 3511, 2050, 1100, 1811, 2042,  316, 3018, 3187,
        316, 1739, 4769,  316, 4536, 3187, 2475, 3018, 2059, 4288, 4143,
       2807, 2539,  587, 2306,  430, 3292, 4542,  159, 2238, 4581, 4411,
       1418, 4207])

# Model

In [21]:
embedding_vector_features = 40 #dimension/number of features
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length = sent_length)) #output = embedding matrix
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 90, 40)            200000    
                                                                 
 lstm (LSTM)                 (None, 100)               56400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________


In [22]:
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [23]:
model2 = Sequential()
model2.add(Embedding(voc_size, embedding_vector_features, input_length = sent_length))
model2.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model2.add(Dropout(0.25))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Flatten())
model2.add(Dense(1, activation='sigmoid'))
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 90, 40)            200000    
                                                                 
 conv1d (Conv1D)             (None, 88, 64)            7744      
                                                                 
 dropout (Dropout)           (None, 88, 64)            0         
                                                                 
 max_pooling1d (MaxPooling1D  (None, 44, 64)           0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 2816)              0         
                                                                 
 dense_1 (Dense)             (None, 1)                 2817      
                                                      

In [24]:
len(embedded_docs),y.shape

(10000, (10000,))

In [25]:
import numpy as np

X = np.array(embedded_docs).astype(np.float32)
y = np.array(y).astype(np.float32)

In [26]:
print(X.shape)
print(y.shape)

(10000, 90)
(10000,)


In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)

# Train

In [28]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x28d14ab8f70>

In [29]:
model2.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x28d78b6dca0>

Here we can see some overfitting. But the point of the exercice was to show how we can classify with word embedding

In [30]:
y_pred = model.predict(X_test)
y_pred

array([[3.1698414e-03],
       [3.7132594e-04],
       [7.2981156e-03],
       ...,
       [9.9924302e-01],
       [4.4493279e-01],
       [2.5517060e-04]], dtype=float32)

In [31]:
y_test

array([0., 0., 0., ..., 1., 0., 1.], dtype=float32)

In [32]:
model.evaluate(X_test, y_test)



[0.8962670564651489, 0.7864999771118164]

In [33]:
y2_pred = model2.predict(X_test)
y2_pred

array([[9.7512277e-03],
       [4.3696255e-06],
       [5.6754626e-02],
       ...,
       [9.9999702e-01],
       [1.4759985e-01],
       [1.2766809e-04]], dtype=float32)

In [34]:
model2.evaluate(X_test, y_test)



[0.9444425702095032, 0.8069999814033508]

By doing with a 1D convolution network, we find better results than with LSTM recurrent neural networks