In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix
import re
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [9]:
data = pd.read_csv("/content/drive/MyDrive/capstone/data/amazon_reviews.csv",encoding="ISO-8859-1")
data.head()

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav..."
1,2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...
2,3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...
3,4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...
4,5,__label1__,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...


# Data cleaning

In [10]:
lemmatizer = WordNetLemmatizer()
stop_words= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't","not"])

In [11]:
def preprocess(text):
  text=text.lower()
  text = [word for word in text.split(" ") if not word in stop_words] # removing stop words
  text= [lemmatizer.lemmatize(token, "v") for token in text] #Lemmatization
  text=" ".join(text) 
  return text
def decontract(text):
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text

## Pre-processing

In [12]:
data["REVIEW_TEXT"]=data["REVIEW_TEXT"].apply(lambda x:decontract(x))
data["REVIEW_TEXT"]=data["REVIEW_TEXT"].apply(lambda x:preprocess(x))
data.head()

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"least think so, product save day. keep around ..."
1,2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,lithium batteries something new introduce mark...
2,3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,purchase swing baby. 6 months pretty much grow...
3,4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,look inexpensive desk calcolatur is. work ever...
4,5,__label1__,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,use twice week result great. use teeth whiten ...


In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
le=LabelEncoder()

In [15]:
data["LABEL"] = le.fit_transform(data["LABEL"])


In [16]:
data # 0 - real,  1 - fake

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,1,0,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"least think so, product save day. keep around ..."
1,2,0,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,lithium batteries something new introduce mark...
2,3,0,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,purchase swing baby. 6 months pretty much grow...
3,4,0,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,look inexpensive desk calcolatur is. work ever...
4,5,0,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,use twice week result great. use teeth whiten ...
...,...,...,...,...,...,...,...,...,...
20995,20996,1,4,Y,Shoes,B00BXYM8T8,"Madden Girl Women's Gettaw Pump,Red Patent,7.5...",wide width is great!,"buy work. high arches, use arch support. hee..."
20996,20997,1,4,Y,Shoes,B0014C2ORK,"crocs Unisex Classic Clog,Khaki,6 US Men's / 8...",Love crocs!,crocs one two brand shoe feet day work! love c...
20997,20998,1,5,Y,Shoes,B000EX8CCQ,Minnetonka Men's 703 Leather Laced Softsole Mo...,I love moccasins This fit like it was custom m...,love moccasins fit like custom make me.<br />...
20998,20999,1,5,Y,Shoes,B00748YHVE,Ariat Womens Unbridled Fatbaby 9 B Powder Brown,"This fit well, comfortable, best investment",wish little durable. get catch bolt cross bunk...


In [17]:
X_train, X_test, y_train, y_test = train_test_split(data['REVIEW_TEXT'], data['LABEL'], test_size=0.3, random_state=1,shuffle =True)

In [18]:
X_train

5321     buy headband daughter. love wear it. cute desi...
15021    need remotes, want newest ones game like resor...
12559    excellent put two car seat 2014 honda odyssey....
15807    impress product notice benefit product promise...
15320    last year daughter bunny easter contest. purch...
                               ...                        
10955    several kensington products please them.  case...
17289    quality leather strap great price. use strap p...
5192     take level skill become efficient governmental...
12172    exceptionally quick delivery.  great price com...
235      nice rapid charge case compact add much bulk. ...
Name: REVIEW_TEXT, Length: 14700, dtype: object

## Preprocessing

In [19]:
from keras_preprocessing.sequence import pad_sequences

In [20]:
top_words = 6000    #top 6000 words in the vocabulary
max_review_length = 130  #maximum sentence length
embedding_vector_length = 32  #each word is mapped to a 32 dimensional vector
tokenizer = Tokenizer(num_words=top_words) # top 6000 are getting tokenized
tokenizer.fit_on_texts(X_train)
list_tokenized_train = tokenizer.texts_to_sequences(X_train) # text data is converted to sequence data

In [21]:
X_train_pad = pad_sequences(list_tokenized_train, maxlen=max_review_length)  # making all input sequence into same length

#### One-Hot encoding

In [22]:
from tensorflow.keras.preprocessing.text import one_hot

In [23]:
vectorizer1 = TfidfVectorizer(ngram_range = (1,1))
vectorizer1.fit(X_train)
train_vectors=vectorizer1.transform(X_train)
train_vectors

<14700x26781 sparse matrix of type '<class 'numpy.float64'>'
	with 442258 stored elements in Compressed Sparse Row format>

In [24]:
train_vectors.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
test_vectors=vectorizer1.transform(X_test)

## Bi-LSTM

In [26]:
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers

In [27]:
model = Sequential()  
model.add(Embedding(top_words+1, embedding_vector_length, input_length=max_review_length)) 
model.add(Bidirectional(LSTM(100)))
model.add(Dense(1, activation='relu'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 32)           192032    
                                                                 
 bidirectional (Bidirectiona  (None, 200)              106400    
 l)                                                              
                                                                 
 dense (Dense)               (None, 1)                 201       
                                                                 
Total params: 298,633
Trainable params: 298,633
Non-trainable params: 0
_________________________________________________________________


In [28]:
history = model.fit(X_train_pad,y_train, epochs=30, batch_size=64)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [29]:
list_tokenized_test = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(list_tokenized_test, maxlen=max_review_length)
prediction = model.predict(X_test_pad)
y_pred = (prediction > 0.5)
print("Accuracy of the model : ", accuracy_score(y_pred, y_test))
print('F1-score: ', f1_score(y_pred, y_test))
print('Confusion matrix:')
confusion_matrix(y_test,y_pred)

Accuracy of the model :  0.5861904761904762
F1-score:  0.5786326167771134
Confusion matrix:


array([[1903, 1312],
       [1295, 1790]])

In [30]:
from sklearn.metrics import classification_report

In [31]:
 print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.60      0.59      0.59      3215
           1       0.58      0.58      0.58      3085

    accuracy                           0.59      6300
   macro avg       0.59      0.59      0.59      6300
weighted avg       0.59      0.59      0.59      6300

