In [None]:
# Colab-specific: mount Google Drive so the dataset stored under MyDrive is readable.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix
import re
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the WordNet corpus that the WordNetLemmatizer imported above looks words up in.
nltk.download('wordnet')
# NOTE(review): omw-1.4 (Open Multilingual WordNet) is presumably needed by the
# installed NLTK's WordNet reader — confirm against the NLTK version in use.
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Location of the review corpus on the mounted Drive (Colab-specific absolute path).
DATA_PATH = "/content/drive/MyDrive/capstone/data/deceptive-opinion.csv"

data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...


# Data cleaning

In [None]:
# Shared lemmatizer instance; preprocess() calls lemmatizer.lemmatize(token, "v") on every token.
lemmatizer = WordNetLemmatizer()
# Tokens that preprocess() discards: English function words (pronouns, auxiliaries,
# prepositions, contraction fragments), the negation "not", and 'br'
# (presumably a leftover HTML line-break tag — TODO confirm).
stop_words = set("""
    br the i me my myself we our ours ourselves you you're you've
    you'll you'd your yours yourself yourselves he him his himself
    she she's her hers herself it it's its itself they them their
    theirs themselves what which who whom this that that'll these those
    am is are was were be been being have has had having do does
    did doing a an and but if or because as until while of
    at by for with about against between into through during before after
    above below to from up down in out on off over under again further
    then once here there when where why how all any both each few more
    most other some such only own same so than too very
    s t can will just don don't should should've now d ll m o re
    ve y ain aren aren't couldn couldn't didn didn't doesn doesn't hadn
    hadn't hasn hasn't haven haven't isn isn't ma mightn mightn't mustn
    mustn't needn needn't shan shan't shouldn shouldn't wasn wasn't weren weren't
    won won't wouldn wouldn't not
""".split())

In [None]:
def preprocess(text):
    """Lower-case a review, drop stop words, and lemmatize the remaining tokens as verbs.

    Args:
        text: Review text as a string.

    Returns:
        The surviving tokens, lemmatized and joined by single spaces.

    Note:
        Tokens are delimited by whitespace only, so punctuation stays attached
        to its word (e.g. "the," is not recognised as the stop word "the").
    """
    # split() with no argument breaks on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty tokens. The previous split(" ") left words
    # separated by newlines/tabs glued together, so stop words next to them were
    # never removed, and repeated spaces produced '' tokens.
    tokens = [word for word in text.lower().split() if word not in stop_words]
    # "v" makes the lemmatizer treat every token as a verb.
    lemmas = [lemmatizer.lemmatize(token, "v") for token in tokens]
    return " ".join(lemmas)
# Ordered (compiled pattern, replacement) rules. The irregular forms "won't" and
# "can't" must run before the generic "n't" rule, otherwise they would be turned
# into "wo not" / "ca not".
_CONTRACTION_RULES = (
    # Case-insensitive with the first letter captured: this function runs before
    # lower-casing, so sentence-initial "Won't"/"Can't" must be matched here too.
    (re.compile(r"(w)on't", re.IGNORECASE), r"\1ill not"),
    (re.compile(r"(c)an't", re.IGNORECASE), r"\1an not"),
    (re.compile(r"n't"), " not"),
    (re.compile(r"'re"), " are"),
    # NOTE(review): also rewrites possessives ("hotel's" -> "hotel is").
    (re.compile(r"'s"), " is"),
    (re.compile(r"'d"), " would"),
    (re.compile(r"'ll"), " will"),
    # Catches any "'t" not already consumed by the "n't" rules above.
    (re.compile(r"'t"), " not"),
    (re.compile(r"'ve"), " have"),
    (re.compile(r"'m"), " am"),
)


def decontract(text):
    """Expand common English contractions in ``text``.

    Args:
        text: Input string; only the straight apostrophe (') is recognised.

    Returns:
        The string with contractions expanded, e.g. "didn't" -> "did not",
        "they're" -> "they are". "won't"/"can't" keep the case of their first
        letter ("Won't" -> "Will not").
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

## Pre-processing

In [None]:
# Expand contractions first, then lower-case, strip stop words and lemmatize.
# The raw text column is overwritten with the cleaned text.
data["text"] = data["text"].apply(decontract).apply(preprocess)
data.head()

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,stay one night getaway family thursday. triple...
1,truthful,hyatt,positive,TripAdvisor,triple rate upgrade view room less $200 also i...
2,truthful,hyatt,positive,TripAdvisor,come little late finally catch review past sev...
3,truthful,omni,positive,TripAdvisor,"omni chicago really deliver fronts, spaciousne..."
4,truthful,hyatt,positive,TripAdvisor,ask high floor away elevator got. room pleasan...


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder used to turn the string class labels into integer codes.
le=LabelEncoder()

In [None]:
# Overwrite the string labels with integer codes. In the frame displayed in the
# next cell, the 'truthful' rows show 1 and the MTurk rows show 0.
data["deceptive"] = le.fit_transform(data["deceptive"])


In [None]:
data # label codes: 1 = truthful (real), 0 = deceptive (fake)

Unnamed: 0,deceptive,hotel,polarity,source,text
0,1,conrad,positive,TripAdvisor,stay one night getaway family thursday. triple...
1,1,hyatt,positive,TripAdvisor,triple rate upgrade view room less $200 also i...
2,1,hyatt,positive,TripAdvisor,come little late finally catch review past sev...
3,1,omni,positive,TripAdvisor,"omni chicago really deliver fronts, spaciousne..."
4,1,hyatt,positive,TripAdvisor,ask high floor away elevator got. room pleasan...
...,...,...,...,...,...
1595,0,intercontinental,negative,MTurk,problems start book intercontinental chicago o...
1596,0,amalfi,negative,MTurk,amalfi hotel beautiful website interior decora...
1597,0,intercontinental,negative,MTurk,intercontinental chicago magnificent mile outs...
1598,0,palmer,negative,MTurk,"palmer house hilton, look good pictures, outsi..."


In [None]:
# Hold out 30% of the reviews for testing; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['deceptive'],
    test_size=0.3,
    random_state=1,
    shuffle=True,
)

In [None]:
# Peek at the training texts (the 70% side of the split; original row index is kept).
X_train

126     daughter chicago one night attend three-day co...
810     outmode wear furnish combine poor original des...
635     husband arrive 3 night stay 10th wed anniversa...
598     although much overprice opinion, hotel spotles...
880     recently stay sheraton onsite conference, prob...
                              ...                        
715     chicago hilton great hotel stay fantastic. hot...
905     even though review hotel good, good thing loca...
1096    stay jam times, go hotel chicago. last trip ma...
235     ambassador east awesome hotel!!! understand ho...
1061    stay enthusiastic positive review trip advisor...
Name: text, Length: 1120, dtype: object

## Tokenization and padding

In [None]:
from keras_preprocessing.sequence import pad_sequences

In [None]:
top_words = 6000    # vocabulary cap: passed to the Tokenizer and used to size the Embedding layer
max_review_length = 130  # length every tokenized review is padded/truncated to
embedding_vector_length = 32  # each word is mapped to a 32-dimensional vector
# NOTE(review): per the Keras docs, num_words keeps only the (num_words - 1) most
# frequent words, so this is "up to 5999 words" rather than exactly 6000 — confirm.
tokenizer = Tokenizer(num_words=top_words) # rarer words are dropped from the sequences
# The vocabulary is built from the training texts only.
tokenizer.fit_on_texts(X_train)
list_tokenized_train = tokenizer.texts_to_sequences(X_train) # each review becomes a list of integer word indices

In [None]:
# NOTE(review): pad_sequences defaults (padding/truncating side) are not overridden here — confirm they are the intended ones.
X_train_pad = pad_sequences(list_tokenized_train, maxlen=max_review_length)  # force every sequence to length max_review_length

#### TF-IDF vectorization

In [None]:
from tensorflow.keras.preprocessing.text import one_hot

In [None]:
# Unigram TF-IDF features; the vocabulary and IDF weights are learned from the training texts only.
vectorizer1 = TfidfVectorizer(ngram_range=(1, 1))
train_vectors = vectorizer1.fit_transform(X_train)
train_vectors

<1120x7395 sparse matrix of type '<class 'numpy.float64'>'
	with 71901 stored elements in Compressed Sparse Row format>

In [None]:
# Dense view of the sparse TF-IDF matrix, for visual inspection only (the result is not stored).
train_vectors.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# transform() only: the test texts reuse the vectorizer that was fitted on X_train.
test_vectors=vectorizer1.transform(X_test)

## Bi-LSTM

In [None]:
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
# Bi-LSTM binary classifier: embedding -> bidirectional LSTM -> single output unit.
model = Sequential()
# top_words + 1 rows so that index 0 (used for padding) and every tokenizer index fit.
model.add(Embedding(top_words+1, embedding_vector_length, input_length=max_review_length))
# 100 units per direction; the two directions are concatenated (200 features).
model.add(Bidirectional(LSTM(100)))
# Sigmoid (not relu) on the output: binary_crossentropy expects a probability in
# (0, 1). A relu output is unbounded and can be exactly 0, which gives a
# degenerate loss and a zero gradient for that unit.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 32)           192032    
                                                                 
 bidirectional (Bidirectiona  (None, 200)              106400    
 l)                                                              
                                                                 
 dense (Dense)               (None, 1)                 201       
                                                                 
Total params: 298,633
Trainable params: 298,633
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 30 epochs in mini-batches of 64; the returned History object is kept in `history`.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=64,
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Encode the held-out texts with the tokenizer fitted on the training set and
# pad them to the same length the model was trained on.
list_tokenized_test = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(list_tokenized_test, maxlen=max_review_length)
prediction = model.predict(X_test_pad)
# Threshold the model output at 0.5 to get a hard class decision.
y_pred = (prediction > 0.5)
# scikit-learn metrics take (y_true, y_pred) in that order; keep it consistent
# with the confusion_matrix call below.
print("Accuracy of the model : ", accuracy_score(y_test, y_pred))
print('F1-score: ', f1_score(y_test, y_pred))
print('Confusion matrix:')
confusion_matrix(y_test,y_pred)

Accuracy of the model :  0.7583333333333333
F1-score:  0.7387387387387387
Confusion matrix:


array([[200,  52],
       [ 64, 164]])

In [None]:
from sklearn.metrics import classification_report

In [None]:
 # Per-class precision, recall and F1 on the held-out test set.
 print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.76      0.79      0.78       252
           1       0.76      0.72      0.74       228

    accuracy                           0.76       480
   macro avg       0.76      0.76      0.76       480
weighted avg       0.76      0.76      0.76       480

