In [63]:
import numpy as np
import pandas as pd


In [64]:
# Location of the review dataset CSV; change it here if the file lives elsewhere.
DATA_PATH = "/content/TripAdvisor_Hotel_Review_Dataset.csv"
df = pd.read_csv(DATA_PATH)
# Fixed seed so the preview shows the same four rows on every run.
df.sample(4, random_state=42)

Unnamed: 0,Lable,Text
444,deceptive,"The Omni Chicago is hands down, the best hotel..."
485,deceptive,My wife and I's stay at the Sheraton Chicago ...
707,deceptive,My stay at the James Hotel in Chicago was fant...
630,deceptive,There were many positives when staying in this...


In [65]:
## Convert the target column to numeric codes: truthful -> 1, deceptive -> 0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the df["Lable"] selection: that form is a
# chained assignment and does not update df when pandas Copy-on-Write is active.
# astype(int) makes the integer dtype explicit and raises if a label other than
# "truthful"/"deceptive" (or a missing value) is left after the replacement.
df["Lable"] = df["Lable"].replace({"truthful":1,"deceptive":0}).astype(int)

-> This dataset contains hotel reviews labelled as truthful or deceptive (fake).

We have to preprocess the text before making predictions.

The steps are:
1. Remove links
2. Remove punctuation
3. Remove numbers
4. Remove emojis
5. Remove stop words
6. Stem the words (convert each word into its root form; the code below does this with lemmatization)



In [66]:
# Load NLTK's English stop-word list; it is used to strip stop words from the
# lower-cased review text.
import nltk
nltk.download("stopwords")  # fetches the corpus unless it is already up to date
stop = nltk.corpus.stopwords.words("english")

def rem_stopwords(input_txt, stop_words=None):
    """Lower-case ``input_txt`` and drop every stop word from it.

    Parameters
    ----------
    input_txt : str
        Raw text. It is lower-cased and split on whitespace only, so a token
        with punctuation attached (e.g. ``"the,"``) is not treated as a stop
        word.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the module-level ``stop`` list is used,
        which keeps existing one-argument calls working unchanged.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # A set gives constant-time membership tests instead of scanning a list
    # once per word.
    blocked = set(stop if stop_words is None else stop_words)
    kept_words = [word for word in input_txt.lower().split() if word not in blocked]
    return " ".join(kept_words)

# Build the working column: lower-cased review text with stop words removed.
df["Clean_Text"] = df["Text"].apply(rem_stopwords)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [67]:
import re
# Keep only letters, whitespace and apostrophes: every other character becomes a
# space, then split()/join collapses runs of whitespace into single spaces.
# The pattern is a raw string because "\s" inside a normal string literal is an
# invalid escape sequence; the regex itself is unchanged.
df["Clean_Text"] = df["Clean_Text"].apply(lambda s: ' '.join(re.sub(r"[^a-zA-Z\s']", " ", s).split()))
df[["Text","Clean_Text"]]

Unnamed: 0,Text,Clean_Text
0,We stayed for a one night getaway with family ...,stayed one night getaway family thursday tripl...
1,Triple A rate with upgrade to view room was le...,triple rate upgrade view room less also includ...
2,This comes a little late as I'm finally catchi...,comes little late i'm finally catching reviews...
3,The Omni Chicago really delivers on all fronts...,omni chicago really delivers fronts spaciousne...
4,I asked for a high floor away from the elevato...,asked high floor away elevator got room pleasa...
...,...,...
795,The InterContinental Chicago Hotel is one of t...,intercontinental chicago hotel one finest hote...
796,My wife and me stayed in the Amalfi Hotel Chic...,wife stayed amalfi hotel chicago last august w...
797,"Last month, my husband and I stayed at the Int...",last month husband stayed intercontinental chi...
798,Amazing! I was swept away when I walked into t...,amazing swept away walked hotel gorgeous staff...


In [68]:
## removing numbers
# Assign the result back instead of calling replace(..., inplace=True) on the
# df["Clean_Text"] selection: that form is a chained assignment and does not
# update df when pandas Copy-on-Write is active.
# The pattern is a raw string because "\d" in a normal string literal is an
# invalid escape sequence.
# Note: the earlier letters-only regex already turned digits into spaces, so this
# step only matters if that cell is skipped.
df["Clean_Text"] = df["Clean_Text"].replace(r'\d+', '', regex=True)

In [69]:
def remove_Emojify(inputString):
    """Return ``inputString`` with every non-ASCII character removed.

    The text is encoded to ASCII with unencodable characters dropped and then
    decoded back to ``str``. This removes emojis, and also any other non-ASCII
    character such as accented letters.
    """
    ascii_bytes = inputString.encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('ascii')
    return ascii_text
# Strip non-ASCII characters (emojis included) from every cleaned review.
df["Clean_Text"] = df["Clean_Text"].apply(remove_Emojify)

In [70]:
## to remove the special characters
def remove_special(text):
    """Replace every non-alphanumeric character of ``text`` with a space.

    Alphanumeric characters (as judged by ``str.isalnum``) are kept as they
    are. The result always starts with one extra leading space, so an empty
    input yields ``" "``.
    """
    return " " + "".join(ch if ch.isalnum() else " " for ch in text)
# Run remove_special over every cleaned review: each non-alphanumeric character
# (including the apostrophes kept by the earlier regex) becomes a space, so a
# word such as "i'm" ends up as "i m".
df["Clean_Text"] = df["Clean_Text"].apply(remove_special)

In [71]:
from nltk.tokenize import RegexpTokenizer

# Turn each cleaned review into a list of word tokens (runs of \w characters),
# then show the original text next to its token list.
tokeniser = RegexpTokenizer(r'\w+')
df["Clean_Text"] = df["Clean_Text"].apply(tokeniser.tokenize)
df[["Text","Clean_Text"]]

Unnamed: 0,Text,Clean_Text
0,We stayed for a one night getaway with family ...,"[stayed, one, night, getaway, family, thursday..."
1,Triple A rate with upgrade to view room was le...,"[triple, rate, upgrade, view, room, less, also..."
2,This comes a little late as I'm finally catchi...,"[comes, little, late, i, m, finally, catching,..."
3,The Omni Chicago really delivers on all fronts...,"[omni, chicago, really, delivers, fronts, spac..."
4,I asked for a high floor away from the elevato...,"[asked, high, floor, away, elevator, got, room..."
...,...,...
795,The InterContinental Chicago Hotel is one of t...,"[intercontinental, chicago, hotel, one, finest..."
796,My wife and me stayed in the Amalfi Hotel Chic...,"[wife, stayed, amalfi, hotel, chicago, last, a..."
797,"Last month, my husband and I stayed at the Int...","[last, month, husband, stayed, intercontinenta..."
798,Amazing! I was swept away when I walked into t...,"[amazing, swept, away, walked, hotel, gorgeous..."


In [72]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatiser = WordNetLemmatizer()

# Lemmatise every token as a verb (pos='v') and join the tokens of each review
# back into a single space-separated string, in one pass.
df["Clean_Text"] = df["Clean_Text"].apply(
    lambda tokens: ' '.join(lemmatiser.lemmatize(token, pos='v') for token in tokens)
)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [93]:
## Convert the cleaned text into a numeric bag-of-words table.
from sklearn.feature_extraction.text import CountVectorizer
# Cap the vocabulary at 2000 features.
cv = CountVectorizer(max_features=2000)
# The vectorizer is fitted on every row of df, not only on the training rows.
# NOTE(review): the vocabulary therefore also reflects reviews that end up in the
# test split; consider fitting on the training split only.
# .toarray() turns the result into a dense array.
X = cv.fit_transform(df["Clean_Text"]).toarray()

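In the above step I created one column for each distinct word in the cleaned review text (the `Clean_Text` column), limited to at most 2000 words; each value is the number of times that word appears in the review.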

In [87]:
## Hold out part of the data so the Naive Bayes models are scored on reviews
## they were not trained on. X holds the word-count features, df["Lable"] the target.
## test_size=0.33 keeps 33% of the rows for testing; random_state=42 fixes the shuffle.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, df["Lable"], test_size=0.33, random_state=42)

In [88]:
from sklearn import naive_bayes
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
# One estimator per Naive Bayes variant: multinomial, Bernoulli and Gaussian.
clf1, clf2, clf3 = MultinomialNB(), BernoulliNB(), GaussianNB()


In [89]:
# Train all three classifiers on the same training split.
for model in (clf1, clf2, clf3):
    model.fit(X_train, y_train)

In [90]:
# Predict the held-out labels with each fitted classifier, in the order clf1, clf2, clf3.
y_pred1, y_pred2, y_pred3 = [fitted.predict(X_test) for fitted in (clf1, clf2, clf3)]

In [91]:
from sklearn.metrics import accuracy_score
# Report test-set accuracy for each model, one line per classifier.
for label, predictions in (("Multinomian", y_pred1), ("Bernuli", y_pred2), ("Gaussian", y_pred3)):
    print(label, accuracy_score(y_test, predictions))

Multinomian 0.9015151515151515
Bernuli 0.8712121212121212
Gaussian 0.7196969696969697
