In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
import re

In [2]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
# Load the raw hotel reviews; the path is relative to the notebook's working directory.
df = pd.read_csv('hotel-reviews.csv')

In [4]:
# Peek at the first two rows to check the columns that were loaded.
df.head(2)

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy


In [5]:
# One-hot encode the response label; the resulting columns are prefixed with 'res'.
a = pd.get_dummies(df['Is_Response'], prefix='res')

In [6]:
# Append the indicator columns next to the original columns (column-wise concat).
df_label = pd.concat([df, a], axis='columns')

In [7]:
# Preview the frame with the indicator columns attached.
df_label.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,res_happy,res_not happy
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy,0,1
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy,0,1
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy,0,1
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy,1,0
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy,0,1


In [8]:
# Keep only 'res_happy' as the binary target: the text label and the
# complementary indicator column are dropped.
df1 = df_label.drop(['Is_Response', 'res_not happy'], axis='columns')

In [9]:
# Summarise the object-dtype (string) columns: count, unique, top, freq.
df1.describe(include="O")

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used
count,38932,38932,38932,38932
unique,38932,38932,11,3
top,id23344,I stayed at the Renaissance for six nights and...,Firefox,Desktop
freq,1,1,7367,15026


In [10]:
# Matches every character that is not an ASCII letter. The range must be
# written as 'A-Za-z': 'A-z' would also cover the punctuation that sits
# between 'Z' and 'a' in ASCII ([ \ ] ^ _ `).
_NON_LETTERS = re.compile('[^A-Za-z]')

# Filled on first real use so the NLTK corpora are read once, not once per word.
_TEXT_RESOURCES = {}


def _load_text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def text_process(text):
    """Normalise one review into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. replace every non-letter character with a space,
      2. lower-case and split on whitespace,
      3. drop English stop words,
      4. lemmatize the remaining tokens with WordNet.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; an empty string when the
        input contains no letters at all.
    """
    tokens = _NON_LETTERS.sub(' ', text).lower().split()
    if not tokens:
        # Nothing to filter or lemmatize, so the NLTK resources are not needed.
        return ''

    stop_words, lemmatizer = _load_text_resources()
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(kept)

In [11]:
# Clean every description with text_process and store the result in a new column.
df1['Reviews'] = df1['Description'].apply(text_process)

In [12]:
# Compare the raw 'Description' with the cleaned 'Reviews' column.
df1.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,res_happy,Reviews
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,0,room kind clean strong smell dog generally ave...
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,0,stayed crown plaza april april staff friendly ...
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,0,booked hotel hotwire lowest price could find g...
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,1,stayed husband son way alaska cruise loved hot...
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,0,girlfriend stayed celebrate th birthday planne...


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Number of leading rows used for the bag-of-words baseline.
n_samples = 10000

# Bag-of-words counts for the cleaned reviews. The matrix is kept sparse:
# train_test_split and MultinomialNB both accept SciPy sparse input, and a
# dense copy of a document-term matrix is almost entirely zeros.
# NOTE(review): the vocabulary is learned from all n_samples rows, i.e. before
# the train/test split is made.
cv = CountVectorizer()
X = cv.fit_transform(df1['Reviews'].iloc[:n_samples])
y = df1['res_happy'].iloc[:n_samples]

In [14]:
# Hold out 30% of the vectorized rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [15]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes baseline, fitted on the training word counts
# with the estimator's default settings.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
# Predict the happy / not-happy label for the held-out rows.
y_pred = classifier.predict(X_test)

In [17]:
from sklearn.metrics import confusion_matrix
# Cross-tabulate true labels against predictions for the held-out rows.
cm = confusion_matrix(y_test, y_pred)

In [18]:
# Rows are the true classes and columns the predicted classes (scikit-learn convention).
cm

array([[ 696,  263],
       [ 145, 1896]], dtype=int64)

In [19]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 on the held-out rows.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.73      0.77       959
           1       0.88      0.93      0.90      2041

    accuracy                           0.86      3000
   macro avg       0.85      0.83      0.84      3000
weighted avg       0.86      0.86      0.86      3000



In [20]:
from sklearn.metrics import accuracy_score
# Overall fraction of held-out rows that were classified correctly.
print(accuracy_score(y_test,y_pred))

0.864


In [13]:
from tensorflow import keras
from tensorflow.keras.layers import Dense,Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer,text_to_word_sequence
from tensorflow.keras.preprocessing import sequence
# max_feature = 2000

In [122]:
# Split the full set of cleaned reviews (text, not count vectors) for the LSTM model.
# This rebinds X_train / X_test / y_train / y_test used by the earlier baseline.
X_train, X_test, y_train, y_test = train_test_split(
    df1['Reviews'],
    df1['res_happy'],
    test_size=0.3,
    random_state=0,
)

In [123]:
# Build the word -> integer index from the training reviews only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [124]:
# Encode both splits as lists of word indices using the index fitted on the training split.
train_X = tokenizer.texts_to_sequences(X_train)
test_X = tokenizer.texts_to_sequences(X_test)

In [125]:
# Tokenizer word indices start at 1, so one extra slot is added to cover index 0.
vocab_size = len(tokenizer.word_index) + 1 

In [126]:
# Import from tensorflow.keras, like every other Keras import in this notebook;
# mixing the standalone `keras` package with `tensorflow.keras` is fragile.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Every review is forced to exactly max_len tokens: shorter ones are
# zero-padded at the end, longer ones are truncated.
max_len = 100
train_X = pad_sequences(train_X, padding='post', maxlen=max_len)
test_X = pad_sequences(test_X, padding='post', maxlen=max_len)

In [133]:
batch_size = 32
# Width of the LSTM layer. Kept as its own name so that changing the padded
# sequence length (max_len) does not silently resize the network.
lstm_units = 100

# Embedding -> LSTM -> single sigmoid unit for the binary happy / not-happy target.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len))
model.add(LSTM(lstm_units, dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be wrapped in print().
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 64)           2250944   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 2,317,045
Trainable params: 2,317,045
Non-trainable params: 0
_________________________________________________________________
None


In [134]:
# Keep the returned History so the per-epoch loss/accuracy can be inspected or
# plotted afterwards (history.history) instead of being thrown away.
# NOTE(review): the held-out test split doubles as the validation set here, so
# the validation metrics are not an independent estimate if they are used to
# choose the number of epochs.
history = model.fit(
    train_X,
    y_train,
    batch_size=batch_size,
    epochs=15,
    validation_data=(test_X, y_test),
)

Train on 27252 samples, validate on 11680 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x266ab8f4978>