In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow
# from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import re

In [3]:
# Load the TripAdvisor hotel-review CSV from the project-relative data directory.
data = pd.read_csv(
    filepath_or_buffer='../data/tripadvisor/tripadvisor_hotel_reviews.csv',
)

In [4]:
# Preview the first rows to check that the file was parsed as expected.
data.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [192]:
# Summarise the frame: row count, column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


In [6]:
# NOTE(review): this cell repeats the previous `data.info()` call and prints the same summary.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


In [180]:
# Features are the review texts; the target is the integer star rating.
X = data['Review']
y = data['Rating']

# Hold out 30% of the reviews for evaluation.
# `random_state` pins the shuffle so a Restart & Run All reproduces the same split;
# `stratify=y` keeps the (imbalanced) rating distribution identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

### Topic Modeling

In the hotel industry, business owners usually receive useful information as unstructured text, such as comments on social networks or emails from customers.
They can group this information for analysis and use it to improve their goods and services.

In this section, topic modeling is used to analyze what each review is about.

In [8]:
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
# Cache for the NLTK stopword list so the corpus file is read only once.
_STOPWORD_CACHE = {}


def text_process(text, stop_words=None):
    """Normalise one review into a lowercase, stopword-free string.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lowercased and split on whitespace, stopwords are removed, and
    the remaining tokens are joined back with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : collection of str, optional
        Words to drop. When omitted, NLTK's English stopword list is loaded
        on the first call and reused afterwards.

    Returns
    -------
    str
        The cleaned text (an empty string if no token survives).
    """
    if stop_words is None:
        # Build the set once: calling `stopwords.words(...)` for every word
        # re-reads the list and makes each membership test a linear scan.
        if 'english' not in _STOPWORD_CACHE:
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']

    # Use `A-Za-z`, not `A-z`: the latter range also covers the characters
    # [ \ ] ^ _ and backtick, which would then survive the cleaning step.
    text = re.sub('[^A-Za-z]', ' ', text)
    tokens = text.lower().split()
    # Stemming is currently disabled:
    # tokens = [SnowballStemmer('english').stem(word) for word in tokens]
    return ' '.join(word for word in tokens if word not in stop_words)

In [9]:
# Clean every review with `text_process`; the topic-modelling cells below work on this cleaned text.
X_process = X.apply(text_process)

In [10]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

## Preprocess Text to count vector
# Ignore terms that occur in more than 90% of the reviews or in fewer than 3 reviews.
cv = CountVectorizer(max_df=0.9, min_df=3)

# Keep the document-term matrix sparse. Densifying it with `.toarray()` allocates
# (n_reviews x vocabulary) integers, which can run to gigabytes, and scikit-learn's
# LatentDirichletAllocation accepts sparse input directly.
X_vec = cv.fit_transform(X_process)

# tf = TfidfVectorizer(max_df=0.9,min_df=2)
# X_tf = tf.fit_transform(X_process).toarray()

In [11]:
## Latent Dirichlet Allocation
# Fit a 7-topic model on the count matrix. The fit starts from a random
# initialisation, so `random_state` is pinned to make the topics reproducible.
lda = LatentDirichletAllocation(n_components=7, random_state=42)
lda.fit(X_vec)

LatentDirichletAllocation(n_components=7)

#### Allocate the topic to vector


In [12]:
# Look the vocabulary up once instead of once per word inside the loop.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name on older installs.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# For each topic keep the 10 highest-weighted words, in ascending order of
# weight (the strongest word comes last).
word_topic = {
    f'topic_{i}': [str(feature_names[j]) for j in weights.argsort()[-10:]]
    for i, weights in enumerate(lda.components_)
}

In [13]:
# Display the top words collected for each topic.
word_topic

{'topic_0': ['check',
  'staff',
  'day',
  'service',
  'stay',
  'night',
  'told',
  'desk',
  'hotel',
  'room'],
 'topic_1': ['clean',
  'stayed',
  'rooms',
  'good',
  'stay',
  'room',
  'staff',
  'location',
  'great',
  'hotel'],
 'topic_2': ['barcelona',
  'nice',
  'stay',
  'old',
  'staff',
  'great',
  'juan',
  'room',
  'san',
  'hotel'],
 'topic_3': ['stay',
  'rooms',
  'clean',
  'staff',
  'location',
  'great',
  'good',
  'breakfast',
  'room',
  'hotel'],
 'topic_4': ['service',
  'view',
  'stayed',
  'rooms',
  'good',
  'stay',
  'nice',
  'great',
  'room',
  'hotel'],
 'topic_5': ['room',
  'people',
  'day',
  'time',
  'pool',
  'good',
  'great',
  'food',
  'resort',
  'beach'],
 'topic_6': ['experience',
  'stayed',
  'rooms',
  'like',
  'new',
  'staff',
  'room',
  'service',
  'stay',
  'hotel']}

In [209]:
# Transform the count matrix with the fitted `lda`; the next cell takes a per-row argmax of the result.
# NOTE(review): `lda` is rebound to a gensim model further down, and this call needs the
# scikit-learn model, so the cell depends on execution order.
topic_model = lda.transform(X_vec)

In [215]:
# Column index of the largest value in each row of `topic_model` (the dominant topic per row).
topic_model.argmax(axis=1)

array([0, 3, 0, ..., 0, 3, 5], dtype=int64)

### Topic Modeling by Gensim

In [108]:
import gensim.corpora as corpora
from gensim.models import LdaMulticore

# dictionary = corpora.Dictionary([X_process[:10]])
# dictionary = [corpora.Dictionary([doc.split()]) for doc in X_process[:10]]

In [98]:
# Whitespace-tokenise the first 10 cleaned reviews: one list of words per review.
temp = [review.split() for review in X_process[:10]]

In [101]:
# Build the gensim dictionary from the tokenised reviews in `temp`; it is passed as `id2word` to the model below.
dictionary = corpora.Dictionary(temp)

In [106]:
# Turn each of the first 10 cleaned reviews into a bag-of-words via `dictionary.doc2bow`.
corpus = []
for review in X_process[:10]:
    corpus.append(dictionary.doc2bow(review.split()))

In [111]:
# corpora.Dictionary([X_process[0]])
# Train a 5-topic gensim LDA on the bag-of-words corpus. `random_state` seeds the
# model so that repeated runs start from the same initialisation.
# NOTE(review): this rebinds `lda`, which earlier cells use for the scikit-learn model.
lda = LdaMulticore(corpus=corpus, num_topics=5, id2word=dictionary, random_state=42)

In [113]:
# List each topic with its highest-weighted words and their weights.
lda.print_topics()

[(0,
  '0.017*"room" + 0.015*"hotel" + 0.013*"n" + 0.013*"great" + 0.009*"stay" + 0.008*"nice" + 0.007*"staff" + 0.007*"suite" + 0.006*"bed" + 0.005*"night"'),
 (1,
  '0.014*"room" + 0.012*"stay" + 0.011*"hotel" + 0.010*"n" + 0.009*"great" + 0.007*"good" + 0.007*"nice" + 0.007*"staff" + 0.006*"night" + 0.005*"suite"'),
 (2,
  '0.017*"room" + 0.013*"hotel" + 0.009*"night" + 0.008*"nice" + 0.008*"desk" + 0.008*"suite" + 0.007*"good" + 0.007*"great" + 0.007*"staff" + 0.006*"stay"'),
 (3,
  '0.019*"hotel" + 0.013*"room" + 0.010*"great" + 0.008*"suite" + 0.007*"building" + 0.006*"like" + 0.006*"stay" + 0.006*"nice" + 0.006*"monaco" + 0.005*"good"'),
 (4,
  '0.016*"staff" + 0.014*"room" + 0.011*"hotel" + 0.010*"great" + 0.009*"stay" + 0.008*"desk" + 0.008*"n" + 0.007*"suite" + 0.007*"good" + 0.006*"nice"')]

### Multiclass Classification

In [157]:
from tensorflow.keras.preprocessing.text import Tokenizer,one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense
from tensorflow.keras.utils import to_categorical

In [120]:
# Keras tokenizer with default settings; it is fitted on the training reviews in the next cell.
token = Tokenizer()

In [121]:
# Build the word index from the training split only, so test reviews do not influence the vocabulary.
token.fit_on_texts(X_train)

In [124]:
# Encode both splits as lists of integer word ids using the fitted tokenizer.
x_train, x_test = (
    token.texts_to_sequences(reviews) for reviews in (X_train, X_test)
)

In [134]:
# Bring every sequence to exactly 150 ids, appending the padding at the end ('post').
x_train, x_test = (
    pad_sequences(sequences, maxlen=150, padding='post')
    for sequences in (x_train, x_test)
)

In [211]:
# One-hot encode the ratings. Ratings run from 1 to 5, so 6 classes are needed
# for the rating to be usable directly as the column index (column 0 stays unused).
# The `ndim` guard makes the cell safe to re-run: without it a second execution
# would one-hot encode the already-encoded matrix and corrupt the labels.
if np.ndim(y_train) == 1:
    y_train = to_categorical(y_train, num_classes=6)
if np.ndim(y_test) == 1:
    y_test = to_categorical(y_test, num_classes=6)

In [213]:
# Tokenizer word ids start at 1 and id 0 is used for padding, hence the extra row.
vocab_size = len(token.word_index) + 1

model = Sequential([
    # `mask_zero=True` marks the padding id 0 as "no token". Without it the LSTM
    # also steps through the trailing zeros added by post-padding, so its final
    # state for short reviews is dominated by padding rather than by the text.
    Embedding(input_dim=vocab_size, output_dim=64, input_length=150, mask_zero=True),
    LSTM(128, dropout=0.2),
    # 6 outputs to match the 6-column one-hot labels.
    Dense(6, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [214]:
# Print the layer stack with output shapes and parameter counts.
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 150, 64)           2774336   
_________________________________________________________________
lstm_8 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 774       
Total params: 2,873,926
Trainable params: 2,873,926
Non-trainable params: 0
_________________________________________________________________


In [215]:
# 113 epochs is far more than this model needs (the recorded run was interrupted
# by hand), so treat it as an upper bound and stop once the validation loss has
# not improved for 3 epochs, restoring the best weights seen.
# NOTE(review): the test split doubles as the validation set here, so it also
# drives early stopping; carve out a separate validation split for unbiased test metrics.
early_stop = tensorflow.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
history = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=113,
    validation_data=(x_test, y_test),
    callbacks=[early_stop],
)

Epoch 1/113
 10/113 [=>............................] - ETA: 45s - loss: 1.7513 - accuracy: 0.2966

KeyboardInterrupt: 

In [170]:
# Rating stored under index label 0.
data.loc[0, 'Rating']

4

In [144]:
# Scratch arithmetic. 128 matches the `batch_size` passed to `model.fit`; 14343 is
# presumably the number of training rows, making this the batches per epoch — TODO confirm.
14343/128

112.0546875

In [205]:
# NOTE(review): `a` is not defined in any visible cell, so this only works with leftover
# kernel state and will raise NameError after Restart & Run All.
a[15]

array([0., 1., 0., 0., 0., 0., 0.], dtype=float32)

In [186]:
# Class balance of the target: number of reviews per star rating, most frequent first.
# Uses the Series method because the top-level `pd.value_counts` is deprecated.
data['Rating'].value_counts()

5    9054
4    6039
3    2184
2    1793
1    1421
Name: Rating, dtype: int64

In [195]:
# Smallest rating present in the data.
data['Rating'].min()

1