## Hotel reviews - Sentiment analysis

In [8]:
import tensorflow as tf
from tensorflow import keras 
import pandas as pd
import numpy as np

import git
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import re

from keras.layers import Embedding, Dense, LSTM, Dropout
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
# Locate the repository root so the data path does not depend on the directory
# the notebook kernel happens to be started from.
repo = git.Repo('.', search_parent_directories=True)

# Pass each path component separately: a hard-coded "MP2\\train.txt" only
# resolves on Windows, while os.path.join inserts the right separator everywhere.
train_path = os.path.join(repo.working_tree_dir, "MP2", "train.txt")

# Tab-separated file with two columns: the class label and the review text.
# NOTE(review): header=0 tells pandas to consume the first line of the file as a
# header and replace it with `names`. If train.txt has no header line, the first
# review is silently dropped (the split below reports 1119 + 280 = 1399 rows) --
# confirm against the file and switch to header=None if so.
# `train` is kept as an alias of `data` for any cell that still refers to it.
data = train = pd.read_csv(train_path, sep='\t', header=0, names=['sentiment', 'review'])

In [7]:
# Preview the first five rows to sanity-check the parsed label/review columns.
data.head(n=5)

Unnamed: 0,sentiment,review
0,TRUTHFULPOSITIVE,We stayed at the Omni between Christmas and Ne...
1,DECEPTIVENEGATIVE,I was REALLY looking forward to a nice relaxin...
2,TRUTHFULNEGATIVE,"First let me say, I try not to be too critical..."
3,DECEPTIVENEGATIVE,The Ambassador East Hotel is a terrible place ...
4,DECEPTIVENEGATIVE,I needed a place to stay for a business confer...


In [24]:
# Hold out 20% of the reviews for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
reviews = data['review'].values
labels = data['sentiment'].values
x_train, x_test, y_train, y_test = train_test_split(
    reviews, labels, test_size=0.2, random_state=42, stratify=labels
)

# Fit the label encoder on the training labels ONLY and reuse that same fitted
# encoder for the test labels. Fitting a second, independent LabelBinarizer on
# the test labels would derive the column order from the test set, which is not
# guaranteed to match the training encoding.
label_binarizer = LabelBinarizer()
y_train = label_binarizer.fit_transform(y_train)
y_test = label_binarizer.transform(y_test)

print(x_train.shape)
print(x_test.shape)

(1119,)
(280,)


In [21]:
# Upper bound on the vocabulary size; the Embedding layers are sized from it.
max_vocab = 50000

# Build the word index from the training reviews only, so no information from
# the test split leaks into the vocabulary.
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(x_train)

# Convert every review into its list of integer word ids.
x_train_seq, x_test_seq = [
    tokenizer.texts_to_sequences(texts) for texts in (x_train, x_test)
]

# Pad the training sequences to a common width, then force the test sequences
# to that same width so both arrays can be fed to the same model.
x_train_seq_pad = pad_sequences(x_train_seq)
train_seq_len = x_train_seq_pad.shape[1]
x_test_seq_pad = pad_sequences(x_test_seq, maxlen=train_seq_len)

In [22]:
# Width of the padded input, taken from the padded training matrix.
model_input_len = x_train_seq_pad.shape[1]

# Embedding -> LSTM -> dense head ending in a 4-way softmax.
model_layers = [
    Embedding(max_vocab + 1, 20, input_length=model_input_len),
    LSTM(128, dropout=0.3, recurrent_dropout=0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(4, activation='softmax'),
]
model = keras.Sequential(model_layers)

# Print the layer-by-layer shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 784, 20)           1000020   
                                                                 
 lstm (LSTM)                 (None, 128)               76288     
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 4)                 516       
                                                                 
Total params: 1,093,336
Trainable params: 1,093,336
Non-trainable params: 0
_________________________________________________________________


In [25]:
# The targets are the label matrix produced by LabelBinarizer and the network
# ends in a softmax, hence categorical cross-entropy.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Keep the returned History object instead of letting it fall through as the
# cell output: its `.history` dict holds the per-epoch training and validation
# metrics, whereas the bare repr printed by the cell carries no information.
# validation_split=0.1 reserves part of the training data for validation.
history = model.fit(x_train_seq_pad, y_train, batch_size=32, epochs=10, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x20e7d071940>

In [26]:
# Score the trained model on the held-out split. The first two entries of
# `score` are reported as loss and accuracy respectively.
score = model.evaluate(x_test_seq_pad, y_test, verbose=0)
for label, value in zip(("Test loss:", "Test accuracy:"), score):
    print(label, value)

Test loss: 0.8770999312400818
Test accuracy: 0.6964285969734192


In [27]:
# Second experiment: same architecture as `model`, trained for 20 epochs
# instead of 10.
model2_input_len = x_train_seq_pad.shape[1]
model2_layers = [
    Embedding(max_vocab + 1, 20, input_length=model2_input_len),
    LSTM(128, dropout=0.3, recurrent_dropout=0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(4, activation='softmax'),
]
model2 = keras.Sequential(model2_layers)

model2.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model2.fit(x_train_seq_pad, y_train, batch_size=32, epochs=20, validation_split=0.1)

# Evaluate on the held-out split. Note that this reuses the name `score`, so
# the value from the earlier evaluation of `model` is overwritten.
score = model2.evaluate(x_test_seq_pad, y_test, verbose=0)
for label, value in zip(("Test loss:", "Test accuracy:"), score):
    print(label, value)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 1.325403094291687
Test accuracy: 0.7107142806053162
