In [1]:
# Build neural network

# read training data
import tensorflow as tf
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

#df = pd.Dataframe()
df_train = pd.read_csv('./measuring-customer-happiness/train_hp.csv', encoding='utf-8')
print(df_train.head(3))

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


   User_ID                                        Description  \
0  id10326  The room was kind of clean but had a VERY stro...   
1  id10327  I stayed at the Crown Plaza April -- - April -...   
2  id10328  I booked this hotel through Hotwire at the low...   

        Browser_Used Device_Used Is_Response  
0               Edge      Mobile   not happy  
1  Internet Explorer      Mobile   not happy  
2            Mozilla      Tablet   not happy  


In [2]:
df_train = df_train[['Description', 'Is_Response']]
print(df_train.head())

                                         Description Is_Response
0  The room was kind of clean but had a VERY stro...   not happy
1  I stayed at the Crown Plaza April -- - April -...   not happy
2  I booked this hotel through Hotwire at the low...   not happy
3  Stayed here with husband and sons on the way t...       happy
4  My girlfriends and I stayed here to celebrate ...   not happy


In [3]:
# select only rows with happy values and reduce happy values to same amount as not happy values
df_happy = df_train.loc[df_train['Is_Response'] == 'happy']
df_not_happy = df_train.loc[df_train['Is_Response'] == 'not happy']
print(df_happy)

                                             Description Is_Response
3      Stayed here with husband and sons on the way t...       happy
5      We had - rooms. One was very nice and clearly ...       happy
7      My wife & I stayed in this glorious city a whi...       happy
8      My boyfriend and I stayed at the Fairmont on a...       happy
10     Steps off Times Square, nice rooms, stayed - n...       happy
11     Me, the Wife and - kids stayed here on Valenti...       happy
13     I highly recommend the Hawthorne Terrace as an...       happy
14     I found the hotel clean and nicely located. Go...       happy
15     Stayed at the Elan from --th to --th October a...       happy
18     We stayed here for - nights and were really ha...       happy
21     This is everything you could want from a hotel...       happy
23     I really liked this hotel. The staff were wond...       happy
24     My wife and spent - days there this month as a...       happy
26     Took a girls trip to LA and

In [4]:
# only get the first 12411 rows of happy dataframe
df_happy = df_happy.head(12411)
print(df_happy)

                                             Description Is_Response
3      Stayed here with husband and sons on the way t...       happy
5      We had - rooms. One was very nice and clearly ...       happy
7      My wife & I stayed in this glorious city a whi...       happy
8      My boyfriend and I stayed at the Fairmont on a...       happy
10     Steps off Times Square, nice rooms, stayed - n...       happy
11     Me, the Wife and - kids stayed here on Valenti...       happy
13     I highly recommend the Hawthorne Terrace as an...       happy
14     I found the hotel clean and nicely located. Go...       happy
15     Stayed at the Elan from --th to --th October a...       happy
18     We stayed here for - nights and were really ha...       happy
21     This is everything you could want from a hotel...       happy
23     I really liked this hotel. The staff were wond...       happy
24     My wife and spent - days there this month as a...       happy
26     Took a girls trip to LA and

In [5]:
frames = [df_happy, df_not_happy]
df_train = pd.concat(frames)
print(df_train)

                                             Description Is_Response
3      Stayed here with husband and sons on the way t...       happy
5      We had - rooms. One was very nice and clearly ...       happy
7      My wife & I stayed in this glorious city a whi...       happy
8      My boyfriend and I stayed at the Fairmont on a...       happy
10     Steps off Times Square, nice rooms, stayed - n...       happy
11     Me, the Wife and - kids stayed here on Valenti...       happy
13     I highly recommend the Hawthorne Terrace as an...       happy
14     I found the hotel clean and nicely located. Go...       happy
15     Stayed at the Elan from --th to --th October a...       happy
18     We stayed here for - nights and were really ha...       happy
21     This is everything you could want from a hotel...       happy
23     I really liked this hotel. The staff were wond...       happy
24     My wife and spent - days there this month as a...       happy
26     Took a girls trip to LA and

In [6]:
import seaborn as sns

sns.countplot(x='Is_Response', data=df_train)

<matplotlib.axes._subplots.AxesSubplot at 0x1a26deee80>

# Data Preprocessing

In [7]:
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    
    return sentence

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

X = []
sentences = list(df_train['Description'])
for sen in sentences:
    X.append(preprocess_text(sen))

In [8]:
X[3]


'My boyfriend and stayed at the Fairmont on recent trip to San Francisco could not recommend this hotel more called the hotel few weeks before to order cake to be delivered to our room on our first night to celebrate my boyfriend birthday was immediately connected directly to the pastry chef who helped me to design the most delicious cake have ever eaten in my entire life The rooms are large and luxurious with wonderful old world feel Most importantly don miss dinner at the Tonga Room What fun restaurant Definitely order Scorpion Bowl but only if you re staying in the hotel they re strong '

In [9]:
# binary classification for happy and not_happy 

y = df_train['Is_Response']

y = np.array(list(map(lambda x: 1 if x=="happy" else 0, y)))

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [11]:
# Prepare embedding layer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [12]:
# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)


In [13]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
glove_file = open('./glove.twitter.27B/glove.twitter.27B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [14]:
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector


# Recurrent Neural Network: LSTM (Long Short Term Memory network)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [16]:
from keras.layers.recurrent import LSTM
import keras as k
from keras.layers import Embedding
model = k.models.Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)
model.add(LSTM(128))

model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

Instructions for updating:
Colocations handled automatically by placer.


In [17]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          3451700   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 3,569,077
Trainable params: 117,377
Non-trainable params: 3,451,700
_________________________________________________________________
None


# Model Training & Evaluation 

In [18]:
history = model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

score = model.evaluate(X_test, y_test, verbose=1)


AttributeError: 'str' object has no attribute 'ndim'

In [None]:
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

In [None]:
import matplotlib.pyplot as plt

plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])

plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])

plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()


# Predict sentiments

In [None]:
instance = X[57]
print(instance)

####  - convert review into numeric form (using the tokenizer)
####  - text_to_sequences method will convert the sentence into its numeric counter part
####  - positive = 1, negative = 0
####  - sigmoid function predicts floating value between 0 and 1. 
####  - value < 0.5 = negative sentiment 
####  - value > 0.5 = positive sentiment 


In [None]:
instance = tokenizer.texts_to_sequences(instance)

flat_list = []
for sublist in instance:
    for item in sublist:
        flat_list.append(item)

flat_list = [flat_list]

instance = pad_sequences(flat_list, padding='post', maxlen=maxlen)

model.predict(instance)