<a href="https://colab.research.google.com/github/KevinHern/SemOpLabs/blob/master/Lab5/Laboratorio5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import os, re
import numpy as np
import pandas as pd
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM

# Dataset

## Preparing to use Kaggle

In [0]:
# Uploading my kaggle.json
from google.colab import files
# Bind the result to a name instead of leaving the call as the cell's last
# expression: upload() returns the uploaded files' contents, and echoing that
# would write the Kaggle API credentials into the saved notebook output.
_uploaded = files.upload()

In [0]:
# Making sure kaggle.json is present
# (long listing with human-readable size; fails visibly if the upload did not land here)
!ls -lha kaggle.json

In [0]:
# Installing kaggle
!pip install -q kaggle

In [0]:
# Doing some kaggle shenanigans
# Used Tutorial's link: https://colab.research.google.com/drive/1DofKEdQYaXmDWBzuResXWWvxhLgDeVyl#scrollTo=KixiXyagGy7Y
# Create the per-user Kaggle config directory (-p: no error if it already exists).
!mkdir -p ~/.kaggle
# Copy the uploaded API token into that directory.
!cp kaggle.json ~/.kaggle/

# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

## Downloading Dataset: Hotel Reviews

In [0]:
# Dataset page: https://www.kaggle.com/harmanpreet93/hotelreviews
# Download the dataset archive with the Kaggle CLI installed above.
!kaggle datasets download harmanpreet93/hotelreviews

## Binary classification: reviewers are either happy or not happy

In [0]:
# Extract the downloaded archive; -o overwrites existing files without prompting,
# which keeps this cell re-runnable.
!unzip -o 'hotelreviews.zip'


In [0]:
# Show the working directory contents to confirm which files were extracted.
os.listdir()

In [0]:
# Load the extracted reviews CSV into a DataFrame.
hotels = pd.read_csv('hotel-reviews.csv')
# Call head() (the original referenced the method without calling it, which
# displays the bound-method repr instead of a short preview).
hotels.head()

## Cleaning Dataset

In [0]:
# Inspect the available column names before choosing which ones to keep.
hotels.columns

In [0]:
# Removing unnecessary columns: start from every column name and strip the
# ones that are not used by the model.
properties = hotels.columns.values.tolist()
for unused_column in ('User_ID', 'Browser_Used', 'Device_Used'):
    # list.remove raises ValueError if the column is unexpectedly absent.
    properties.remove(unused_column)
properties

In [0]:
# Keep only the retained columns; `hotels` itself is left untouched.
data = hotels[properties]
data

In [0]:
# Cleaning data: discard rows whose review text or label is the placeholder
# string '0' or the empty string.
description_col = data['Description']
response_col = data['Is_Response']
keep_row = (
    (description_col != '0')
    & (response_col != '0')
    & (description_col != '')
    & (response_col != '')
)
data = data[keep_row]
data

In [0]:
# Mapping Is_Response to a number (not happy -> 0, happy -> 1).
# The nested-dict form limits the replacement to the Is_Response column; a
# frame-wide replace would also rewrite any Description whose whole text is
# exactly 'happy' or 'not happy' into an integer.
dataset = data.replace({'Is_Response': {'not happy': 0, 'happy': 1}})
dataset

In [0]:
# List first row of the dataset (sanity check of the cleaned columns and mapped label)
dataset.iloc[0]

## Splitting Dataset


In [0]:
# 70% train, 15% test and 15% validation
# One seed is used for both sampling steps so that every split is reproducible
# (the original seeded only the train sample, so test/validation membership
# changed on each run).
SPLIT_SEED = 42
train_split = dataset.sample(frac=0.7, random_state=SPLIT_SEED)
# Everything not chosen for training is halved into test and validation.
holdout_split = dataset.drop(train_split.index)
test_split = holdout_split.sample(frac=0.5, random_state=SPLIT_SEED)
validation_split = holdout_split.drop(test_split.index)

# The three splits must partition the dataset exactly.
assert len(train_split) + len(test_split) + len(validation_split) == len(dataset)

train_split.shape, validation_split.shape, test_split.shape

In [0]:
# Converting to Tensors
def _to_text_dataset(split):
    """Wrap one split as a tf.data.Dataset of (Description, Is_Response) pairs."""
    return tf.data.Dataset.from_tensor_slices((split['Description'], split['Is_Response']))


train_dataset = _to_text_dataset(train_split)
test_dataset = _to_text_dataset(test_split)
validation_dataset = _to_text_dataset(validation_split)
train_dataset.element_spec

In [0]:
# Pull a single (text, label) element from the unbatched training dataset to inspect it.
description, review = next(iter(train_dataset))
description, review

## Shuffling

In [0]:
# Batching / shuffling settings shared by the tf.data pipelines below.
# NOTE(review): the values look like arbitrary choices (original comment: "Because why not").
batch_size = 128
shuffle_buffer_size = 1000

In [0]:
# Shuffle through a bounded buffer, then batch. Only the training pipeline
# calls repeat(), so it never runs out of elements on its own.
train_dataset = (
    train_dataset
    .shuffle(shuffle_buffer_size)
    .repeat()
    .batch(batch_size)
)
validation_dataset = (
    validation_dataset
    .shuffle(shuffle_buffer_size)
    .batch(batch_size)
)
test_dataset = (
    test_dataset
    .shuffle(shuffle_buffer_size)
    .batch(batch_size)
)

In [0]:
# Pull one batch from the batched training dataset and check the batch dimensions.
description, review = next(iter(train_dataset))
description.shape, review.shape

## Processing Text

In [0]:
def preprocess_text(sen):
    """Normalise a review to ASCII letters separated by single spaces.

    Steps:
      1. Every character that is not an ASCII letter becomes a space.
      2. Single letters that are surrounded by whitespace are removed.
      3. Runs of whitespace are collapsed to one space.

    A single letter at the very start or end of the text is left in place, and
    leading/trailing spaces are not stripped.

    Args:
        sen: The raw review text (str).

    Returns:
        The cleaned text (str).
    """
    sentence = re.sub('[^a-zA-Z]', ' ', sen)
    # The lookahead leaves the trailing whitespace unconsumed so it can serve
    # as the leading whitespace of the next match; otherwise only every other
    # letter of a run such as " S A " would be removed.
    sentence = re.sub(r"\s+[a-zA-Z](?=\s)", ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [0]:
# Clean the review text of every split with the same function.
train_split_processed = train_split['Description'].apply(preprocess_text)
test_split_processed = test_split['Description'].apply(preprocess_text)
validation_split_processed = validation_split['Description'].apply(preprocess_text)

In [0]:
# Inspect the first cleaned training review.
train_split_processed.iloc[0]

In [0]:
# Longest training review measured in words, not characters. max_len is used
# below as the padded *token* sequence length (pad_sequences maxlen and the
# Embedding input_length); the character count used before made every
# sequence several times longer than any review's token count.
# The cleaned text holds only letters and single spaces, so the word count
# should match the tokenizer's sequence length for training reviews.
# Longer validation/test reviews are truncated by pad_sequences.
max_len = int(train_split_processed.map(lambda text: len(text.split())).max())
max_len

In [0]:
# Build the vocabulary from the training text only, so validation and test
# text never influences the word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_split_processed)

# Text -> lists of integer word ids.
train_sequences = tokenizer.texts_to_sequences(train_split_processed)
test_sequences = tokenizer.texts_to_sequences(test_split_processed)
validation_sequences = tokenizer.texts_to_sequences(validation_split_processed)

# Pad at the end ('post') so every row has exactly max_len entries.
pad_options = dict(maxlen=max_len, padding='post')
train_data = pad_sequences(train_sequences, **pad_options)
test_data = pad_sequences(test_sequences, **pad_options)
validation_data = pad_sequences(validation_sequences, **pad_options)

train_data.shape, test_data.shape, validation_data.shape

In [0]:
# Inverse lookup (integer id -> word) for decoding sequences back to text.
word_to_index = tokenizer.word_index
reverse_word_map = {index: word for word, index in word_to_index.items()}
# One extra slot on top of the word count (presumably for the padding id 0).
vocab_size = 1 + len(word_to_index)
vocab_size

## Re-building Dataset

In [0]:
def _to_encoded_dataset(padded_sequences, split):
    """Pair padded id sequences with the Is_Response labels of the same split."""
    return tf.data.Dataset.from_tensor_slices((padded_sequences, split['Is_Response']))


train_dataset_embedding = _to_encoded_dataset(train_data, train_split)
test_dataset_embedding = _to_encoded_dataset(test_data, test_split)
validation_dataset_embedding = _to_encoded_dataset(validation_data, validation_split)

train_dataset_embedding.element_spec

In [0]:
# Pull a single (padded sequence, label) element from the unbatched encoded training dataset.
description, review = next(iter(train_dataset_embedding))
description, review

In [0]:
# Same shuffle/batch treatment as the raw-text pipelines. Only the training
# pipeline calls repeat(), so model.fit must be given steps_per_epoch.
train_dataset_embedding = (
    train_dataset_embedding
    .shuffle(shuffle_buffer_size)
    .repeat()
    .batch(batch_size)
)
validation_dataset_embedding = (
    validation_dataset_embedding
    .shuffle(shuffle_buffer_size)
    .batch(batch_size)
)
test_dataset_embedding = (
    test_dataset_embedding
    .shuffle(shuffle_buffer_size)
    .batch(batch_size)
)

In [0]:
# Pull one batch from the encoded training dataset and check the feature batch shape.
description, review = next(iter(train_dataset_embedding))
description.shape

# Model

## Building



In [0]:
# Binary sentiment classifier: learned word embeddings -> bidirectional LSTM
# -> small dense head ending in a single sigmoid unit.
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=max_len))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(16, activation='relu'))
model.add(Dense(4, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

In [0]:
# Binary cross-entropy matches the single sigmoid output; accuracy is reported during training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

## Training

In [0]:
# Training schedule passed to model.fit below.
# NOTE(review): with batch_size = 128, 10 steps per epoch draws 1,280 training
# examples per epoch and 5 validation steps score at most 640 examples —
# confirm this subsampling is intended rather than a full pass over each split.
epochs = 10
train_steps = 10
validation_steps = 5

In [0]:
# steps_per_epoch is set explicitly because the training dataset repeats and
# has no natural end.
fit_options = dict(
    epochs=epochs,
    steps_per_epoch=train_steps,
    validation_data=validation_dataset_embedding,
    validation_steps=validation_steps,
)
history = model.fit(train_dataset_embedding, **fit_options)

## Evaluation

In [0]:
# Score the trained model on the held-out test batches; evaluate() returns the
# loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(test_dataset_embedding)
eval_loss, eval_acc = test_metrics

report_template = '\nEval loss: {:.3f}, Eval accuracy: {:.3f}'
print(report_template.format(eval_loss, eval_acc))