===========================================

Title: 10.2 Exercises

Author: Chad Wood

Date: 2 Mar 2022

Modified By: Chad Wood

Description: This program demonstrates building several machine learning models; two unsupervised models, two supervised models, and two supervised deep neural network models.

===========================================

In [102]:
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd

#### Data Preprocessing

In [96]:
# Data cleaned from exercises 10.2
df = pd.read_csv('data/clean_hotel_reviews.csv') # Sentiment: 0=happy; 1=not happy

# Updates Sentiment: 0=not happy; 1-happy
df.sentiment = [0 if sentiment == 1 else 1 
                for sentiment in df['sentiment'].values]

# Equals the amount of data for either review (positive or negative)
# Data is organised randomly
tmp_bad_review_df = df.loc[df.sentiment == 1].sample(n=12411, random_state=1) # 12411 random bad reviews
tmp_ood_review_df = df.loc[df.sentiment == 0].sample(n=12411, random_state=1) # 12411 random good reviews

# Combines random samples of equal size
processed_df = pd.concat([tmp_bad_review_df.sample(frac=1, random_state=1), # Randomly shuffles data
                          tmp_ood_review_df.sample(frac=1, random_state=1)], # Randomly shuffles data
                         ignore_index=True)

# Randomly shuffles data
processed_df = processed_df.sample(frac=1, random_state=1)

In [97]:
import re

def normalize_corpus(corpus): 
    # To lowercase
    corpus = corpus.apply(lambda x: str(x).lower())

    # Creates uniform linebreaks
    corpus = corpus.apply(lambda x: re.sub(r'[\r|\n|\r\n]+', '\n', x))

    # Removes double+ spaces
    corpus = corpus.apply(lambda x: re.sub(' +', ' ', x))
    corpus = corpus.apply(lambda x: x.strip())
    
    return corpus

processed_df.description= normalize_corpus(processed_df.description)

print('Random bad Reviews:', len(tmp_bad_review_df))
print('Random Good Reviews:', len(tmp_ood_review_df))
print('Random combined Reviews:', len(processed_df))
processed_df.head()

Random bad Reviews: 12411
Random Good Reviews: 12411
Random combined Reviews: 24822


Unnamed: 0,sentiment,description
6549,1,good business trip weekend \n kind old inhot a...
13822,0,i not sure la quinta inn price line but experi...
13610,0,family stay weekend thru family reunion suite ...
18072,0,i consider place stay since budget decent revi...
10206,1,i stay day san diego marriott mission valley h...


In [98]:
import numpy as np

# Splits data for supervised model
train, validate, test = np.split(processed_df.sample(frac=1, random_state=42), # Shuffles data
                                 [int(.7*len(processed_df)), int(.85*len(processed_df))]) # Splits 0-70%; 70-85%; 85-100%


print('Reviews:',
      'Train ', train.description.shape, 
      'Validate ', validate.description.shape, 
      'Test', test.description.shape)

print('Sentiments:',
      'Train ', train.sentiment.shape, 
      'Validate ', validate.sentiment.shape, 
      'Test', test.sentiment.shape)

Reviews: Train  (17375,) Validate  (3723,) Test (3724,)
Sentiments: Train  (17375,) Validate  (3723,) Test (3724,)


#### Model Setup

In [114]:
# Training input
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    {'review': train.description.values}, 
    train.sentiment.values, # Took me way too long to figure out using pandas.Series here is an issue
    batch_size=256, 
    num_epochs=None, 
    shuffle=True)

# Prediction for training set
predict_train_input_fn = tf.estimator.inputs.numpy_input_fn(
    {'review': train.description.values}, 
    train.sentiment.values, 
    shuffle=False)

# Prediction for validation set
predict_val_input_fn = tf.estimator.inputs.numpy_input_fn(
    {'review': validate.description.values}, 
    validate.sentiment.values, 
    shuffle=False)

# Prediction for test set
predict_test_input_fn = tf.estimator.inputs.numpy_input_fn(
    {'review': test.description.values}, 
    test.sentiment.values, 
    shuffle=False)

#### Building The Model

In [115]:
# Instantiates sentence embeding feature
embedding_feature = hub.text_embedding_column(
    key='review',
    module_spec="https://tfhub.dev/google/universal-sentence-encoder/2", # Leverages Universal Sentence Encoder
    trainable=False)

In [116]:
# Standard DeepNN with two hidden layers
dnn = tf.estimator.DNNClassifier(
    hidden_units=[512, 128],
    feature_columns=[embedding_feature],
    n_classes=2,
    activation_fn=tf.nn.relu,
    dropout=0.1,
    optimizer=tf.train.AdagradOptimizer(learning_rate=0.005))

#### Training Model

In [117]:
import time

tf.logging.set_verbosity(tf.logging.ERROR)

# Reduced from book example for easier computation
TOTAL_STEPS = 100
STEP_SIZE = 10

for step in range(0, TOTAL_STEPS+1, STEP_SIZE):
    print()
    print('-'*50)
    print('Training for step: ', step)
    
    start_time = time.time()
    dnn.train(input_fn=train_input_fn, steps=STEP_SIZE)
    elapsed_time = time.time() - start_time
    
    print('Seconds Elapsed:', elapsed_time)
    print('Eval (Train):', dnn.evaluate(input_fn=predict_train_input_fn))
    print('Eval (Validation):', dnn.evaluate(input_fn=predict_val_input_fn))


--------------------------------------------------
Training for step:  0
Seconds Elapsed: 58.52726340293884
Eval (Train): {'accuracy': 0.72529495, 'accuracy_baseline': 0.5010648, 'auc': 0.84812313, 'auc_precision_recall': 0.831663, 'average_loss': 0.5533835, 'label/mean': 0.5010648, 'loss': 70.698814, 'precision': 0.83946145, 'prediction/mean': 0.41545993, 'recall': 0.5585803, 'global_step': 10}
Eval (Validation): {'accuracy': 0.7287134, 'accuracy_baseline': 0.5071179, 'auc': 0.8489317, 'auc_precision_recall': 0.8314737, 'average_loss': 0.5495644, 'label/mean': 0.49288207, 'loss': 68.20095, 'precision': 0.83509344, 'prediction/mean': 0.41456386, 'recall': 0.560218, 'global_step': 10}

--------------------------------------------------
Training for step:  10
Seconds Elapsed: 44.85155534744263
Eval (Train): {'accuracy': 0.77277696, 'accuracy_baseline': 0.5010648, 'auc': 0.8711769, 'auc_precision_recall': 0.86095464, 'average_loss': 0.49043998, 'label/mean': 0.5010648, 'loss': 62.657314,

#### Model Evaluation Metrics

In [118]:
# Inputs train and test
dnn.evaluate(input_fn=predict_train_input_fn)
dnn.evaluate(input_fn=predict_test_input_fn)

{'accuracy': 0.8149839,
 'accuracy_baseline': 0.5021482,
 'auc': 0.90004086,
 'auc_precision_recall': 0.89932245,
 'average_loss': 0.40039706,
 'label/mean': 0.5021482,
 'loss': 49.70262,
 'precision': 0.81970763,
 'prediction/mean': 0.49094212,
 'recall': 0.8096257,
 'global_step': 110}