In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, GlobalMaxPooling1D, Conv1D, Bidirectional, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from kerastuner import HyperModel

from kerastuner import BayesianOptimization
from kerastuner.engine.hyperparameters import HyperParameters
from kerastuner.tuners import Hyperband

import preprocesing_functions as pf


In [2]:

class SentimentAnalysisHyperModel(HyperModel):
    """Hypermodel that builds a binary sentiment classifier for Keras Tuner.

    The searched architecture is: Embedding -> one of {LSTM, Bidirectional
    LSTM, Conv1D + global max pooling} -> optional Dropout -> Dense(relu) ->
    Dense(1, sigmoid). The model is compiled with Adam, binary cross-entropy
    and the accuracy metric.

    Args:
        input_shape: Shape of a single input sample, passed to ``Input``.
        num_words: Vocabulary size used as the Embedding layer's input
            dimension. When ``None`` (the default), ``build`` falls back to a
            module-level ``num_words`` variable, which keeps existing
            ``SentimentAnalysisHyperModel(input_shape)`` calls working.
        name: Optional hypermodel name, forwarded to ``HyperModel``.
        tunable: Forwarded to ``HyperModel``.
    """

    def __init__(self, input_shape, num_words=None, name=None, tunable=True):
        # Initialise the base class so its own attributes are set up; the
        # original skipped this call entirely.
        super().__init__(name=name, tunable=tunable)
        self.input_shape = input_shape
        self.num_words = num_words

    def build(self, hp):
        """Build and compile one candidate model from the hyperparameters.

        Args:
            hp: Keras Tuner ``HyperParameters`` object used to sample the
                embedding size, layer type, layer widths and dropout.

        Returns:
            A compiled ``Model`` with a single sigmoid output unit.

        Raises:
            NameError: If no ``num_words`` was given to the constructor and
                no module-level ``num_words`` exists either.
        """
        if self.num_words is not None:
            vocab_size = self.num_words
        else:
            # Backward-compatible fallback to the notebook-level global.
            vocab_size = num_words

        # The vocabulary size is deliberately not tuned here: the tokenizer is
        # fitted once, outside the hypermodel, with a fixed vocabulary.
        input_layer = Input(shape=self.input_shape)
        x = Embedding(
            vocab_size,
            hp.Int("embedding_size", min_value=64, max_value=256, step=32),
        )(input_layer)

        layer_choice = hp.Choice("layer_choice", ["lstm", "bidirectional_lstm", "conv1d"])

        if layer_choice in ("lstm", "bidirectional_lstm"):
            # Both recurrent variants share the same "lstm_units" hyperparameter.
            recurrent = LSTM(hp.Int("lstm_units", min_value=32, max_value=128, step=32))
            if layer_choice == "bidirectional_lstm":
                recurrent = Bidirectional(recurrent)
            x = recurrent(x)
        else:  # layer_choice == "conv1d"
            x = Conv1D(
                hp.Int("conv_filters", min_value=32, max_value=128, step=32),
                kernel_size=3,
                activation="relu",
            )(x)
            # Collapse the sequence axis so the Dense layers get a flat vector.
            x = GlobalMaxPooling1D()(x)

        # "dropout" is a 0/1 switch; the rate is only sampled when it is on.
        if hp.Choice("dropout", [0, 1]):
            x = Dropout(hp.Float("dropout_rate", min_value=0.1, max_value=0.5, step=0.1))(x)

        x = Dense(hp.Int("dense_units", min_value=32, max_value=128, step=32), activation="relu")(x)

        output_layer = Dense(1, activation="sigmoid")(x)

        model = Model(input_layer, output_layer)
        model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

        return model


In [3]:
# Location of the IMDB reviews CSV, relative to the current working directory.
data_path = './IMDB_dataset.csv/IMDB_dataset.csv'

# Read the whole CSV into a DataFrame.
reviews = pd.read_csv(data_path)

In [4]:
# Split the raw reviews DataFrame into inputs X and targets y using the
# project-local helper module.
# NOTE(review): X is later fed to Tokenizer.fit_on_texts, so it is presumably
# a collection of text strings, and y presumably holds binary labels -- confirm
# against preprocesing_functions.preprocess_data.
X, y = pf.preprocess_data(reviews)

In [5]:
# Vocabulary cap for the tokenizer and the fixed length of every encoded review.
num_words = 1000
max_sequence_length = 100

# Fit the tokenizer on the review texts, with '<OOV>' as the out-of-vocabulary token.
tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X)

# Encode every review as a list of integer word indices.
X_vecs = tokenizer.texts_to_sequences(X)

# Padding might not be necessary if an incidence matrix were used instead,
# although note that such a representation holds no word order.
# Here the sequences are padded at the end up to `max_sequence_length`.
padded_X_vecs = tf.keras.preprocessing.sequence.pad_sequences(
    X_vecs,
    maxlen=max_sequence_length,
    padding="post",
)


In [6]:
# Shape of one sample: a 1-D sequence of `max_sequence_length` token ids.
input_shape = (max_sequence_length,)

# `num_words` is deliberately NOT reassigned here. It is defined once, next to
# the tokenizer, so the Embedding input size used by the hypermodel cannot
# drift away from the vocabulary size the tokenizer was fitted with.
hypermodel = SentimentAnalysisHyperModel(input_shape)

We also imported the Hyperband tuner. It is used later on, because we would like to test both search approaches (Bayesian optimization and Hyperband) and compare them.

In [7]:
# Bayesian-optimization tuner: tries up to 50 configurations, the first 2 of
# which are initial points, and ranks trials by validation accuracy.
# Results are written under output/IMDB Sentiment Analysis.
bayesian_tuner = BayesianOptimization(
    hypermodel,
    objective="val_accuracy",
    max_trials=50,
    num_initial_points=2,
    directory="output",
    project_name="IMDB Sentiment Analysis",
)

In [8]:
# Stop a trial once validation accuracy has not improved for 3 consecutive
# epochs, and roll the model back to the weights of its best epoch.
early_stopping_callback = EarlyStopping(
    monitor="val_accuracy",
    patience=3,
    restore_best_weights=True,
)

In [9]:
# Launch the Bayesian search. 20% of the data is held out for validation in
# each trial, and the early-stopping callback is attached to every trial.
bayesian_tuner.search(
    padded_X_vecs,
    y,
    batch_size=30,
    epochs=20,
    validation_split=0.2,
    callbacks=[early_stopping_callback],
)


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
160               |160               |embedding_size
conv1d            |conv1d            |layer_choice
128               |128               |lstm_units
0                 |0                 |dropout
64                |64                |dense_units

Epoch 1/20
 227/1334 [====>.........................] - ETA: 28s - loss: 0.5812 - accuracy: 0.6906

KeyboardInterrupt: 

In [None]:
# Retrieve the single best model found by the Bayesian search. The tuner
# object is named `bayesian_tuner`; there is no variable called `tuner`.
bayesian_best_model = bayesian_tuner.get_best_models(num_models=1)[0]

#### Hyperband approach

In [10]:
# Hyperband tuner over the same hypermodel, ranked by validation accuracy.
# It gets its own project name so its trial state is stored separately from
# the Bayesian tuner's; sharing one directory/project_name would make the two
# tuners reload or overwrite each other's saved state.
hyperband_tuner = Hyperband(
    hypermodel,
    objective="val_accuracy",
    max_epochs=20,
    factor=3,
    directory="output",
    project_name="IMDB Sentiment Analysis Hyperband",
)

In [11]:
# Launch the Hyperband search with the same data, batch size, validation split
# and early-stopping callback as the Bayesian search.
# NOTE(review): Hyperband appears to schedule the per-trial epoch count itself
# (via `max_epochs` / `factor`), so `epochs` here may not be the value actually
# used for each trial -- confirm against the Keras Tuner documentation.
hyperband_tuner.search(
    padded_X_vecs,
    y,
    batch_size=30,
    epochs=20,
    validation_split=0.2,
    callbacks=[early_stopping_callback],
)


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
224               |224               |embedding_size
lstm              |lstm              |layer_choice
128               |128               |lstm_units
1                 |1                 |dropout
96                |96                |dense_units
3                 |3                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
0                 |0                 |tuner/round

Epoch 1/3
  35/1334 [..............................] - ETA: 3:37 - loss: 0.6903 - accuracy: 0.5210

KeyboardInterrupt: 