In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import keras_tuner as kt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
#Hyperparameters
num_epochs = 3
seed = 10
total_vocab = 5000
max_len = 50
batch_size = 100

#seed the numpy and tensorflow global generators too; previously `seed` was only
#applied to the train/test split, so every training run started from a different
#random state (NOTE: some GPU kernels may still be non-deterministic)
np.random.seed(seed)
tf.random.set_seed(seed)

#data pre-processing code
#import required data; malformed csv rows are skipped instead of raising
dataset = pd.read_csv('Sentiment Analysis Dataset.csv', on_bad_lines = 'skip')
#split dataset into training (80%) and testing (20%) data, stratified on the
#'sentiment' column so both parts keep the same label proportions
trn_text, tst_text, trn_sent, tst_sent = train_test_split(dataset['text'], dataset['sentiment'], train_size = .8,
                                                          shuffle = True, random_state = seed, stratify = dataset['sentiment'])

print(dataset)

In [None]:
#declare the tokenizer, keeping only the `total_vocab` most frequent words.
#the out-of-vocabulary marker has to be a string: with an integer marker the key
#is turned into a string by the json export below, so a tokenizer reloaded from
#that file could no longer find its own OOV index
tokenizer = Tokenizer(num_words = total_vocab, oov_token = '<OOV>')
#fit tokenizer vocabulary on the training text only (the test text stays unseen)
tokenizer.fit_on_texts(trn_text)
#convert dataset text into sequences of integer word indices
trn_seq_raw = tokenizer.texts_to_sequences(trn_text)
tst_seq_raw = tokenizer.texts_to_sequences(tst_text)
#pad sequences to fixed length `max_len`; padding zeros are appended at the end
trn_seq = pad_sequences(trn_seq_raw, padding = 'post', maxlen = max_len)
tst_seq = pad_sequences(tst_seq_raw, padding = 'post', maxlen = max_len)
#write tokenizer to json file so the same vocabulary can be reused later
with open('vocabulary_tokenizer.json', 'w') as f:
    f.write(tokenizer.to_json())

In [None]:
#define model as a hypermodel for hyperparameter tuning
class LSTM_HModel(kt.HyperModel):
    """Tunable bidirectional-LSTM classifier with a single sigmoid output.

    Two hyperparameters are searched: the width shared by the embedding and
    the LSTM (`recurrent_layer_dim`) and the width of the hidden dense layer
    (`dense_layer_dim`). Reads the notebook-level settings `total_vocab`,
    `max_len`, `batch_size` and `num_epochs`.
    """

    #new build function
    def build(self, hp):
        """Build and compile one candidate model.

        Args:
            hp: a `kt.HyperParameters` object the layer widths are drawn from.

        Returns:
            A compiled `keras.Sequential` model.
        """
        #declare length of recurrent and dense layers as hyperparameters
        recurrent_layer_dim = hp.Int(name = 'recurrent_layer_dim', min_value = 160, max_value = 240, step = 20)
        dense_layer_dim = hp.Int(name = 'dense_layer_dim', min_value = 5, max_value = 25, step = 5)
        model = keras.Sequential()
        #embedding layer declaration; mask_zero makes the following layers
        #ignore index 0, which is the value used for padding
        model.add(keras.layers.Embedding(
            input_dim = total_vocab,
            output_dim = recurrent_layer_dim,
            input_length = max_len,
            mask_zero = True))
        #recurrent LSTM layer declaration
        model.add(keras.layers.Bidirectional(keras.layers.LSTM(recurrent_layer_dim)))
        #add hidden dense layer; relu instead of the former softmax, which forced
        #the hidden units to sum to 1 and so squeezed the features fed to the output
        model.add(keras.layers.Dense(dense_layer_dim, activation = 'relu'))
        #final output layer
        model.add(keras.layers.Dense(1, activation = 'sigmoid'))
        #compile with binary crossentropy, accuracy as metric
        model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
        return model

    def fit(self, hp, model, *args, **kwargs):
        """Train `model`, defaulting to the notebook's batch size and epoch count.

        Args:
            hp: hyperparameters of the current trial (currently unused here).
            model: the compiled model returned by `build`.
            *args, **kwargs: forwarded unchanged to `model.fit`.

        Returns:
            Whatever `model.fit` returns.
        """
        #overload fit function for potential optimization of batch_size or epochs.
        #setdefault keeps the notebook defaults while letting a caller pass its own
        #batch_size/epochs without a duplicate-keyword TypeError
        kwargs.setdefault('batch_size', batch_size)
        kwargs.setdefault('epochs', num_epochs)
        return model.fit(*args, **kwargs)


In [None]:
#declare hyperparameters and hypermodel
hyp = kt.HyperParameters()
main = LSTM_HModel()
#build the hypermodel instance once with default hyperparameter values
model = main.build(hyp)
#declare the random_search instance; goal is to optimize validation accuracy.
#the hypermodel object itself is passed (not just its build method) so that every
#trial is trained through LSTM_HModel.fit and therefore uses its batch_size/epochs
rand_srch = kt.RandomSearch(
    hypermodel = main,
    objective = "val_accuracy",
    max_trials = 5,
    executions_per_trial = 1,
    overwrite = False,
    directory = 'hyperparam_tuning')

#validate on the padded held-out sequences; the previously referenced val_text/val_sent
#were never defined, so this cell failed with a NameError on a fresh kernel.
#NOTE(review): this reuses the test split for model selection - consider carving a
#separate validation split out of the training data
rand_srch.search(x = trn_seq, y = trn_sent, validation_data = (tst_seq, tst_sent))


In [None]:
#get the optimal hyperparameters and print them
opt_hyp = rand_srch.get_best_hyperparameters(1)
best_hyp = opt_hyp[0]
print(best_hyp.values)
#build and fit a model to those parameters; the same best hyperparameters are
#handed to fit (instead of the empty `hyp` object) so build and fit stay in sync
model = main.build(best_hyp)
main.fit(best_hyp, model, x = trn_seq, y = trn_sent, validation_data = (tst_seq, tst_sent))

model.summary()

In [None]:
#write the trained model to disk in HDF5 format
model.save(filepath = 'OPT_LSTM_model___NO.h5', save_format = 'h5')