In [18]:
#import libraries
import tensorflow.keras.layers as layers
import tensorflow.keras.backend as K
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn.model_selection import train_test_split
import pandas as pd

In [19]:
# Load the labelled tweets; the first CSV column becomes the DataFrame index.
tweet_df = pd.read_csv(
    '/content/tweets_training.csv',
    index_col=0,
)

In [20]:
# Create the train/validation/test splits and the text-vectorization layer.
X = tweet_df['tweet_text'].tolist()
y = tweet_df['label'].tolist()

# First hold out 30% of the data, then halve that hold-out into validation and test.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

# Map each text to a fixed-length sequence of integer token ids (unigrams and bigrams).
max_len = 140
vect_layer = TextVectorization(
    ngrams=(1, 2),
    output_mode='int',
    output_sequence_length=max_len,
)
# Fit the vocabulary on the training split only.
vect_layer.adapt(X_train)

In [21]:
# Number of entries in the vocabulary learned by the vectorization layer;
# used as the Embedding layer's input_dim.
vocabulary = vect_layer.get_vocabulary()
n_vocab = len(vocabulary)

In [10]:
!pip install -q -U keras-tuner

In [22]:
import keras_tuner as kt

In [23]:
# Early-stopping callback to limit overfitting: training halts once val_loss
# has failed to improve by at least min_delta for `patience` epochs.
import tensorflow as tf

early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=3,
    mode='auto',
    verbose=1,
)

In [24]:
#parameter tuning for lstm model
import keras
import tensorflow as tf
def model_builder_lstm(hp):
    """Build and compile a bidirectional-LSTM binary classifier for one tuner trial.

    The model accepts raw strings, converts them to token ids with the
    notebook-level ``vect_layer``, embeds them, applies spatial dropout and a
    bidirectional LSTM, and ends in a single sigmoid output unit.

    Args:
        hp: Keras Tuner hyperparameter container. Two values are sampled from
            it: ``Dropout_rate`` (float, 0.1 to 0.5 in steps of 0.1) and
            ``units`` (int, 32 to 512 in steps of 32).

    Returns:
        The compiled model (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    # Build the container from tf.keras, the same namespace the layers come
    # from; mixing the standalone `keras` package with `tensorflow.keras`
    # layers breaks when the two resolve to different Keras versions.
    model = tf.keras.Sequential()
    model.add(layers.Input(shape=(1,), dtype=tf.string))
    model.add(vect_layer)
    # NOTE(review): the embedding width reuses max_len (the sequence length).
    # The two quantities are unrelated; consider a dedicated constant or a
    # tuned hyperparameter for the embedding dimension.
    model.add(layers.Embedding(input_dim=n_vocab, output_dim=max_len))

    hp_dropouts = hp.Float('Dropout_rate', min_value=0.1, max_value=0.5, step=0.1)
    model.add(layers.SpatialDropout1D(rate=hp_dropouts))

    hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
    model.add(layers.Bidirectional(layers.LSTM(units=hp_units)))

    # Single sigmoid unit for the binary label.
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [26]:
# Bayesian-optimization search over model_builder_lstm's hyperparameters,
# ranking trials by validation accuracy.
tuner = kt.BayesianOptimization(
    model_builder_lstm,
    objective='val_accuracy',
    max_trials=25,
    directory='bayes_lstm_5',
    project_name='bayes_lstm_5',
)

In [27]:
# Run the search: each trial trains for up to 10 epochs (early stopping may
# cut it short) and is scored on the validation split.
tuner.search(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping_callback],
    verbose=1,
)

Trial 25 Complete [00h 00m 25s]
val_accuracy: 0.9090300798416138

Best val_accuracy So Far: 0.9250836372375488
Total elapsed time: 00h 13m 25s


In [28]:
# Retrieve the single best-scoring model found by the tuner.
top_models = tuner.get_best_models(num_models=1)
best_model_lstm = top_models[0]

In [29]:
# Score the best model on the held-out test split; displays [loss, accuracy].
best_model_lstm.evaluate(x=X_test, y=y_test)



[0.22107043862342834, 0.9324414730072021]

In [33]:
# Hyperparameter values of the top-ranked trial, as a plain dict.
best_hyperparameters = tuner.get_best_hyperparameters()[0]
params_lstm = best_hyperparameters.values

In [34]:
# Display the selected hyperparameters as the cell output.
params_lstm

{'Dropout_rate': 0.30000000000000004, 'units': 512}

In [39]:
# Mount Google Drive before saving: the target path lives on Drive, and this
# cell sits above the notebook's mount cell, so a top-to-bottom run would
# otherwise write to an unmounted local directory.
from google.colab import drive
drive.mount('/content/drive')

# Persist the best model to the project folder on Drive.
best_model_lstm.save('/content/drive/MyDrive/omdena_self_harm_project/lstm_model_2')



In [38]:
# Mount Google Drive so paths under /content/drive are available.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
