# Single Layer Unidirectional LSTM Model using word2vec

In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import gensim
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, SpatialDropout1D, Embedding, Bidirectional
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from sklearn.utils.class_weight import compute_class_weight

In [2]:
df_train = pd.read_csv("train_data.csv")

In [3]:
df_test = pd.read_csv("test_data.csv")

In [4]:
sentences = [tweet.split() for tweet in pd.concat([df_train, df_test])['processed_tweet']]

In [5]:
word2vec_model = Word2Vec(sentences, vector_size=200, window=5, min_count=1, workers=4, sg=1, epochs=20)

In [6]:
EMBEDDING_DIM = 200
LSTM_UNITS = 128
DENSE_UNITS = 64
DROPOUT_RATE = 0.3
RECURRENT_DROPOUT = 0.2
LEARNING_RATE = 0.0005
BATCH_SIZE = 256
EPOCHS = 10

In [7]:
tokenizer = Tokenizer(num_words=30000, oov_token='<OOV>')
tokenizer.fit_on_texts(df_train['processed_tweet'])
word_index = tokenizer.word_index

In [8]:
embedding_matrix = np.zeros((min(30000, len(word_index))+1, 200))
for word, i in tokenizer.word_index.items():
    if i < 30000 and word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

In [9]:
X_train = pad_sequences(tokenizer.texts_to_sequences(df_train['processed_tweet']), maxlen=120, padding='post', truncating='post')
X_test = pad_sequences(tokenizer.texts_to_sequences(df_test['processed_tweet']), maxlen=120, padding='post', truncating='post')

In [20]:
le = LabelEncoder()
y_train = le.fit_transform(df_train['sentiment_label'])
y_test = le.transform(df_test['sentiment_label'])
y_train = pd.get_dummies(y_train).values
y_test = pd.get_dummies(y_test).values

In [11]:
from tensorflow.keras.models import load_model
model = load_model('Bidirectional GRU.h5')



In [None]:
from keras_tuner.tuners import RandomSearch
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dropout, Dense

def model_builder(hp):
    model = Sequential()
    
    # Embedding layer with pretrained weights
    model.add(Embedding(input_dim=min(30000, len(word_index))+1, 
                     output_dim=200,
                     weights=[embedding_matrix],
                     input_length=120,
                     trainable=False))
    
    # Bidirectional GRU layer
    model.add(Bidirectional(GRU(units=hp.Int('gru_units', 64, 256, step=64), 
                        return_sequences=False)))
    
    model.add(Dropout(hp.Float('dropout_rate', 0.2, 0.5, step=0.1)))
    
    # Output layer for 5 classes
    model.add(Dense(5, activation='softmax', 
               kernel_regularizer=l2(hp.Float('l2_reg', 1e-5, 1e-2, sampling='log'))))
    
    model.compile(
        optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-3, 5e-4, 1e-4])),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# Initialize tuner
tuner = RandomSearch(
    model_builder,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    directory='tuner_dir',
    project_name='sentiment_analysis'
)

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Start hyperparameter search
tuner.search(X_train, y_train,
             epochs=10,
             validation_data=(X_test, y_test),
             callbacks=[early_stopping],
             batch_size=256)

# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]

# Save the best model
best_model.save('best_bidirectional_gru_model.h5')

Trial 1 Complete [02h 29m 46s]
val_accuracy: 0.7457984089851379

Best val_accuracy So Far: 0.7457984089851379
Total elapsed time: 02h 29m 46s

Search: Running Trial #2

Value             |Best Value So Far |Hyperparameter
192               |64                |gru_units
0.3               |0.2               |dropout_rate
0.0020249         |0.004075          |l2_reg
0.001             |0.001             |learning_rate

Epoch 1/10
[1m1411/1411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2957s[0m 2s/step - accuracy: 0.6178 - loss: 1.0468 - val_accuracy: 0.6933 - val_loss: 0.8443
Epoch 2/10
[1m 879/1411[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m19:43[0m 2s/step - accuracy: 0.7004 - loss: 0.8263