In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report

In [2]:
tf.random.set_seed(42)
np.random.seed(42)

In [3]:
data = pd.read_csv('/content/drive/MyDrive/kaggle api/cleaned_text.csv')
X = data['Text'].astype(str).tolist()
y = data['Label'].tolist()

In [4]:
max_vocab_size = 20000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
vocab_size = min(max_vocab_size, len(tokenizer.word_index) + 1)
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 20000


In [5]:
sequences = tokenizer.texts_to_sequences(X)
max_len = max(len(seq) for seq in sequences)
print(f"Max sequence length: {max_len}")

padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

y_categorical = to_categorical(y, num_classes=6)

Max sequence length: 79


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, y_categorical, test_size=0.2, random_state=42, stratify=y
)

In [7]:
class_counts = [y.count(i) for i in range(6)]
total_samples = len(y)
class_weight_dict = {i: total_samples / (len(class_counts) * count) for i, count in enumerate(class_counts)}
print(f"Class weights: {class_weight_dict}")

Class weights: {0: 0.5732311771614668, 1: 0.492448032967786, 2: 2.010423298798017, 3: 1.2119993486516507, 4: 1.4559894086742677, 5: 4.63987220589545}


In [8]:
def build_bidirectional_lstm_model(vocab_size, max_len, embedding_dim=128, lstm_units=256, dropout_rate=0.5):
    # Input layer
    inputs = Input(shape=(max_len,))

    x = Embedding(input_dim=vocab_size,
                 output_dim=embedding_dim,
                 input_length=max_len,
                 mask_zero=True)(inputs)

    x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = Dropout(dropout_rate)(x)
    x = Bidirectional(LSTM(lstm_units))(x)
    x = Dropout(dropout_rate)(x)

    outputs = Dense(6, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0003),
        metrics=['accuracy']
    )

    return model

In [9]:
model = build_bidirectional_lstm_model(
    vocab_size=vocab_size,
    max_len=max_len,
    embedding_dim=128,
    lstm_units=256,
    dropout_rate=0.5
)

# Model summary
model.summary()



In [10]:
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True,
        verbose=1
    ),
    ModelCheckpoint(
        'best_tf_emotion_model.h5',
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1
    )
]

In [11]:
batch_size = 64
epochs = 30
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
    class_weight=class_weight_dict,
    callbacks=callbacks
)

Epoch 1/30
[1m5211/5211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.7713 - loss: 0.5249
Epoch 1: val_accuracy improved from -inf to 0.93317, saving model to best_tf_emotion_model.h5




[1m5211/5211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 22ms/step - accuracy: 0.7713 - loss: 0.5249 - val_accuracy: 0.9332 - val_loss: 0.1392
Epoch 2/30
[1m5208/5211[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.9361 - loss: 0.1221
Epoch 2: val_accuracy improved from 0.93317 to 0.93707, saving model to best_tf_emotion_model.h5




[1m5211/5211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 21ms/step - accuracy: 0.9361 - loss: 0.1221 - val_accuracy: 0.9371 - val_loss: 0.1217
Epoch 3/30
[1m5208/5211[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.9407 - loss: 0.1076
Epoch 3: val_accuracy improved from 0.93707 to 0.93833, saving model to best_tf_emotion_model.h5




[1m5211/5211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 21ms/step - accuracy: 0.9407 - loss: 0.1076 - val_accuracy: 0.9383 - val_loss: 0.1187
Epoch 4/30
[1m5209/5211[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.9423 - loss: 0.1023
Epoch 4: val_accuracy improved from 0.93833 to 0.93856, saving model to best_tf_emotion_model.h5




[1m5211/5211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 20ms/step - accuracy: 0.9423 - loss: 0.1023 - val_accuracy: 0.9386 - val_loss: 0.1250
Epoch 5/30
[1m5210/5211[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.9431 - loss: 0.1000
Epoch 5: val_accuracy did not improve from 0.93856
[1m5211/5211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 20ms/step - accuracy: 0.9431 - loss: 0.1000 - val_accuracy: 0.9383 - val_loss: 0.1247
Epoch 6/30
[1m5209/5211[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.9444 - loss: 0.0974
Epoch 6: val_accuracy did not improve from 0.93856
[1m5211/5211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 20ms/step - accuracy: 0.9444 - loss: 0.0974 - val_accuracy: 0.9382 - val_loss: 0.1386
Epoch 6: early stopping
Restoring model weights from the end of the best epoch: 3.


In [12]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Generate predictions
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

# Print classification report
target_names = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=target_names))

[1m2606/2606[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 6ms/step - accuracy: 0.9380 - loss: 0.1190
Test Loss: 0.1187
Test Accuracy: 0.9383
[1m2606/2606[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 5ms/step
Classification Report:
              precision    recall  f1-score   support

     sadness       1.00      0.95      0.97     24238
         joy       1.00      0.91      0.95     28214
        love       0.77      1.00      0.87      6911
       anger       0.92      0.96      0.94     11463
        fear       0.91      0.89      0.90      9542
    surprise       0.73      1.00      0.84      2994

    accuracy                           0.94     83362
   macro avg       0.89      0.95      0.91     83362
weighted avg       0.95      0.94      0.94     83362

