In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Read training and validation datasets
def _load_split(csv_path):
    """Load one dataset split from CSV.

    Returns a ``(frame, inputs, labels)`` tuple where ``inputs`` is the
    ``input_str`` column and ``labels`` is the ``label`` column, both as
    plain Python lists.
    """
    # Force the sequence column to be read as text: the inputs are tokenized
    # character by character below, and without an explicit dtype pandas may
    # infer a numeric type for digit-only values (dropping leading zeros and
    # handing non-string objects to the tokenizer).
    frame = pd.read_csv(csv_path, dtype={'input_str': str})
    return frame, frame['input_str'].tolist(), frame['label'].tolist()


train_seq_df, train_seq_X, train_seq_Y = _load_split("datasets/train/train_text_seq.csv")
valid_seq_df, valid_seq_X, valid_seq_Y = _load_split("datasets/valid/valid_text_seq.csv")

# Character-level tokenization.
# The vocabulary is learned from the training split only. An explicit OOV
# token is reserved so that characters which occur only in the validation
# split are mapped to a dedicated index; without it texts_to_sequences drops
# unknown characters, which shortens those sequences and shifts every
# following character to a different timestep.
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True, oov_token='<OOV>')
tokenizer.fit_on_texts(train_seq_X)  # Fit only on training data

# Convert each string into a list of integer character ids
train_sequences = tokenizer.texts_to_sequences(train_seq_X)
valid_sequences = tokenizer.texts_to_sequences(valid_seq_X)

# The longest training sequence defines the fixed number of timesteps
max_len = max(len(seq) for seq in train_sequences)

# Pad with zeros at the end so every sequence has exactly max_len timesteps.
# NOTE(review): pad_sequences truncates from the front by default, so a
# validation sequence longer than max_len would lose its leading characters.
X_train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
X_valid_padded = pad_sequences(valid_sequences, maxlen=max_len, padding='post')

# One-hot encode every timestep. Index 0 is the padding value, hence the +1
# on top of the tokenizer vocabulary (which already includes the OOV token).
vocab_size = len(tokenizer.word_index) + 1
X_train = tf.keras.utils.to_categorical(X_train_padded, num_classes=vocab_size)
X_valid = tf.keras.utils.to_categorical(X_valid_padded, num_classes=vocab_size)

# Encode labels as consecutive integers; the mapping is learned from the
# training labels and then reused unchanged for the validation labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_seq_Y)  # Fit on training labels
y_valid = label_encoder.transform(valid_seq_Y)      # Transform validation labels

# Build the LSTM model.
# The input is declared with an explicit Input object instead of passing
# `input_shape` to the first layer, which newer Keras versions warn against
# for Sequential models. Each sample is a (max_len, vocab_size) matrix of
# one-hot encoded characters.
# NOTE(review): padded timesteps are one-hot encoded as class 0 and are not
# masked, so the LSTM also processes the trailing padding.
model = Sequential([
    tf.keras.Input(shape=(max_len, vocab_size)),
    LSTM(64, recurrent_dropout=0.2),
    Dropout(0.3),
    # One output unit per label class seen during training
    Dense(len(label_encoder.classes_), activation='softmax'),
])

# Compile the model. The sparse loss is used because the labels are integer
# class ids produced by the label encoder rather than one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Stop training once the validation loss has failed to improve for five
# consecutive epochs, and roll the model back to its best-scoring weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True, patience=5, monitor='val_loss'
)

# Fit on the training split while tracking the validation split each epoch;
# the returned History object keeps the per-epoch metrics.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping],
    batch_size=32,
    epochs=30,
)

# Evaluate on the validation set. No separate test split is used here, so
# the reported figure is a validation accuracy and is labelled as such.
val_loss, val_accuracy = model.evaluate(X_valid, y_valid)
# Keep the original variable names available for any code that still reads them.
test_loss, test_accuracy = val_loss, val_accuracy
print(f"\nFinal Validation Accuracy: {val_accuracy*100:.2f}%")

# Optional: make predictions on the validation set. The most probable class
# index per sample is mapped back to its original label value.
predictions = model.predict(X_valid)
predicted_labels = label_encoder.inverse_transform(predictions.argmax(axis=1))

Epoch 1/50




[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 66ms/step - accuracy: 0.5117 - loss: 0.8035 - val_accuracy: 0.5562 - val_loss: 0.6900 - learning_rate: 0.0010
Epoch 2/50
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 59ms/step - accuracy: 0.5286 - loss: 0.7125 - val_accuracy: 0.5174 - val_loss: 0.6982 - learning_rate: 0.0010
Epoch 3/50
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 60ms/step - accuracy: 0.5217 - loss: 0.7003 - val_accuracy: 0.5337 - val_loss: 0.6946 - learning_rate: 0.0010
Epoch 4/50
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 58ms/step - accuracy: 0.5475 - loss: 0.6966 - val_accuracy: 0.5399 - val_loss: 0.6911 - learning_rate: 0.0010
Epoch 5/50
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 78ms/step - accuracy: 0.5307 - loss: 0.6914 - val_accuracy: 0.5378 - val_loss: 0.6858 - learning_rate: 5.0000e-04
Epoch 6/50
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 