In [5]:
import numpy as np
import pandas as pd 
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.optimizers import SGD, Adam
import bz2
import csv
from sklearn.metrics import roc_auc_score




In [7]:
# Load the training data 
training_data = bz2.BZ2File("C:\\Users\\raghu\\CS6120_mywork_NLP\\Fianl_project\\train.ft.txt.bz2")
training_data = training_data.readlines()
training_data = [x.decode('utf-8') for x in training_data]
print(len(training_data))

# Load the test data 
test_data = bz2.BZ2File("C:\\Users\\raghu\\CS6120_mywork_NLP\\Fianl_project\\test.ft.txt.bz2")
test_data = test_data.readlines()
test_data = [x.decode('utf-8') for x in test_data]
print(len(test_data))

3600000
400000


In [8]:
# Split the data into labels and texts
traing_labels = [int(re.findall(r'__label__(\d)', line)[0]) for line in training_data]
trainig_texts = [re.sub(r'__label__\d ', '', line) for line in training_data]

test_labels = [int(re.findall(r'__label__(\d)', line)[0]) for line in test_data]
test_texts = [re.sub(r'__label__\d ', '', line) for line in test_data]

# Convert labels to binary (0 and 1)
traing_labels = [0 if label == 1 else 1 for label in traing_labels]
test_labels = [0 if label == 1 else 1 for label in test_labels]

In [9]:
# Tokenization and padding
max_words = 1000
max_sequence_length = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(trainig_texts)

X_train = tokenizer.texts_to_sequences(trainig_texts)
X_test = tokenizer.texts_to_sequences(test_texts)

X_train = pad_sequences(X_train, maxlen=max_sequence_length)
X_test = pad_sequences(X_test, maxlen=max_sequence_length)

In [10]:
X_train = np.array(X_train)
print(X_train.shape)
X_test = np.array(X_test)
print(X_test.shape)
y_train = np.array(traing_labels)
print(y_train.shape)
y_test = np.array(test_labels)
print(y_test.shape)

(3600000, 100)
(400000, 100)
(3600000,)
(400000,)


In [12]:
def create_model(optimizer, activation_function, lstm_layers):
    model = Sequential()
    model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length))
    for _ in range(lstm_layers):
        model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(128, return_sequences=False))
    model.add(Dense(1, activation=activation_function))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Define different configurations
configurations = [
    {"optimizer": SGD(learning_rate=0.1), "activation_function": 'sigmoid', "lstm_layers": 1},
    # {"optimizer": Adam(learning_rate=0.001), "activation_function": 'relu', "lstm_layers": 2},
]

results = []

for config in configurations:
    model = create_model(config["optimizer"], config["activation_function"], config["lstm_layers"])
    model.fit(X_train, y_train, epochs=1, batch_size=2048, verbose=1)
    loss, accuracy = model.evaluate(X_test, y_test)
    results.append({"config": config, "loss": loss, "accuracy": accuracy})



In [13]:
for result in results:
    config = result["config"]
    loss = result["loss"]
    accuracy = result["accuracy"]
    print(f"Configuration: Optimizer={config['optimizer']}, Activation={config['activation_function']}, LSTM Layers={config['lstm_layers']}")
    print(f"Test Loss: {loss}, Test Accuracy: {accuracy}\n")

Configuration: Optimizer=<keras.src.optimizers.sgd.SGD object at 0x00000218729F8190>, Activation=sigmoid, LSTM Layers=1
Test Loss: 0.48623278737068176, Test Accuracy: 0.7682899832725525



In [14]:
# Evaluate with a confusion matrix and classification report
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Confusion Matrix:
[[168966  31034]
 [ 61650 138350]]
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.84      0.78    200000
           1       0.82      0.69      0.75    200000

    accuracy                           0.77    400000
   macro avg       0.77      0.77      0.77    400000
weighted avg       0.77      0.77      0.77    400000

