In [2]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp313-cp313-win_amd64.whl.metadata (4.6 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt_einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Downloading termcolor-3.1.0-py3-none-any.whl.metadata (6.4 kB)
Collecting wrapt>=1.11.0 (from tensorflow)
  Downloading wrapt-1.17.3-cp313-cp313-win_am


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# Imports & Seed
import os
import random
import pickle
import optuna
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Reproducibility: apply one seed to the Python, NumPy and TensorFlow RNGs
SEED = 42
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

# Record the TensorFlow version used for this run
print(tf.__version__)



2.20.0


In [2]:
# Load the preprocessed comments, then discard rows that have no comment text
df = pd.read_csv('reddit_preprocessing.csv')
df = df.dropna(subset=['clean_comment'])
df.head()

Unnamed: 0,clean_comment,category
0,"film absolutely awful, but nevertheless, hilar...",0
1,well since seeing part 1 3 honestly say never ...,0
2,got see film preview dazzled it. not typical r...,1
3,adaptation positively butcher classic beloved ...,0
4,rzone awful movie! simple. seems tried make mo...,0


In [3]:
# Cast the target column to integer labels (0/1)
df['category'] = df['category'].astype(int)

# Hold out 20% of the rows for testing; stratify on the label so both
# parts keep the same class ratio
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['clean_comment'], df['category'],
    test_size=0.2, random_state=SEED, stratify=df['category'],
)

# Quick check: split sizes and mean label of each part
len(X_train_text), len(X_test_text), y_train.mean(), y_test.mean()

(19923, 4981, np.float64(0.5007779952818351), np.float64(0.5009034330455732))

In [4]:
# Tokenize + Pad (the vocabulary is fitted on the training split only)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_WORDS = 50000  # vocabulary cap (tune as needed)

train_texts = X_train_text.tolist()
test_texts = X_test_text.tolist()

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Convert each text into a list of integer token ids
X_train_seq = tokenizer.texts_to_sequences(train_texts)
X_test_seq = tokenizer.texts_to_sequences(test_texts)

# Sequence length: 95th percentile of the non-empty training lengths,
# capped at 300 and never below 20; falls back to 50 when every sequence is empty
lengths = np.array([len(seq) for seq in X_train_seq])
nonempty_lengths = lengths[lengths > 0]
if nonempty_lengths.size > 0:
    max_len = int(min(300, np.percentile(nonempty_lengths, 95)))
else:
    max_len = 50
max_len = max(max_len, 20)  # ensure minimum

print(f"Chosen max_len = {max_len}")

# Pad / truncate at the end of each sequence so all rows share max_len
pad_kwargs = dict(maxlen=max_len, padding='post', truncating='post')
X_train_pad = pad_sequences(X_train_seq, **pad_kwargs)
X_test_pad = pad_sequences(X_test_seq, **pad_kwargs)

X_train_pad.shape, X_test_pad.shape

Chosen max_len = 300


((19923, 300), (4981, 300))

In [5]:
# Class weights: 'balanced' weights for labels 0 and 1, keyed by integer class id
classes = np.array([0, 1])
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train.values,
)
class_weight_dict = dict(zip(classes.tolist(), class_weights))
class_weight_dict

{0: np.float64(1.0015584154433943), 1: np.float64(0.9984464267815977)}

In [6]:
# Optuna Objective (build & evaluate LSTM)
def build_lstm_model(
    vocab_size,
    max_len,
    embedding_dim=128,
    lstm_units=128,
    dense_units=64,
    dropout_rate=0.3,
    recurrent_dropout=0.0,
    lr=1e-3,
    mask_zero=True
):
    """Build and compile an Embedding -> LSTM -> Dense binary classifier.

    Args:
        vocab_size: Number of rows in the embedding table (largest token id + 1).
        max_len: Length of every input sequence of token ids.
        embedding_dim: Size of each token embedding vector.
        lstm_units: Number of units in the LSTM layer.
        dense_units: Number of units in the hidden ReLU dense layer.
        dropout_rate: Rate used both for the LSTM input dropout and for the
            Dropout layer that follows the dense layer.
        recurrent_dropout: Dropout rate applied to the LSTM recurrent state.
        lr: Learning rate for the Adam optimizer.
        mask_zero: When True, token id 0 is treated as padding and masked, so
            the LSTM skips those timesteps. The notebook pads with zeros at
            the end of each sequence; without masking the final LSTM state is
            computed after all of the trailing pad steps.

    Returns:
        A compiled ``keras.Model`` with one sigmoid output, trained with
        binary cross-entropy and tracking accuracy.
    """
    inputs = keras.Input(shape=(max_len,), dtype="int32")
    # `input_length` is deprecated in Keras 3 and only triggers a warning;
    # the sequence length is already fixed by the Input layer above.
    x = layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        mask_zero=mask_zero
    )(inputs)
    x = layers.LSTM(lstm_units, dropout=dropout_rate, recurrent_dropout=recurrent_dropout, return_sequences=False)(x)
    x = layers.Dense(dense_units, activation="relu")(x)
    x = layers.Dropout(dropout_rate)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)

    model = keras.Model(inputs, outputs)
    optimizer = keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])
    return model


def objective(trial):
    """Optuna objective: train one LSTM configuration and score it on a validation split.

    Reads the notebook-level names ``X_train_pad``, ``y_train``, ``tokenizer``,
    ``MAX_WORDS``, ``max_len``, ``SEED`` and ``class_weight_dict``.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to record
            intermediate values.

    Returns:
        Validation accuracy of the trained model, using a 0.5 threshold on the
        predicted probabilities.
    """
    # Drop Keras state left over from earlier trials; otherwise every trial's
    # model stays referenced and memory grows over the course of the study.
    keras.backend.clear_session()

    # Hyperparameter search space
    embedding_dim     = trial.suggest_categorical("embedding_dim", [64, 128, 192, 256])
    lstm_units        = trial.suggest_categorical("lstm_units", [64, 96, 128, 160, 192])
    dense_units       = trial.suggest_categorical("dense_units", [32, 64, 96, 128])
    dropout_rate      = trial.suggest_float("dropout_rate", 0.1, 0.5)
    recurrent_dropout = trial.suggest_float("recurrent_dropout", 0.0, 0.3)
    lr                = trial.suggest_float("lr", 1e-4, 5e-3, log=True)
    batch_size        = trial.suggest_categorical("batch_size", [32, 48, 64, 96])
    epochs            = trial.suggest_int("epochs", 5, 15)

    # Train/Validation split from training set (fixed seed: identical split in every trial)
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train_pad, y_train.values,
        test_size=0.15, random_state=SEED, stratify=y_train.values
    )

    model = build_lstm_model(
        # Token ids produced with num_words=MAX_WORDS stay below MAX_WORDS;
        # +1 accounts for id 0, which the tokenizer never assigns to a word.
        vocab_size=min(MAX_WORDS, len(tokenizer.word_index) + 1),
        max_len=max_len,
        embedding_dim=embedding_dim,
        lstm_units=lstm_units,
        dense_units=dense_units,
        dropout_rate=dropout_rate,
        recurrent_dropout=recurrent_dropout,
        lr=lr
    )

    callbacks = [
        keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True),
        keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, verbose=0, min_lr=1e-5)
    ]

    history = model.fit(
        X_tr, y_tr,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        class_weight=class_weight_dict,
        verbose=0,
        callbacks=callbacks
    )

    # Evaluate on validation
    val_preds = (model.predict(X_val, batch_size=256, verbose=0) >= 0.5).astype(int).ravel()
    val_acc = accuracy_score(y_val, val_preds)

    # Report the validation accuracy of each epoch that actually ran. Early
    # stopping can end training before `epochs`, so the step is taken from the
    # recorded history rather than from the requested epoch count.
    for epoch_idx, epoch_val_acc in enumerate(history.history.get("val_accuracy", []), start=1):
        trial.report(float(epoch_val_acc), step=epoch_idx)

    return val_acc

In [None]:
# Run Optuna Study
# Seed the sampler so the sequence of suggested hyperparameters is repeatable;
# TPE is Optuna's default sampler, so only the seeding changes.
sampler = optuna.samplers.TPESampler(seed=SEED)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=25, show_progress_bar=True)

print("Best Trial:", study.best_trial.number)
print("Best Accuracy:", study.best_value)
print("Best Params:", study.best_params)

[I 2025-08-14 22:31:25,224] A new study created in memory with name: no-name-e40d8c1b-4498-4145-a1ac-ff9ac4e02e8e


  0%|          | 0/25 [00:00<?, ?it/s]

