In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import pickle
import optuna
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('reddit_preprocessing.csv').dropna(subset=['clean_comment'])
df.head()

Unnamed: 0,clean_comment,category
0,"film absolutely awful, but nevertheless, hilar...",0
1,well since seeing part 1 3 honestly say never ...,0
2,got see film preview dazzled it. not typical r...,1
3,adaptation positively butcher classic beloved ...,0
4,rzone awful movie! simple. seems tried make mo...,0


In [3]:
X = df['clean_comment']
y = df['category'] 

In [4]:
# Train-Test Split + TF-IDF + SMOTE

ngram_range = (1, 3)  # Trigram
max_features = 10000

X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'], df['category'],
    test_size=0.2, random_state=42, stratify=df['category']
)

vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_vec, y_train = smote.fit_resample(X_train_vec, y_train)

print("Training shape after SMOTE:", X_train_vec.shape)

Training shape after SMOTE: (19954, 10000)


In [5]:
# Optuna objective function for XGBoost

def objective_xgboost(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'random_state': 42,
        'eval_metric': 'logloss',
        'use_label_encoder': False
    }

    model = XGBClassifier(**params)
    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)
    return accuracy_score(y_test, preds)

In [6]:
# Run Optuna for Hyperparameter Tuning
study = optuna.create_study(direction="maximize")
study.optimize(objective_xgboost, n_trials=20)  # Adjust n_trials if needed

print("Best Parameters:", study.best_params)
print("Best Accuracy:", study.best_value)

# Train final model with best params
best_model = XGBClassifier(**study.best_params, random_state=42, eval_metric='logloss', use_label_encoder=False)
best_model.fit(X_train_vec, y_train)

[I 2025-08-14 13:02:17,321] A new study created in memory with name: no-name-011256ab-9fa6-41bb-8a6a-35c5c5ce679c
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-08-14 13:04:25,061] Trial 0 finished with value: 0.6952419192933146 and parameters: {'n_estimators': 259, 'learning_rate': 0.0002861731505246053, 'max_depth': 4, 'subsample': 0.7359349934180666, 'colsample_bytree': 0.9757923054243242}. Best is trial 0 with value: 0.6952419192933146.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-08-14 13:10:14,070] Trial 1 finished with value: 0.8138927926119253 and parameters: {'n_estimators': 123, 'learning_rate': 0.02500078407130143, 'max_depth': 10, 'subsample': 0.8293163136590181, 'colsample_bytree': 0.8519817905493616}. Best is trial 1 with value: 0.8138927926119253.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-08-14 13:1

Best Parameters: {'n_estimators': 150, 'learning_rate': 0.09011526554823635, 'max_depth': 9, 'subsample': 0.9933283305239757, 'colsample_bytree': 0.6050847927667957}
Best Accuracy: 0.8514354547279662


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
