In [None]:
# Install essential packages (if not already)
!pip install -q mlflow imbalanced-learn optuna lightgbm boto3 awscli

# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
import optuna
import warnings
warnings.filterwarnings('ignore')
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/395.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Point MLflow at the remote tracking server. The MLFLOW_TRACKING_URI
# environment variable takes precedence so the notebook keeps working when the
# server's address changes; the literal below is only the fallback.
import os

MLFLOW_TRACKING_URI = os.environ.get(
    "MLFLOW_TRACKING_URI",
    "http://ec2-3-25-95-124.ap-southeast-2.compute.amazonaws.com:5000/",
)
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

<Experiment: artifact_location='s3://yt-mlflow-bkt/380195777977080659', creation_time=1751793285956, experiment_id='380195777977080659', last_update_time=1751793285956, lifecycle_stage='active', name='RF baseline model', tags={}>

In [None]:
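# SECURITY: AWS credentials were previously pasted here in plain text and have
# been removed. Treat that key pair as compromised: deactivate and rotate it in
# IAM. Supply credentials through environment variables or an IAM role instead
# of writing them into the notebook.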

In [None]:
# Provide AWS credentials through environment variables instead of the
# interactive `!aws configure`, whose prompts echo the secret into the saved
# cell output. Values already present in the environment are kept; otherwise
# they are requested with getpass, which does not echo or store the input.
import os
from getpass import getpass

for _var in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
    if not os.environ.get(_var):
        os.environ[_var] = getpass(f"{_var}: ")

# Same default region that was previously entered at the `aws configure` prompt.
os.environ.setdefault("AWS_DEFAULT_REGION", "ap-southeast-2")

AWS Access Key ID [None]: ****************
AWS Secret Access Key [None]: ****************
Default region name [None]: ap-southeast-2
Default output format [None]: 


### **Experiment 5: Detailed Hyperparameter Tuning on the Best Model Derived from Experiment 4: LightGBM**


In [None]:
# Seed shared by the train/test split and SMOTE so the prepared data is repeatable.
SEED = 42

# Original label -> class id. The negative label -1 is remapped to 2 so that
# all class ids are non-negative.
LABEL_MAP = {-1: 2, 0: 0, 1: 1}

# Load dataset. Rows with a missing cleaned comment are dropped because
# TfidfVectorizer raises on NaN documents (an empty string written to CSV is
# read back as NaN).
data = pd.read_csv('preprocessed_data.csv').dropna(subset=['clean_comment'])
data = data.assign(category=data['category'].map(LABEL_MAP))

# Series.map turns any label outside LABEL_MAP into NaN; fail with a clear
# message here instead of an obscure error further down the pipeline.
if data['category'].isna().any():
    raise ValueError(
        "preprocessed_data.csv contains 'category' values outside {-1, 0, 1}"
    )
data['category'] = data['category'].astype(int)

X = data['clean_comment']
y = data['category']

# Split data (stratified so both splits keep the class proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Set MLflow experiment
mlflow.set_experiment("exp5: detailed tuning best model (LightGBM)")

# TFIDF Vectorizer (1,3) grams, max 1000 features.
# Fitted on the training split only; the test split is transformed with the
# vocabulary learned from training data.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=1000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# SMOTE resampling, applied to the training split only so the test split
# keeps its original class distribution.
sampler = SMOTE(random_state=SEED)
X_resampled, y_resampled = sampler.fit_resample(X_train_vec, y_train)

# Optuna objective function for LightGBM
def lgbm_objective(trial):
    """Train one LightGBM candidate, log it to MLflow, and return its accuracy.

    Reads the notebook-level ``X_resampled`` / ``y_resampled`` (training data)
    and ``X_test_vec`` / ``y_test`` (evaluation data).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the fitted model on ``X_test_vec`` / ``y_test``.

    NOTE(review): the value being maximised is computed on the test split, so
    the best score reported by the study is an optimistic estimate. Consider
    scoring on a separate validation split or with cross-validation and
    keeping the test split for a single final evaluation.
    """
    with mlflow.start_run(run_name="LightGBM_DetailedTuning"):
        mlflow.set_tag("experiment_type", "Detailed Tuning")
        # All runs share one run name; the trial number tells them apart.
        mlflow.set_tag("optuna_trial_number", trial.number)
        mlflow.log_param("model", "LightGBM")
        mlflow.log_param("vectorizer_type", "TFIDF")
        mlflow.log_param("max_features", 1000)

        # Detailed hyperparameter tuning space. Kept in a single dict so the
        # values logged to MLflow are exactly the ones passed to the model.
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 800),
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.3, log=True),
            "max_depth": trial.suggest_int("max_depth", 5, 50),
            "num_leaves": trial.suggest_int("num_leaves", 20, 150),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 5.0, log=True),  # L1 regularization
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 5.0, log=True),  # L2 regularization
        }

        # LightGBM only applies `subsample` (bagging_fraction) when bagging is
        # enabled through a non-zero `subsample_freq` (bagging_freq). Without
        # it the tuned `subsample` value has no effect on the model.
        fixed_params = {"subsample_freq": 1}

        mlflow.log_params({**params, **fixed_params})

        model = LGBMClassifier(**params, **fixed_params, random_state=42)

        model.fit(X_resampled, y_resampled)
        y_pred = model.predict(X_test_vec)

        acc = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", acc)

        mlflow.sklearn.log_model(model, "lgbm_model")

        return acc

# Run Optuna study for 100 detailed trials.
# TPE is Optuna's default sampler; it is created explicitly here only to give
# it a seed, so the sequence of proposed hyperparameters is repeatable across
# runs (given identical objective values).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(lgbm_objective, n_trials=100)

# Print best accuracy found, and the hyperparameters that produced it
print(f"[LightGBM Detailed Tuning] Best Accuracy: {study.best_value:.4f}")
print(f"[LightGBM Detailed Tuning] Best Params: {study.best_params}")

optuna.visualization.plot_optimization_history(study).show()
optuna.visualization.plot_param_importances(study).show()
