In [None]:
# Install essential packages (if not already)
!pip install -q mlflow imbalanced-learn optuna lightgbm boto3 awscli

# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import optuna
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/395.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Remote MLflow tracking server (EC2 host, port 5000) that receives every run
# logged from this notebook.
MLFLOW_TRACKING_URI = 'http://ec2-3-25-95-124.ap-southeast-2.compute.amazonaws.com:5000/'
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

<Experiment: artifact_location='s3://yt-mlflow-bkt/380195777977080659', creation_time=1751793285956, experiment_id='380195777977080659', last_update_time=1751793285956, lifecycle_stage='active', name='RF baseline model', tags={}>

In [None]:
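# [REDACTED] AWS access key ID, secret access key and region were pasted here.
# Never store credentials in a notebook: rotate the exposed key pair and supply
# credentials through environment variables (AWS_ACCESS_KEY_ID,
# AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION) or an IAM role instead.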

In [None]:
# NOTE(review): this interactive prompt echoes the typed access key and secret
# into the saved cell output. Clear the output before sharing or committing the
# notebook, or supply credentials through environment variables instead.
!aws configure

AWS Access Key ID [None]: [REDACTED]
AWS Secret Access Key [None]: [REDACTED]
Default region name [None]: ap-southeast-2
Default output format [None]: 


#### LightGBM

In [None]:
# Load dataset
# read_csv parses empty text cells as NaN and TfidfVectorizer refuses NaN
# documents, so rows whose cleaned comment is missing are dropped up front.
LABEL_MAP = {-1: 2, 0: 0, 1: 1}  # raw label -> class id (presumably to keep class ids non-negative)
data = (
    pd.read_csv('preprocessed_data.csv')
    .dropna(subset=['clean_comment'])
    .assign(category=lambda frame: frame['category'].map(LABEL_MAP))
)

# Series.map yields NaN for any label that is not a key of LABEL_MAP; stop here
# with a clear message instead of failing later inside the split / SMOTE / fit.
if data['category'].isna().any():
    raise ValueError(
        "preprocessed_data.csv contains missing or unexpected 'category' values; "
        f"expected only {sorted(LABEL_MAP)}"
    )

X = data['clean_comment']
y = data['category']

# Split data (stratified so both splits keep the class proportions of y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Set experiment
mlflow.set_experiment("exp4: multiple models with Optuna tuning")

# Vectorizer: uni- to tri-gram TF-IDF capped at 1000 features, fitted on the
# training split only and then applied to the test split.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=1000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# SMOTE resampling: only the training split is oversampled; the test split is
# left untouched.
sampler = SMOTE(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X_train_vec, y_train)

# MLflow logging function
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    """Fit ``model``, score it on the evaluation split and record the run in MLflow.

    One MLflow run is opened per call. It receives the run-name and
    experiment-type tags, the algorithm name, the estimator's own
    hyperparameters (when it exposes ``get_params``), the overall accuracy and
    every nested metric found in ``classification_report``.

    Parameters
    ----------
    model_name : str
        Label used in the run name, the ``algo_name`` param and the printout.
    model : estimator
        Object providing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Data the model is fitted on.
    X_test, y_test
        Data the model is scored on.

    Returns
    -------
    float
        Accuracy of the fitted model on ``(X_test, y_test)``.
    """
    with mlflow.start_run():
        mlflow.set_tag('mlflow.runName', f'{model_name}_smote_tfidf_trigram')
        mlflow.set_tag('experiment_type', 'algorithm_comparison')
        mlflow.log_param('algo_name', model_name)

        # Record the estimator's hyperparameters so the logged metrics can be
        # tied to the exact configuration that produced them. Values are
        # stringified and truncated because MLflow rejects over-long values.
        if hasattr(model, 'get_params'):
            mlflow.log_params(
                {name: str(value)[:250]
                 for name, value in model.get_params(deep=False).items()}
            )

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        mlflow.log_metric('accuracy', acc)

        # The report maps each label / average row to a dict of metrics; scalar
        # entries are skipped (accuracy is already logged above).
        class_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in class_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # mlflow.sklearn.log_model(model, f'{model_name}_model')
    print(f"[{model_name}] Accuracy: {acc:.4f}")
    return acc

# Optuna objective
# Tuning split cache, keyed by (val_size, seed), so the split and SMOTE are
# computed once per study instead of once per trial. Re-executing this code
# resets the cache.
_LGBM_TUNING_CACHE = {}


def _lgbm_tuning_data(val_size=0.2, seed=42):
    """Return ``(X_fit, y_fit, X_val, y_val)`` used to score Optuna trials.

    A stratified validation fold is carved out of the vectorised training data
    and SMOTE is applied to the remaining fit fold only, so the validation fold
    contains no synthetic rows.

    NOTE: the TF-IDF vectorizer was fitted on the whole training split, so the
    validation fold shares its vocabulary/IDF statistics with the fit fold.
    """
    key = (val_size, seed)
    if key not in _LGBM_TUNING_CACHE:
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_train_vec, y_train,
            test_size=val_size, random_state=seed, stratify=y_train,
        )
        X_fit_bal, y_fit_bal = SMOTE(random_state=seed).fit_resample(X_fit, y_fit)
        _LGBM_TUNING_CACHE[key] = (X_fit_bal, y_fit_bal, X_val, y_val)
    return _LGBM_TUNING_CACHE[key]


def objective_lgbm(trial):
    """Optuna objective: validation accuracy of a LightGBM classifier.

    Hyperparameters are sampled from ``trial``; the model is fitted on the
    SMOTE-balanced fit fold and scored on a validation fold taken from the
    training data. The test split is deliberately not used here: selecting
    hyperparameters on it would make the accuracy later reported on that same
    split optimistically biased.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``learning_rate`` and
        ``num_leaves``.

    Returns
    -------
    float
        Accuracy on the validation fold (the study maximises it).
    """
    X_fit, y_fit, X_val, y_val = _lgbm_tuning_data()

    model = LGBMClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        # Log-uniform: learning rates are searched across orders of magnitude.
        learning_rate=trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        num_leaves=trial.suggest_int('num_leaves', 10, 100),
        random_state=42,
    )

    model.fit(X_fit, y_fit)
    return accuracy_score(y_val, model.predict(X_val))

# Run Optuna
def run_optuna_lgbm(n_trials=30, seed=42):
    """Tune LightGBM with Optuna, then train and log the best configuration.

    Parameters
    ----------
    n_trials : int, default 30
        Number of Optuna trials to run.
    seed : int, default 42
        Seed for the TPE sampler and for the final model's ``random_state``.
        Seeding the sampler makes the sequence of sampled hyperparameters
        repeatable across runs.

    The best hyperparameters found by ``objective_lgbm`` are used to build a
    fresh classifier, which ``log_mlflow`` fits on the SMOTE-resampled training
    data, scores on the test split and records in MLflow.
    """
    tpe_sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="maximize", sampler=tpe_sampler)
    study.optimize(objective_lgbm, n_trials=n_trials)

    # best_params holds exactly the names suggested inside the objective.
    best_model = LGBMClassifier(**study.best_params, random_state=seed)

    log_mlflow('LightGBM', best_model, X_resampled, X_test_vec, y_resampled, y_test)
    print(f"[LightGBM] Best params from Optuna: {study.best_params}")
    print(f"[LightGBM] Best Accuracy from Optuna: {study.best_value:.4f}")

run_optuna_lgbm()
