In [1]:
!pip install mlflow boto3 awscli optuna xgboost imbalanced-learn

Collecting mlflow
  Downloading mlflow-3.1.4-py3-none-any.whl.metadata (29 kB)
Collecting boto3
  Downloading boto3-1.39.12-py3-none-any.whl.metadata (6.7 kB)
Collecting awscli
  Downloading awscli-1.41.12-py3-none-any.whl.metadata (11 kB)
Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting mlflow-skinny==3.1.4 (from mlflow)
  Downloading mlflow_skinny-3.1.4-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.4->mlflow)
  Downloading databricks_sdk-0.60.0-py3-none-any.whl.metadata (39 kB)
Collec

In [3]:
!aws configure

AWS Access Key ID [****************FFOP]: 
AWS Secret Access Key [****************ecyZ]: 
Default region name [ap-southeast-2]: 
Default output format [None]: 


In [4]:
import mlflow
mlflow.set_tracking_uri("http://ec2-3-107-12-217.ap-southeast-2.compute.amazonaws.com:5000/")

In [5]:
mlflow.set_experiment("XGBoost with HP Tuning")

2025/07/24 12:38:53 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost with HP Tuning' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://jee-371-bucket/5', creation_time=1753360733423, experiment_id='5', last_update_time=1753360733423, lifecycle_stage='active', name='XGBoost with HP Tuning', tags={}>

In [6]:
import pandas as pd
import mlflow
import mlflow.sklearn
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [7]:
df = pd.read_csv('/content/cleaned_data.csv').dropna(subset=['clean_comment'])
df.shape

(36662, 2)

In [8]:
df['category'] = df['category'].map({-1: 2, 0: 0, 1: 1})
df = df.dropna(subset=['category'])

In [9]:
ngram_range = (1, 3)
max_features = 10000

X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'],
    df['category'],
    test_size=0.2,
    random_state=42,
    stratify=df['category']
)

In [10]:
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [11]:
smote = SMOTE(random_state=42)
X_train_vec, y_train = smote.fit_resample(X_train_vec, y_train)

## MLflow Logging per Run

- **Parameters**:
  - Vectorizer type: `TF-IDF`
  - N-gram range: `(1, 3)`
  - `max_features`: `10000`
  - Model hyperparameters:
    - `n_estimators` (varied via Optuna)
    - `learning_rate` (varied via Optuna)
    - `max_depth` (varied via Optuna)
  - Imbalance handling method: `SMOTE`

- **Metrics**:
  - Accuracy
  - Per-class Precision, Recall, and F1-score (from `classification_report`)


In [12]:
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run():
        mlflow.set_tag("mlflow.runName", f"{model_name}_SMOTE_TFIDF_Trigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")
        mlflow.log_param("algo_name", model_name)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))

        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        mlflow.sklearn.log_model(model, f"{model_name}_model")

In [13]:
def objective_xgboost(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
    max_depth = trial.suggest_int('max_depth', 3, 10)

    model = XGBClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42
    )

    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    return accuracy_score(y_test, y_pred)

In [14]:
def run_optuna_experiment():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_xgboost, n_trials=30)

    best_params = study.best_params
    best_model = XGBClassifier(
        n_estimators=best_params['n_estimators'],
        learning_rate=best_params['learning_rate'],
        max_depth=best_params['max_depth'],
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42
    )

    log_mlflow("XGBoost", best_model, X_train_vec, X_test_vec, y_train, y_test)

run_optuna_experiment()

[I 2025-07-24 12:39:56,950] A new study created in memory with name: no-name-6863a900-9388-4d3d-925b-07e31acf6359
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-24 12:45:29,477] Trial 0 finished with value: 0.6499386335742534 and parameters: {'n_estimators': 141, 'learning_rate': 0.011770962843642015, 'max_depth': 7}. Best is trial 0 with value: 0.6499386335742534.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-24 12:49:11,527] Trial 1 finished with value: 0.5674348834037911 and parameters: {'n_estimators': 270, 'learning_rate': 0.002076533091631381, 'max_depth': 4}. Best is trial 0 with value: 0.6499386335742534.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-24 13:05:58,087] Trial 2 finished with value: 0.6393017864448384 and parameters: {'n_estimators': 281, 'learning_rate': 0.0028856907808394445, 'max_depth'

🏃 View run XGBoost_SMOTE_TFIDF_Trigrams at: http://ec2-3-107-12-217.ap-southeast-2.compute.amazonaws.com:5000/#/experiments/5/runs/d1d496196182480493cc28c62074cb25
🧪 View experiment at: http://ec2-3-107-12-217.ap-southeast-2.compute.amazonaws.com:5000/#/experiments/5
