In [None]:
# Install essential packages (if not already)
!pip install -q mlflow imbalanced-learn optuna boto3 awscli

# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
import optuna
import warnings
warnings.filterwarnings('ignore')


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/395.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os

# Point MLflow at the remote tracking server. When the MLFLOW_TRACKING_URI
# environment variable is set (and non-empty) it takes precedence, so the
# notebook can target another server without editing this cell; otherwise the
# original server address is used.
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI")
    or 'http://ec2-3-25-95-124.ap-southeast-2.compute.amazonaws.com:5000/'
)

<Experiment: artifact_location='s3://yt-mlflow-bkt/380195777977080659', creation_time=1751793285956, experiment_id='380195777977080659', last_update_time=1751793285956, lifecycle_stage='active', name='RF baseline model', tags={}>

In [None]:
# SECURITY: an AWS access key ID and secret access key were previously pasted
# into this cell in plain text. They have been removed. Treat that key pair as
# compromised: deactivate and rotate it in IAM, and purge it from version history.
# Supply credentials at run time (environment variables, getpass, or a secrets
# manager) instead of storing them in the notebook.
# NOTE(review): this cell listed region eu-north-1, while the tracking server URL
# and the AWS configuration in this notebook use ap-southeast-2; confirm which
# region is intended.

In [None]:
# Provide AWS credentials to this kernel without writing them into the notebook.
# The interactive `aws configure` prompt echoed the key pair into the saved cell
# output; getpass keeps the typed values hidden and out of the output area.
# The values are placed in the process environment, where boto3 and any `!aws`
# shell commands started from this kernel pick them up.
import os
from getpass import getpass

for _env_name, _prompt in (
    ("AWS_ACCESS_KEY_ID", "AWS Access Key ID: "),
    ("AWS_SECRET_ACCESS_KEY", "AWS Secret Access Key: "),
):
    # Only prompt when the variable is not already provided by the environment.
    if not os.environ.get(_env_name):
        os.environ[_env_name] = getpass(_prompt)

# Same region that was previously entered at the `aws configure` prompt.
os.environ.setdefault("AWS_DEFAULT_REGION", "ap-southeast-2")


In [None]:
# Load your cleaned dataset
data = pd.read_csv('preprocessed_data.csv')

# Features and labels
X = data['clean_comment']
y = data['category']

# Global train-test split (same split for all experiments)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(X_train.shape, X_test.shape)

(29330,) (7333,)


### **Experiment 3: Imbalance Handling Techniques + max_features=1000**
**Goal**: Test different imbalance learning techniques:

- RandomUnderSampler

- SMOTE

- ADASYN

- SMOTEENN

- Class Weights (built into RandomForest)

In [None]:
# Experiment 3: train one RandomForest per imbalance-handling technique on the
# same TF-IDF features and record each run (params + metrics) in MLflow.
# Relies on mlflow, TfidfVectorizer, RandomForestClassifier, the imblearn
# samplers, accuracy_score and classification_report from the imports cell, and
# on X_train / X_test / y_train / y_test from the train-test split cell.
from collections import Counter

# Set experiment name
mlflow.set_experiment("exp3: imbalance learning techniques")

# Defined once so the value logged to MLflow cannot drift from the value used.
MAX_FEATURES = 1000
# Hyperparameters shared by every RandomForest in this experiment.
RF_PARAMS = {"n_estimators": 100, "max_depth": 15, "random_state": 42}

# Best vectorizer: fitted on the training split only, then applied to the test split.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=MAX_FEATURES)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Resampling configs
resampling_methods = {
    "RandomUnderSampler": RandomUnderSampler(random_state=42),
    "SMOTE": SMOTE(random_state=42),
    "ADASYN": ADASYN(random_state=42),
    "SMOTEENN": SMOTEENN(random_state=42),
    "ClassWeight": None  # no resampling — handle via model param
}

for method_name, sampler in resampling_methods.items():
    with mlflow.start_run(run_name=f"RF_{method_name}"):
        mlflow.set_tag("experiment_type", "Imbalance Handling Test")
        # One batched call instead of one request per parameter.
        mlflow.log_params({
            "vectorizer_type": "TfidfVectorizer",
            "max_features": MAX_FEATURES,
            "resampling_method": method_name,
        })

        # Only the training data is resampled; the test split is left untouched.
        # Explicit None check: do not rely on the truthiness of an estimator object.
        if sampler is not None:
            X_resampled, y_resampled = sampler.fit_resample(X_train_vec, y_train)
            mlflow.log_param("resampled_counts", dict(Counter(y_resampled)))
        else:
            X_resampled, y_resampled = X_train_vec, y_train

        # RandomForest (with or without class weights): only the "ClassWeight"
        # entry uses balanced weights; every other run keeps the default (None).
        class_weight = 'balanced' if method_name == "ClassWeight" else None
        model = RandomForestClassifier(class_weight=class_weight, **RF_PARAMS)

        model.fit(X_resampled, y_resampled)
        y_pred = model.predict(X_test_vec)

        # Accuracy
        acc = accuracy_score(y_test, y_pred)

        # Flatten the classification report into "<label>_<metric>" entries.
        # Non-dict entries of the report (the scalar accuracy) are skipped;
        # accuracy is taken from accuracy_score above instead.
        class_rep = classification_report(y_test, y_pred, output_dict=True)
        run_metrics = {"accuracy": acc}
        for label, metrics in class_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    run_metrics[f"{label}_{metric}"] = value

        # Single batched request to the tracking server for all metrics of this run.
        mlflow.log_metrics(run_metrics)

        # Confusion-matrix artifacts and the fitted model are not logged for these runs.

    # Print just accuracy
    print(f"[RandomForest + {method_name}] Accuracy: {acc:.4f}")
