In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import whylogs as why
from whylogs import ResultSet
from whylogs import log
from datetime import datetime
import shap
from skops.io import dump, load
import numpy as np

In [51]:
# 1. Load CSV
file_path = "data/mental_health_lite.csv"
df = pd.read_csv(file_path)

In [52]:
# 2. WhyLogs - Logging data awal

# 2. Pastikan kolom numerik bertipe numerik (jaga-jaga)
numeric_cols = ['age', 'stress_level', 'sleep_hours', 'physical_activity_days',
                'depression_score', 'anxiety_score', 'social_support_score', 'productivity_score']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

log(df).write(f"whylogs_profile_raw_fixed_{datetime.now().strftime('%Y%m%d_%H%M%S')}")

(True, 'whylogs_profile_raw_fixed_20250521_210448')

In [54]:
from whylogs.core.view import DatasetProfileView
from whylogs.core.metrics.metrics import DistributionMetric

profile_path = "whylogs_profile_raw_fixed_20250521_210448"  
profile_view = DatasetProfileView.read(path=profile_path)

columns = profile_view.get_columns()
for col in columns:
    col_view = profile_view.get_column(col)
    dist_metric = col_view.get_metric(DistributionMetric)
    if dist_metric:
        print(f"=== Kolom: {col} ===")
        print(f" Count : {dist_metric.kll.value.count}")
        print(f" Mean  : {dist_metric.kll.value.mean}")
        print(f" Stddev: {dist_metric.kll.value.stddev}")
        print(f" Min   : {dist_metric.kll.value.min}")
        print(f" Max   : {dist_metric.kll.value.max}")
        print()

In [26]:
# 3. Encode target
le_target = LabelEncoder()
df["mental_health_risk"] = le_target.fit_transform(df["mental_health_risk"])

In [27]:
# 4. Split fitur dan target
X = df.drop(columns=["mental_health_risk"])
y = df["mental_health_risk"]

In [28]:
# 5. Split data latih dan uji
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. WhyLogs - Logging data train
log(X_train).write(f"whylogs_profile_train_{datetime.now().strftime('%Y%m%d_%H%M%S')}")


(True, 'whylogs_profile_train_20250521_202624')

In [29]:
# 7. Encode fitur kategorikal
label_encoders = {}
for col in X_train.select_dtypes(include="object").columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])
    label_encoders[col] = le


In [30]:
# 8. Pipeline Random Forest
pipeline_rf = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(random_state=42))
])
pipeline_rf.fit(X_train, y_train)
y_pred_rf = pipeline_rf.predict(X_test)

In [31]:
# 9. Pipeline XGBoost
pipeline_xgb = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42))
])
pipeline_xgb.fit(X_train, y_train)
y_pred_xgb = pipeline_xgb.predict(X_test)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [17]:
# 10. WhyLogs - Logging prediksi
pred_df = pd.DataFrame({
    "y_test": y_test,
    "y_pred_rf": y_pred_rf,
    "y_pred_xgb": y_pred_xgb
})
log(pred_df).write(f"whylogs_profile_pred_{datetime.now().strftime('%Y%m%d_%H%M%S')}")


(True, 'whylogs_profile_pred_20250521_085300')

In [18]:
# 11. Evaluasi model
print("Random Forest Report:")
print(classification_report(y_test, y_pred_rf, target_names=le_target.classes_))

print("\nXGBoost Report:")
print(classification_report(y_test, y_pred_xgb, target_names=le_target.classes_))

Random Forest Report:
              precision    recall  f1-score   support

        High       0.98      0.87      0.92        54
         Low       0.96      0.72      0.82        32
      Medium       0.88      0.98      0.93       115

    accuracy                           0.91       201
   macro avg       0.94      0.86      0.89       201
weighted avg       0.92      0.91      0.91       201


XGBoost Report:
              precision    recall  f1-score   support

        High       1.00      0.96      0.98        54
         Low       0.97      0.88      0.92        32
      Medium       0.95      0.99      0.97       115

    accuracy                           0.97       201
   macro avg       0.97      0.94      0.96       201
weighted avg       0.97      0.97      0.96       201



In [19]:
# Hitung akurasi masing-masing model
acc_rf = accuracy_score(y_test, y_pred_rf)
acc_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"Akurasi Random Forest: {acc_rf:.4f}")
print(f"Akurasi XGBoost: {acc_xgb:.4f}")

# Pilih model dengan akurasi terbaik
if acc_rf >= acc_xgb:
    best_model = pipeline_rf
    best_model_name = "random_forest"
    print("Model terbaik: Random Forest")
else:
    best_model = pipeline_xgb
    best_model_name = "xgboost"
    print("Model terbaik: XGBoost")

Akurasi Random Forest: 0.9104
Akurasi XGBoost: 0.9652
Model terbaik: XGBoost


In [20]:
# Simpan hanya model terbaik
from skops.io import dump
dump(best_model, f"model/model_best_{best_model_name}.skops")

In [49]:
import whylogs as why
from whylogs.core.view import DatasetProfileView
from whylogs.core.metrics.metrics import DistributionMetric
import os

# Path ke file profile yang disimpan
profile_path = "whylogs_profile_raw_20250521_202613"  # ganti dengan nama filemu
profile_view = DatasetProfileView.read(path=profile_path)

# Ambil semua kolom
columns = profile_view.get_columns()

# Loop semua kolom untuk tampilkan metrik distribusi (numerik)
for col in columns:
    col_view = profile_view.get_column(col)

    # Ambil metric bertipe distribusi (jika tersedia)
    dist_metric = col_view.get_metric(DistributionMetric)
    if dist_metric is not None:
        print(f"=== Kolom: {col} ===")
        print(f"  Count     : {dist_metric.kll.value.count}")
        print(f"  Mean      : {dist_metric.kll.value.mean}")
        print(f"  Stddev    : {dist_metric.kll.value.stddev}")
        print(f"  Min       : {dist_metric.kll.value.min}")
        print(f"  Max       : {dist_metric.kll.value.max}")
        print()