In [2]:
import pandas as pd
import numpy as np
import pickle
from sqlalchemy import create_engine

# Scikit-learn untuk preprocessing dan modeling
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Model dan Explainability
import xgboost as xgb
import shap

print("Semua library berhasil diimpor.")

  from .autonotebook import tqdm as notebook_tqdm


Semua library berhasil diimpor.


In [None]:
# Ganti dengan kredensial database PostgreSQL Anda
# Sebaiknya gunakan environment variables untuk menyimpan informasi ini
DB_USER = "postgres"
DB_PASSWORD = "DBmiko"
DB_HOST = "localhost"
DB_PORT = "5432"
DB_NAME = "dbexam"

# Buat koneksi ke database
db_engine = create_engine(f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}')

# Query untuk mengambil dan menggabungkan data
# Query ini melakukan PIVOT pada tabel assessment
sql_query = """
SELECT
    e.enroll_id,
    s.stu_id,
    s.gender,
    d.dept_name,
    c.course_name,
    cd.difficulty_level,
    a.attendance_percentage,
    -- Pivoting assessment scores
    MAX(CASE WHEN ass.assessment_type = 'Midterm' THEN ass.score END) AS score_midterm,
    MAX(CASE WHEN ass.assessment_type = 'Final' THEN ass.score END) AS score_final,
    MAX(CASE WHEN ass.assessment_type = 'Project' THEN ass.score END) AS score_project,
    e.grade
FROM enrollment e
LEFT JOIN student s ON e.stu_id = s.stu_id
LEFT JOIN department d ON s.dept_id = d.dept_id
LEFT JOIN course c ON e.course_id = c.course_id
LEFT JOIN course_difficulty cd ON e.course_id = cd.course_id
LEFT JOIN attendance a ON e.enroll_id = a.enroll_id
LEFT JOIN assessment ass ON e.enroll_id = ass.enroll_id
GROUP BY e.enroll_id, s.stu_id, s.gender, d.dept_name, c.course_name, cd.difficulty_level, a.attendance_percentage
ORDER BY e.enroll_id;
"""

# Baca data ke dalam DataFrame
df = pd.read_sql_query(sql_query, db_engine)

print(f"Data berhasil dimuat. Jumlah baris: {len(df)}")
df.head()

Data berhasil dimuat. Jumlah baris: 2105


Unnamed: 0,enroll_id,stu_id,gender,dept_name,course_name,difficulty_level,attendance_percentage,score_midterm,score_final,score_project,grade
0,1,1,Female,Information Technology,Course 1,Easy,65,59,100,78,80
1,2,1,Female,Information Technology,Course 2,Medium,66,71,64,65,76
2,3,1,Female,Information Technology,Course 9,Medium,72,45,35,35,32
3,4,1,Female,Information Technology,Course 7,Hard,40,37,39,45,32
4,5,1,Female,Information Technology,Course 6,Medium,99,78,58,58,99


In [4]:
# Membuat target variable 'status' (1 jika Berisiko, 0 jika Aman)
# Kita definisikan "Beresiko" jika nilai < 55
df['status'] = df['grade'].apply(lambda x: 1 if x < 55 else 0)

# Menghapus kolom asli 'grade' untuk menghindari data leakage
# Juga hapus kolom identifier yang tidak relevan untuk model
df = df.drop(columns=['grade', 'enroll_id', 'stu_id'])

print("Kolom target 'status' berhasil dibuat.")
df.info()

Kolom target 'status' berhasil dibuat.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2105 entries, 0 to 2104
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   gender                 2105 non-null   object
 1   dept_name              2105 non-null   object
 2   course_name            2105 non-null   object
 3   difficulty_level       2105 non-null   object
 4   attendance_percentage  2105 non-null   int64 
 5   score_midterm          2105 non-null   int64 
 6   score_final            2105 non-null   int64 
 7   score_project          2105 non-null   int64 
 8   status                 2105 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 148.1+ KB


In [6]:
# Pisahkan fitur (X) dan target (y)
X = df.drop('status', axis=1)
y = df['status']

# Bagi data menjadi 80% latih dan 20% uji
# `stratify=y` penting untuk memastikan proporsi target sama di data latih dan uji
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Data latih: {X_train.shape}")
print(f"Data uji: {X_test.shape}")

Data latih: (1684, 8)
Data uji: (421, 8)


In [7]:
# Identifikasi nama kolom numerik dan kategorikal
numeric_features = ['attendance_percentage', 'score_midterm', 'score_final', 'score_project']
categorical_features = ['gender', 'dept_name', 'course_name', 'difficulty_level']

# Pipeline untuk memproses data numerik
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # Isi data kosong dengan median
    ('scaler', StandardScaler()) # Skala data
])

# Pipeline untuk memproses data kategorikal
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # Isi data kosong dengan modus
    ('onehot', OneHotEncoder(handle_unknown='ignore')) # Ubah ke One-Hot Encoding
])

# Gabungkan kedua pipeline dengan ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough' # Biarkan kolom lain (jika ada) tidak diubah
)

print("Pipeline pra-pemrosesan berhasil dibuat.")

Pipeline pra-pemrosesan berhasil dibuat.


In [8]:
# Definisikan model yang akan digunakan
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Buat pipeline lengkap (preprocessing -> model)
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Latih pipeline pada data latih
print("Memulai pelatihan model...")
full_pipeline.fit(X_train, y_train)
print("Pelatihan model selesai.")

Memulai pelatihan model...
Pelatihan model selesai.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [9]:
# Buat prediksi pada data uji
y_pred = full_pipeline.predict(X_test)
y_pred_proba = full_pipeline.predict_proba(X_test)[:, 1]

# Tampilkan laporan klasifikasi
print("Laporan Klasifikasi:")
print(classification_report(y_test, y_pred))

# Tampilkan metrik lainnya
print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")

Laporan Klasifikasi:
              precision    recall  f1-score   support

           0       0.90      0.98      0.94       324
           1       0.92      0.62      0.74        97

    accuracy                           0.90       421
   macro avg       0.91      0.80      0.84       421
weighted avg       0.90      0.90      0.89       421

Accuracy Score: 0.9002
ROC AUC Score: 0.8158


In [None]:
# Simpan seluruh pipeline ke dalam satu file .pkl
# Ini adalah praktik terbaik karena sudah mencakup preprocessor dan model
MODEL_PATH = '../../ml_models/risk_model_development/miko_student_risk_pipeline.pkl' # Sesuaikan path

with open(MODEL_PATH, 'wb') as f:
    pickle.dump(full_pipeline, f)

print(f"Pipeline lengkap telah disimpan di: {MODEL_PATH}")

Pipeline lengkap telah disimpan di: ../../ml_models/miko_student_risk_pipeline.pkl


In [14]:
# Ambil preprocessor dan model dari pipeline
preprocessor_fitted = full_pipeline.named_steps['preprocessor']
model_fitted = full_pipeline.named_steps['model']

# Dapatkan nama fitur setelah diproses (terutama setelah OneHotEncoding)
feature_names_processed = preprocessor_fitted.get_feature_names_out()

# Buat SHAP Explainer khusus untuk model tree-based (XGBoost)
explainer = shap.TreeExplainer(model_fitted)

# Simpan explainer dan nama fitur
EXPLAINER_PATH = '../../ml_models/risk_model_development/miko_shap_explainer.pkl'
FEATURES_PATH = '../../ml_models/risk_model_development/miko_feature_names.pkl'

with open(EXPLAINER_PATH, 'wb') as f:
    pickle.dump(explainer, f)

with open(FEATURES_PATH, 'wb') as f:
    pickle.dump(feature_names_processed, f)

print(f"SHAP Explainer telah disimpan di: {EXPLAINER_PATH}")
print(f"Nama Fitur telah disimpan di: {FEATURES_PATH}")

SHAP Explainer telah disimpan di: ../../ml_models/risk_model_development/miko_shap_explainer.pkl
Nama Fitur telah disimpan di: ../../ml_models/risk_model_development/miko_feature_names.pkl
