In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# LOAD DATASET
# Read the feature-engineered transaction table from Google Drive.
csv_path = "/content/drive/MyDrive/PBL SEM 5/Dataset ML/V2/DATASET/dataset_feature_engineered.csv"
df = pd.read_csv(csv_path)

print("\n===== Info Dataset =====")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()


===== Info Dataset =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 35 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   No_Reff                9000 non-null   int64  
 1   Nominal_Transaksi      9000 non-null   int64  
 2   Jenis_Transaksi        9000 non-null   object 
 3   Timestamp              9000 non-null   object 
 4   Nama_Pengirim          9000 non-null   object 
 5   Nama_Penerima          9000 non-null   object 
 6   Tanggal                9000 non-null   object 
 7   Bulan                  9000 non-null   int64  
 8   Hari                   9000 non-null   int64  
 9   Hari_Minggu            9000 non-null   int64  
 10  Minggu_Ke              9000 non-null   int64  
 11  Is_Weekend             9000 non-null   int64  
 12  Is_Akhir_Bulan         9000 non-null   int64  
 13  Is_Awal_Bulan          9000 non-null   int64  
 14  Quarter                9000 no

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

print("=" * 60, "PERSIAPAN DATA UNTUK MODELING", "=" * 60, sep="\n")

# --- 1. CHOOSE THE RELEVANT FEATURES ---
print("\n[1] Memilih fitur relevan...")

# Columns deliberately left out of the model inputs. The list is kept for
# reference; inputs are chosen through `feature_columns` below.
drop_columns = [
    'No_Reff',           # identifier, carries no signal
    'Timestamp',         # already split into temporal features
    'Tanggal',           # already split into temporal features
    'Nama_Pengirim',     # encoded separately
    'Nama_Penerima',     # always the same value (RW05)
    'Jenis_Transaksi',   # already one-hot encoded (Is_TopUp, Is_QRIS, Is_Transfer)
    'Quarter_Label',     # redundant with the numeric Quarter
    'Warning_Level',     # derived from Risk_Score, so an output rather than an input
    'Minggu_Ke'          # judged uninformative for payment prediction
]

# Model inputs, assembled group by group (the resulting order is significant).
feature_columns = (
    # Temporal features
    ['Bulan', 'Hari', 'Hari_Minggu', 'Quarter',
     'Is_Weekend', 'Is_Akhir_Bulan', 'Is_Awal_Bulan',
     'Hari_Dari_Awal_Bulan']
    # Behavioural features (per-resident history)
    + ['Total_Transaksi', 'Rata_Nominal', 'Frekuensi_Per_Hari',
       'Durasi_Aktif_Hari', 'Rata_Interval_Hari',
       'Jumlah_Terlambat', 'Persentase_Terlambat']
    # Transaction-type features
    + ['Is_TopUp', 'Is_QRIS', 'Is_Transfer',
       'Prop_TopUp', 'Prop_QRIS', 'Prop_Transfer']
    # Activity features
    + ['Aktivitas_Bulan_Ini', 'Aktivitas_Quarter_Ini']
    # Transaction amount
    + ['Nominal_Transaksi']
)

# One target per model: model 1 classifies, model 2 regresses.
target_classification, target_regression = 'Kategori_Pembayaran', 'Risk_Score'

print(f"Jumlah fitur input: {len(feature_columns)}")
print(f"Target Classification: {target_classification}")
print(f"Target Regression: {target_regression}")

# --- 2. BUILD THE MODELLING FRAME ---

# Work on a copy so the loaded frame `df` is left untouched.
df_modeling = df.copy()

# Map every sender name (resident) to an integer id.
le_warga = LabelEncoder()
le_warga.fit(df_modeling['Nama_Pengirim'])
df_modeling['Warga_ID'] = le_warga.transform(df_modeling['Nama_Pengirim'])

# The resident id becomes one more model input.
feature_columns += ['Warga_ID']

print("\n[2] Encoding kategorikal...")
print(f"Total unique warga: {df_modeling['Warga_ID'].nunique()}")

# --- 3. DATASET FOR THE CLASSIFICATION MODEL ---
print("\n[3] Persiapan data Classification...")

X_class = df_modeling.loc[:, feature_columns].copy()
y_class = df_modeling[target_classification].copy()

# Integer-encode the category labels.
le_target = LabelEncoder()
y_class_encoded = le_target.fit_transform(y_class)

# label -> encoded value, as produced by the fitted encoder
class_mapping = {
    label: code
    for label, code in zip(le_target.classes_, le_target.transform(le_target.classes_))
}
print(f"Class mapping: {class_mapping}")
print("Distribusi target:")
print(df_modeling[target_classification].value_counts())

# 80:20 split, stratified on the encoded target.
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class,
    y_class_encoded,
    stratify=y_class_encoded,
    random_state=42,
    test_size=0.2,
)

print()
for nama_bagian, bagian in (("Train", X_train_class), ("Test", X_test_class)):
    print(f"{nama_bagian} size: {len(bagian)} ({len(bagian)/len(X_class)*100:.1f}%)")
for nama_bagian, y_bagian in (("train", y_train_class), ("test", y_test_class)):
    print(f"Distribusi {nama_bagian}:\n{pd.Series(y_bagian).value_counts()}")

# --- 4. DATASET FOR THE REGRESSION MODEL ---
print("\n[4] Persiapan data Regression...")

X_reg = df_modeling.loc[:, feature_columns].copy()
y_reg = df_modeling[target_regression].copy()

print("Statistik Risk_Score:")
print(y_reg.describe())

# 80:20 split (no stratification for the continuous target).
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    random_state=42,
    test_size=0.2,
)

print()
for nama_bagian, bagian in (("Train", X_train_reg), ("Test", X_test_reg)):
    print(f"{nama_bagian} size: {len(bagian)} ({len(bagian)/len(X_reg)*100:.1f}%)")

# --- 5. SAVE THE MODELLING DATA ---
print(f"\n[5] Menyimpan dataset...")

# Resident name -> id table, kept for inference later on.
warga_mapping = pd.DataFrame({
    'Nama_Pengirim': le_warga.classes_,
    'Warga_ID': range(len(le_warga.classes_))
})

# Everything is written as CSV into this Drive folder.
base_path = '/content/drive/MyDrive/PBL SEM 5/Dataset ML/V2/DATASET/'

# Classification dataset (the targets are NumPy arrays here, so the DataFrame
# constructor simply labels the single column 'target').
X_train_class.to_csv(base_path + 'X_train_classification.csv', index=False)
X_test_class.to_csv(base_path + 'X_test_classification.csv', index=False)
pd.DataFrame(y_train_class, columns=['target']).to_csv(base_path + 'y_train_classification.csv', index=False)
pd.DataFrame(y_test_class, columns=['target']).to_csv(base_path + 'y_test_classification.csv', index=False)

# Regression dataset.
# The regression targets are pandas Series named 'Risk_Score'. Passing such a
# Series to pd.DataFrame(..., columns=['target']) *selects* a column called
# 'target' (which does not exist) instead of renaming, so the saved file would
# lose every value. to_frame(name=...) renames and keeps the data.
X_train_reg.to_csv(base_path + 'X_train_regression.csv', index=False)
X_test_reg.to_csv(base_path + 'X_test_regression.csv', index=False)
y_train_reg.to_frame(name='target').to_csv(base_path + 'y_train_regression.csv', index=False)
y_test_reg.to_frame(name='target').to_csv(base_path + 'y_test_regression.csv', index=False)

# Mapping files
warga_mapping.to_csv(base_path + 'warga_mapping.csv', index=False)
pd.DataFrame({
    'class': le_target.classes_,
    'encoded': le_target.transform(le_target.classes_)
}).to_csv(base_path + 'class_mapping.csv', index=False)

print("\n‚úì File disimpan:")
print(f"  - X_train_classification.csv ({X_train_class.shape})")
print(f"  - X_test_classification.csv ({X_test_class.shape})")
print(f"  - y_train_classification.csv")
print(f"  - y_test_classification.csv")
print(f"  - X_train_regression.csv ({X_train_reg.shape})")
print(f"  - X_test_regression.csv ({X_test_reg.shape})")
print(f"  - y_train_regression.csv")
print(f"  - y_test_regression.csv")
print(f"  - warga_mapping.csv")
print(f"  - class_mapping.csv")

# --- 6. FINAL SUMMARY ---
print("\n" + "="*60)
print("RINGKASAN DATA TRAINING")
print("="*60)

print(f"\nFITUR INPUT ({len(feature_columns)} fitur):")
for i, feat in enumerate(feature_columns, 1):
    print(f"  {i:2d}. {feat}")

print(f"\nMODEL 1: CLASSIFICATION")
print(f"  Target: {target_classification}")
print(f"  Classes: {list(class_mapping.keys())}")
print(f"  Train: {X_train_class.shape}, Test: {X_test_class.shape}")

print(f"\nMODEL 2: REGRESSION")
print(f"  Target: {target_regression} (0-100)")
print(f"  Train: {X_train_reg.shape}, Test: {X_test_reg.shape}")

print(f"\nSemua file tersimpan di:")
print(f"  {base_path}")

# --- 7. SAMPLE DATA CHECK ---
print("\nSample X_train_classification (5 baris):")
print(X_train_class.head())

print("\nSample y_train_classification (10 baris):")
print(pd.DataFrame(y_train_class[:10], columns=['target']))

# Legend for the encoded target. Iterating over the fitted encoder's classes
# keeps the legend correct for any number of categories, where fixed
# inverse_transform([0]) / ([1]) / ([2]) calls assume exactly three.
print()
for kode, label in enumerate(le_target.classes_):
    print(f"  {kode} = {label}")

print("\nData siap untuk training!")

PERSIAPAN DATA UNTUK MODELING

[1] Memilih fitur relevan...
Jumlah fitur input: 24
Target Classification: Kategori_Pembayaran
Target Regression: Risk_Score

[2] Encoding kategorikal...
Total unique warga: 156

[3] Persiapan data Classification...
Class mapping: {'Mendekati_Deadline': np.int64(0), 'Tepat_Waktu': np.int64(1), 'Terlambat': np.int64(2)}
Distribusi target:
Kategori_Pembayaran
Terlambat             6333
Tepat_Waktu           1845
Mendekati_Deadline     822
Name: count, dtype: int64

Train size: 7200 (80.0%)
Test size: 1800 (20.0%)
Distribusi train:
2    5066
1    1476
0     658
Name: count, dtype: int64
Distribusi test:
2    1267
1     369
0     164
Name: count, dtype: int64

[4] Persiapan data Regression...
Statistik Risk_Score:
count    9000.000000
mean       54.124444
std        16.074721
min        21.395349
25%        38.484848
50%        58.732394
75%        65.925926
max        85.000000
Name: Risk_Score, dtype: float64

Train size: 7200 (80.0%)
Test size: 1800 (20.0%

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.metrics import (f1_score, roc_auc_score, precision_score,
                             recall_score, confusion_matrix, accuracy_score,
                             classification_report)
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import joblib
import os
from datetime import datetime


# ===================================================================
# TRAINING BASE MODELS
# ===================================================================
print("\n" + "="*70)
print("ü§ñ TRAINING BASE MODELS")
print("="*70)

# Imported here so this cell does not rely on an earlier cell's imports.
from sklearn.model_selection import train_test_split

# Number of target classes, read from the labels instead of a literal 3.
n_classes = int(len(np.unique(y_train_class)))

# Early stopping needs a monitoring set. Monitoring on the test split lets the
# selected number of boosting rounds adapt to the test data, which makes every
# test metric reported afterwards optimistic. A stratified slice of the
# training data is therefore held out for monitoring, and the test split is
# only used for the final scores.
X_fit_class, X_val_class, y_fit_class, y_val_class = train_test_split(
    X_train_class, y_train_class,
    test_size=0.2,
    random_state=42,
    stratify=y_train_class
)


# -------------------------------------------------------------
# 1. XGBoost
# -------------------------------------------------------------
print("\n1Ô∏è‚É£ Training XGBoost...")

xgb_model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',  # per-class probabilities
    eval_metric='mlogloss',      # multiclass log loss drives early stopping
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=30,
    num_class=n_classes,
)

xgb_model.fit(
    X_fit_class, y_fit_class,
    eval_set=[(X_val_class, y_val_class)],
    verbose=False  # keep the per-iteration log out of the cell output
)

# Class probabilities on the full training split (consumed by the meta-learner
# section) and on the test split, plus hard labels for the test split.
xgb_pred_proba_train = xgb_model.predict_proba(X_train_class)
xgb_pred_proba_test  = xgb_model.predict_proba(X_test_class)
xgb_pred_labels_test = xgb_model.predict(X_test_class)

xgb_f1  = f1_score(y_test_class, xgb_pred_labels_test, average='weighted')
xgb_auc = roc_auc_score(y_test_class, xgb_pred_proba_test, multi_class='ovr', average='weighted')

print(f"   ‚úÖ Best iteration: {xgb_model.best_iteration}")
print(f"   üìä F1-Score (weighted): {xgb_f1:.4f} | ROC-AUC (OvR weighted): {xgb_auc:.4f}")


# -------------------------------------------------------------
# 2. CatBoost
# -------------------------------------------------------------
print("\n2Ô∏è‚É£ Training CatBoost...")

cat_model = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.05,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    random_state=42,
    verbose=False,
    early_stopping_rounds=30,
    classes_count=n_classes
)

cat_model.fit(X_fit_class, y_fit_class, eval_set=(X_val_class, y_val_class))

cat_pred_proba_train = cat_model.predict_proba(X_train_class)
cat_pred_proba_test  = cat_model.predict_proba(X_test_class)
cat_pred_labels_test = cat_model.predict(X_test_class)

cat_f1  = f1_score(y_test_class, cat_pred_labels_test, average='weighted')
cat_auc = roc_auc_score(y_test_class, cat_pred_proba_test, multi_class='ovr', average='weighted')

print(f"   ‚úÖ Best iteration: {cat_model.best_iteration_}")
print(f"   üìä F1-Score (weighted): {cat_f1:.4f} | ROC-AUC (OvR weighted): {cat_auc:.4f}")


# -------------------------------------------------------------
# 3. LightGBM
# -------------------------------------------------------------
print("\n3Ô∏è‚É£ Training LightGBM...")

lgb_model = lgb.LGBMClassifier(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    subsample_freq=1,  # LightGBM ignores `subsample` unless bagging frequency is non-zero
    colsample_bytree=0.8,
    objective='multiclass',
    num_class=n_classes,
    metric='multi_logloss',
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

lgb_model.fit(
    X_fit_class, y_fit_class,
    eval_set=[(X_val_class, y_val_class)],
    callbacks=[lgb.early_stopping(stopping_rounds=30)]
)

lgb_pred_proba_train = lgb_model.predict_proba(X_train_class)
lgb_pred_proba_test  = lgb_model.predict_proba(X_test_class)
lgb_pred_labels_test = lgb_model.predict(X_test_class)

lgb_f1  = f1_score(y_test_class, lgb_pred_labels_test, average='weighted')
lgb_auc = roc_auc_score(y_test_class, lgb_pred_proba_test, multi_class='ovr', average='weighted')

print(f"   ‚úÖ Best iteration: {lgb_model.best_iteration_}")
print(f"   üìä F1-Score (weighted): {lgb_f1:.4f} | ROC-AUC (OvR weighted): {lgb_auc:.4f}")


# ===================================================================
# META-LEARNER (ENSEMBLE)
# ===================================================================
print("\n" + "="*70)
print("üéØ TRAINING META-LEARNER (ENSEMBLE)")
print("="*70)

# Meta-features: the three base models' class-probability blocks placed side
# by side, in the order XGBoost, CatBoost, LightGBM.
# NOTE(review): meta_train is built from base-model predictions on
# X_train_class, i.e. data used to train those base models; out-of-fold
# predictions would give the meta-learner a less optimistic training signal.
meta_train = np.hstack((xgb_pred_proba_train, cat_pred_proba_train, lgb_pred_proba_train))
meta_test = np.hstack((xgb_pred_proba_test, cat_pred_proba_test, lgb_pred_proba_test))

meta_model = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    n_estimators=200,
    max_depth=10,
    min_samples_leaf=5,
    min_samples_split=10,
)
meta_model.fit(meta_train, y_train_class)

# Ensemble output on the test split: hard labels and per-class probabilities.
meta_pred = meta_model.predict(meta_test)
meta_pred_proba = meta_model.predict_proba(meta_test)


# ===================================================================
# FINAL EVALUATION
# ===================================================================
# Multiclass metrics for the stacked ensemble on the test split. Averages are
# weighted by class support; ROC-AUC is computed one-vs-rest.
accuracy  = accuracy_score(y_test_class, meta_pred)
f1        = f1_score(y_test_class, meta_pred, average='weighted')
precision = precision_score(y_test_class, meta_pred, average='weighted')
recall    = recall_score(y_test_class, meta_pred, average='weighted')
roc_auc   = roc_auc_score(y_test_class, meta_pred_proba, multi_class='ovr', average='weighted')

print("\n" + "="*70)
print("üìä FINAL EVALUATION (ENSEMBLE)")
print("="*70)
print(f"Accuracy:  {accuracy:.4f}")
print(f"F1-Score (weighted):  {f1:.4f}")
print(f"Precision (weighted): {precision:.4f}")
print(f"Recall (weighted):    {recall:.4f}")
print(f"ROC-AUC (OvR weighted):   {roc_auc:.4f}")

print("\nüìã Classification Report:")
# Take the display names from the fitted target encoder rather than a
# hand-copied dict, so the report labels cannot drift from the real encoding.
# Position i in le_target.classes_ is the label that was encoded as i.
target_names_map = {int(kode): str(label) for kode, label in enumerate(le_target.classes_)}
sorted_target_names = [target_names_map[i] for i in sorted(target_names_map.keys())]

print(classification_report(
    y_test_class, meta_pred,
    # Explicit labels keep names and rows aligned even if some class is
    # absent from both y_test_class and meta_pred.
    labels=sorted(target_names_map.keys()),
    target_names=sorted_target_names
))


# ===================================================================
# FEATURE IMPORTANCE
# ===================================================================
print("\n" + "="*70)
print("üìä TOP 10 FEATURE IMPORTANCE (XGBoost)")
print("="*70)

# One row per input column with the XGBoost importance, largest first.
fi = (
    pd.Series(xgb_model.feature_importances_, index=X_train_class.columns, name='Importance')
    .rename_axis('Feature')
    .reset_index()
    .sort_values('Importance', ascending=False)
)

top_10 = fi.head(10)
print(top_10.to_string(index=False))


# ===================================================================
# SAVE MODELS FOR INFERENCE (NO TIMESTAMP VERSION)
# ===================================================================
import json

print("\n" + "="*70)
print("üíæ SAVING MODELS FOR INFERENCE")
print("="*70)

# The inference cell of this notebook loads its artifacts from the
# "Classification" sub-folder of models_ews, so they are written there;
# saving one level up would leave the loader without its files.
model_dir = "/content/drive/MyDrive/PBL SEM 5/Dataset ML/models_ews/Classification"
os.makedirs(model_dir, exist_ok=True)

# ------------------------------
# 1. Save Base Models
# ------------------------------
joblib.dump(xgb_model, f"{model_dir}/xgb_classifier.pkl")
joblib.dump(cat_model, f"{model_dir}/cat_classifier.pkl")
joblib.dump(lgb_model, f"{model_dir}/lgb_classifier.pkl")

# ------------------------------
# 2. Save Meta Learner (Stacking Model)
# ------------------------------
joblib.dump(meta_model, f"{model_dir}/meta_classifier.pkl")

# ------------------------------
# 3. Save Label Encoder
# ------------------------------
joblib.dump(le_target, f"{model_dir}/encoder_target.pkl")

# ------------------------------
# 4. Save Model Info
# ------------------------------

# json cannot serialise NumPy scalars, so labels and codes are converted to
# plain str / int first.
class_mapping_clean = {
    str(k): int(v) for k, v in zip(le_target.classes_, le_target.transform(le_target.classes_))
}

model_info = {
    "model_version": "1.0",
    "num_classes": int(len(np.unique(y_train_class))),   # plain int, not a NumPy type
    "feature_columns": list(X_train_class.columns),      # input order expected at inference
    "class_mapping": class_mapping_clean,
    "base_models": ["xgb_classifier", "cat_classifier", "lgb_classifier"],
    "meta_model": "meta_classifier"
}

with open(f"{model_dir}/model_info.json", "w") as f:
    json.dump(model_info, f, indent=4)

print("\n‚úÖ Model artifacts saved:")
print("   üìÅ xgb_classifier.pkl")
print("   üìÅ cat_classifier.pkl")
print("   üìÅ lgb_classifier.pkl")
print("   üìÅ meta_classifier.pkl")
print("   üìÅ encoder_target.pkl")
print("   üìÅ model_info.json")

print("\nüìå Saved at:", model_dir)


ü§ñ TRAINING BASE MODELS

1Ô∏è‚É£ Training XGBoost...
   ‚úÖ Best iteration: 360
   üìä F1-Score (weighted): 1.0000 | ROC-AUC (OvR weighted): 1.0000

2Ô∏è‚É£ Training CatBoost...
   ‚úÖ Best iteration: 499
   üìä F1-Score (weighted): 1.0000 | ROC-AUC (OvR weighted): 1.0000

3Ô∏è‚É£ Training LightGBM...
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[239]	valid_0's multi_logloss: 2.44068e-07
   ‚úÖ Best iteration: 239
   üìä F1-Score (weighted): 1.0000 | ROC-AUC (OvR weighted): 1.0000

üéØ TRAINING META-LEARNER (ENSEMBLE)

üìä FINAL EVALUATION (ENSEMBLE)
Accuracy:  1.0000
F1-Score (weighted):  1.0000
Precision (weighted): 1.0000
Recall (weighted):    1.0000
ROC-AUC (OvR weighted):   1.0000

üìã Classification Report:
                    precision    recall  f1-score   support

Mendekati_Deadline       1.00      1.00      1.00       164
       Tepat_Waktu       1.00      1.00      1.00       369
         Terlambat       1.00      1

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import joblib
import warnings
warnings.filterwarnings('ignore')

# =====================================================
# 1. LOAD MODELS & METADATA
# =====================================================

import json

model_dir = '/content/drive/MyDrive/PBL SEM 5/Dataset ML/models_ews/Classification'

# Three base classifiers, the stacking meta-classifier and the target
# (Kategori_Pembayaran) encoder, loaded in that order.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# point this at artifacts you produced yourself.
xgb_model, cat_model, lgb_model, meta_model, encoder_target = (
    joblib.load(f"{model_dir}/{stem}.pkl")
    for stem in (
        "xgb_classifier",
        "cat_classifier",
        "lgb_classifier",
        "meta_classifier",
        "encoder_target",
    )
)

# model_info.json carries the feature column list along with other metadata.
with open(f"{model_dir}/model_info.json", 'r') as f:
    model_info = json.load(f)
feature_columns = model_info['feature_columns']

print("‚úÖ Models & metadata loaded!")
print(f"   - Total features: {len(feature_columns)}")
print(f"   - Target classes: {encoder_target.classes_}")

# =====================================================
# 2. LOAD HISTORY & MAPPINGS
# =====================================================

base_path = '/content/drive/MyDrive/PBL SEM 5/Dataset ML/V2/DATASET/'

# Historical transactions; Timestamp is parsed so the .dt accessors work later.
df_history = pd.read_csv(base_path + 'dataset_feature_engineered.csv')
df_history['Timestamp'] = pd.to_datetime(df_history['Timestamp'])

# Resident name -> Warga_ID table (read from disk when available).
try:
    warga_mapping = pd.read_csv(base_path + 'warga_mapping.csv')
    print("‚úÖ Warga mapping loaded!")
except FileNotFoundError:
    print("‚ö†Ô∏è warga_mapping.csv tidak ditemukan. Membuat dari data histori...")
    # Training assigned Warga_ID with a LabelEncoder, which numbers the names
    # in sorted order. The fallback must number them the same way; numbering
    # in order of first appearance would hand the models different ids than
    # the ones they were trained on.
    unique_warga = sorted(df_history['Nama_Pengirim'].unique())
    warga_mapping = pd.DataFrame({
        'Nama_Pengirim': unique_warga,
        'Warga_ID': range(len(unique_warga))
    })
    warga_mapping.to_csv(base_path + 'warga_mapping.csv', index=False)
    print(f"‚úÖ Warga mapping dibuat! Total: {len(unique_warga)} warga")

# Lookup dict used when building inference features.
warga_id_map = dict(zip(warga_mapping['Nama_Pengirim'], warga_mapping['Warga_ID']))

print("‚úÖ Historical data & mappings loaded!\n")

# =====================================================
# 3. FEATURE ENGINEERING (SESUAI TRAINING)
# =====================================================

def create_features_for_inference(tanggal_transaksi, nominal_transaksi, jenis_transaksi, nama_warga):
    """
    Build the one-row feature frame used for inference on a single transaction.

    Reads the module-level ``df_history`` (transaction history),
    ``warga_id_map`` (resident name -> id) and ``feature_columns``.

    Parameters
    ----------
    tanggal_transaksi : datetime or str
        Transaction date; a string is parsed with the '%Y-%m-%d' format.
    nominal_transaksi : number
        Transaction amount. Also used as ``Rata_Nominal`` when the resident
        has no history.
    jenis_transaksi : str
        Compared against "TopUp", "QRIS" and "Transfer" for the one-hot flags.
    nama_warga : str
        Matched against ``df_history['Nama_Pengirim']`` and ``warga_id_map``;
        a name missing from the map gets ``Warga_ID`` -1.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are ``feature_columns``, in that order.
    """
    if isinstance(tanggal_transaksi, str):
        tanggal_transaksi = datetime.strptime(tanggal_transaksi, '%Y-%m-%d')

    # --- Calendar parts of the transaction date ---
    bulan, hari = tanggal_transaksi.month, tanggal_transaksi.day
    hari_minggu = tanggal_transaksi.weekday()  # Monday == 0
    quarter = (bulan - 1) // 3 + 1

    # --- History of this resident, oldest first ---
    riwayat = df_history[df_history['Nama_Pengirim'] == nama_warga].sort_values('Timestamp')
    n_riwayat = len(riwayat)

    if n_riwayat == 0:
        # Resident without any history: fixed fallback values.
        perilaku = {
            'Total_Transaksi': 0,
            'Rata_Nominal': nominal_transaksi,
            'Frekuensi_Per_Hari': 0,
            'Durasi_Aktif_Hari': 0,
            'Rata_Interval_Hari': 0,
            'Jumlah_Terlambat': 0,
            'Persentase_Terlambat': 0.3,
            'Prop_TopUp': 0.33,
            'Prop_QRIS': 0.33,
            'Prop_Transfer': 0.33,
        }
    else:
        rata_nominal = riwayat['Nominal_Transaksi'].mean()

        # Days between the first recorded transaction and this one.
        durasi_aktif_hari = (tanggal_transaksi - riwayat['Timestamp'].min()).days
        frekuensi_per_hari = n_riwayat / durasi_aktif_hari if durasi_aktif_hari > 0 else 0

        # Mean gap in days between consecutive transactions.
        rata_interval_hari = 0
        if n_riwayat > 1:
            rata_interval_hari = riwayat['Timestamp'].diff().dt.days.dropna().mean()

        # A historical row counts as late when its Risk_Score exceeds 50.
        # NOTE(review): presumably mirrors how Jumlah_Terlambat was derived
        # for training - confirm against the feature-engineering step.
        jumlah_terlambat = 0
        persentase_terlambat = 0
        if 'Risk_Score' in riwayat.columns:
            jumlah_terlambat = (riwayat['Risk_Score'] > 50).sum()
            persentase_terlambat = jumlah_terlambat / n_riwayat

        # Share of each transaction type in the history.
        if 'Jenis_Transaksi' in riwayat.columns:
            jenis = riwayat['Jenis_Transaksi']
            prop_topup = (jenis == 'TopUp').sum() / n_riwayat
            prop_qris = (jenis == 'QRIS').sum() / n_riwayat
            prop_transfer = (jenis == 'Transfer').sum() / n_riwayat
        else:
            prop_topup = prop_qris = prop_transfer = 0.33

        perilaku = {
            'Total_Transaksi': n_riwayat,
            'Rata_Nominal': rata_nominal,
            'Frekuensi_Per_Hari': frekuensi_per_hari,
            'Durasi_Aktif_Hari': durasi_aktif_hari,
            'Rata_Interval_Hari': rata_interval_hari,
            'Jumlah_Terlambat': jumlah_terlambat,
            'Persentase_Terlambat': persentase_terlambat,
            'Prop_TopUp': prop_topup,
            'Prop_QRIS': prop_qris,
            'Prop_Transfer': prop_transfer,
        }

    # --- Activity: history rows sharing the transaction's month / quarter ---
    # The comparison uses month and quarter numbers only; the year is not
    # part of the filter.
    waktu = riwayat['Timestamp'].dt
    aktivitas_bulan_ini = len(riwayat[waktu.month == bulan])
    aktivitas_quarter_ini = len(riwayat[waktu.quarter == quarter])

    baris = {
        'Bulan': bulan,
        'Hari': hari,
        'Hari_Minggu': hari_minggu,
        'Quarter': quarter,
        'Is_Weekend': int(hari_minggu >= 5),
        'Is_Akhir_Bulan': int(hari >= 25),
        'Is_Awal_Bulan': int(hari <= 5),
        'Hari_Dari_Awal_Bulan': hari,
        'Is_TopUp': int(jenis_transaksi == "TopUp"),
        'Is_QRIS': int(jenis_transaksi == "QRIS"),
        'Is_Transfer': int(jenis_transaksi == "Transfer"),
        'Aktivitas_Bulan_Ini': aktivitas_bulan_ini,
        'Aktivitas_Quarter_Ini': aktivitas_quarter_ini,
        'Nominal_Transaksi': nominal_transaksi,
        'Warga_ID': warga_id_map.get(nama_warga, -1),
    }
    baris.update(perilaku)

    # Column selection enforces the column order stored in feature_columns.
    return pd.DataFrame([baris])[feature_columns]

# =====================================================
# 4. PREDICT
# =====================================================

def predict_risk(X_input):
    """
    Classify one feature row with the stacking ensemble and derive a risk score.

    Uses the module-level ``xgb_model``, ``cat_model``, ``lgb_model``,
    ``meta_model`` and ``encoder_target``.

    Parameters
    ----------
    X_input : pandas.DataFrame
        Feature row(s) accepted by the base models; only the first row's
        prediction is used for the returned values.

    Returns
    -------
    tuple
        ``(risk_score, kategori, confidence)`` where ``kategori`` is the
        decoded class label, ``confidence`` is the highest meta-model class
        probability in percent, and ``risk_score`` falls in a band chosen by
        the category: 0-30 for on-time, 30-60 for near-deadline, 60-100
        otherwise.
    """
    # Base-model class probabilities, stacked side by side in the same order
    # the meta-model was trained on (XGBoost, CatBoost, LightGBM).
    xgb_prob = xgb_model.predict_proba(X_input)
    cat_prob = cat_model.predict_proba(X_input)
    lgb_prob = lgb_model.predict_proba(X_input)

    meta_feats = np.hstack([xgb_prob, cat_prob, lgb_prob])

    final_pred = meta_model.predict(meta_feats)[0]
    final_prob = meta_model.predict_proba(meta_feats)

    kategori = encoder_target.inverse_transform([final_pred])[0]
    confidence = round(final_prob.max() * 100, 2)
    prob_pred = final_prob[0, final_pred]

    # The target encoder's labels use underscores ('Tepat_Waktu',
    # 'Mendekati_Deadline', 'Terlambat'). Comparing against "Tepat Waktu" /
    # "Agak Telat" never matched, so every prediction landed in the 60-100
    # band. The old spellings are still accepted in case an older encoder
    # file is loaded.
    if kategori in ("Tepat_Waktu", "Tepat Waktu"):
        risk_score = round((1 - prob_pred) * 30, 2)
    elif kategori in ("Mendekati_Deadline", "Agak Telat"):
        risk_score = round(30 + prob_pred * 30, 2)
    else:
        risk_score = round(60 + prob_pred * 40, 2)

    return risk_score, kategori, confidence

# =====================================================
# 5. ANALISIS RISIKO AGREGAT (MAIN FEATURE)
# =====================================================

def _kelompok_kategori(label):
    """Map a model label to its report bucket name, or None if unrecognised."""
    key = str(label).replace("_", " ").strip().lower()
    if key == "tepat waktu":
        return "Tepat Waktu"
    if key in ("agak telat", "mendekati deadline"):
        return "Agak Telat"
    if key in ("sangat telat", "terlambat"):
        return "Sangat Telat"
    return None


def analisis_risiko_tagihan():
    """
    Interactive, aggregate late-payment risk analysis for a single bill.

    Prompts for the bill date, the per-resident amount and the transaction
    type, then scores every row of ``warga_mapping`` through
    ``create_features_for_inference`` and ``predict_risk``. Only aggregate
    statistics are printed; resident identities are never shown.

    Residents whose scoring raises an exception are skipped, and the number
    skipped is reported (by exception type only, to avoid leaking names).
    If no resident could be scored, a message is printed and a result with
    ``total_warga == 0`` is returned instead of crashing.

    Returns
    -------
    dict
        Keys ``tanggal``, ``nominal``, ``jenis_transaksi``, ``total_warga``,
        ``avg_risk``, ``tingkat_risiko`` and ``distribusi`` (counts for
        ``tepat_waktu``, ``agak_telat``, ``sangat_telat``). When nothing was
        scored, ``avg_risk`` is NaN and ``tingkat_risiko`` is None.
    """
    print("\n" + "="*70)
    print(" " * 15 + "ANALISIS RISIKO KETERLAMBATAN PEMBAYARAN")
    print("="*70)

    # --- Bill date: 'today' or an explicit YYYY-MM-DD ---
    while True:
        tanggal_input = input("\nüìÖ Tanggal Tagihan (YYYY-MM-DD) atau 'today': ").strip()

        if tanggal_input.lower() == 'today':
            tanggal_transaksi = datetime.now()
            break
        try:
            tanggal_transaksi = datetime.strptime(tanggal_input, '%Y-%m-%d')
            break
        except ValueError:
            print("‚ùå Format salah! Gunakan YYYY-MM-DD (contoh: 2025-01-15)")

    # --- Amount per resident: must be a positive number ---
    while True:
        try:
            nominal = float(input("üí∞ Nominal Iuran per Warga (Rp): "))
        except ValueError:
            print("‚ùå Input tidak valid! Masukkan angka.")
            continue
        if nominal > 0:
            break
        print("‚ùå Nominal harus lebih dari 0!")

    # --- Transaction type ---
    print("\nüì± Jenis Transaksi:")
    print("   1. TopUp")
    print("   2. QRIS")
    print("   3. Transfer")

    pilihan_jenis = ["TopUp", "QRIS", "Transfer"]
    while True:
        try:
            jenis_idx = int(input("   Pilih (1/2/3): "))
        except ValueError:
            print("‚ùå Input tidak valid!")
            continue
        if 1 <= jenis_idx <= len(pilihan_jenis):
            jenis_transaksi = pilihan_jenis[jenis_idx - 1]
            break
        print("‚ùå Pilih antara 1, 2, atau 3!")

    # --- Score every resident ---
    print(f"\n‚è≥ Menganalisis data {len(warga_mapping)} warga...")

    risk_scores = []
    kategori_list = []
    jenis_error = []  # exception type names only; no resident identity is kept

    for _, row in warga_mapping.iterrows():
        nama_warga = row['Nama_Pengirim']

        try:
            X = create_features_for_inference(
                tanggal_transaksi,
                nominal,
                jenis_transaksi,
                nama_warga
            )
            score, kategori, _confidence = predict_risk(X)
        except Exception as e:
            # Best effort: one bad resident must not abort the whole analysis,
            # but the failure is recorded so it can be reported below.
            jenis_error.append(type(e).__name__)
            continue

        risk_scores.append(score)
        kategori_list.append(kategori)

    if jenis_error:
        ringkasan_error = ", ".join(sorted(set(jenis_error)))
        print(f"‚ö†Ô∏è {len(jenis_error)} warga gagal dianalisis dan dilewati ({ringkasan_error})")

    risk_scores = np.array(risk_scores)
    total_warga = len(risk_scores)

    # Nothing could be scored: statistics and percentages are undefined.
    if total_warga == 0:
        print("\n‚ùå Tidak ada warga yang berhasil dianalisis. Periksa data historis dan model.")
        return {
            'tanggal': tanggal_transaksi,
            'nominal': nominal,
            'jenis_transaksi': jenis_transaksi,
            'total_warga': 0,
            'avg_risk': float('nan'),
            'tingkat_risiko': None,
            'distribusi': {
                'tepat_waktu': 0,
                'agak_telat': 0,
                'sangat_telat': 0
            }
        }

    # --- Aggregate statistics ---
    avg_risk = np.mean(risk_scores)
    median_risk = np.median(risk_scores)
    std_risk = np.std(risk_scores)

    # Count per report bucket. Labels are normalised first because the model
    # emits names such as 'Tepat_Waktu' / 'Mendekati_Deadline' / 'Terlambat',
    # which never equal the spaced display names used in this report.
    # Unrecognised labels map to None and are dropped by value_counts().
    kategori_counts = pd.Series(
        [_kelompok_kategori(k) for k in kategori_list], dtype="object"
    ).value_counts()

    tepat_waktu = int(kategori_counts.get("Tepat Waktu", 0))
    agak_telat = int(kategori_counts.get("Agak Telat", 0))
    sangat_telat = int(kategori_counts.get("Sangat Telat", 0))

    pct_tepat = (tepat_waktu / total_warga) * 100
    pct_agak = (agak_telat / total_warga) * 100
    pct_sangat = (sangat_telat / total_warga) * 100

    # Overall level uses the same 30 / 60 cut-offs as the per-class bands.
    if avg_risk < 30:
        tingkat_risiko = "RENDAH"
        emoji_risiko = "üü¢"
    elif avg_risk < 60:
        tingkat_risiko = "SEDANG"
        emoji_risiko = "üü°"
    else:
        tingkat_risiko = "TINGGI"
        emoji_risiko = "üî¥"

    # --- Report ---
    print("\n" + "="*70)
    print(" " * 25 + "HASIL ANALISIS")
    print("="*70)

    print("\nüìã INFORMASI TAGIHAN:")
    print(f"   Tanggal         : {tanggal_transaksi.strftime('%d %B %Y')}")
    print(f"   Nominal         : Rp {nominal:,.0f}")
    print(f"   Jenis Transaksi : {jenis_transaksi}")
    print(f"   Total Warga     : {total_warga} orang")

    print(f"\nüéØ TINGKAT RISIKO KETERLAMBATAN: {emoji_risiko} {tingkat_risiko}")
    print(f"   Skor Risiko Rata-rata: {avg_risk:.1f}%")

    print("\nüìä STATISTIK RISIKO:")
    print(f"   ‚îú‚îÄ Rata-rata  : {avg_risk:.2f}%")
    print(f"   ‚îú‚îÄ Median     : {median_risk:.2f}%")
    print(f"   ‚îú‚îÄ Std Dev    : {std_risk:.2f}%")
    print(f"   ‚îú‚îÄ Minimum    : {risk_scores.min():.2f}%")
    print(f"   ‚îî‚îÄ Maximum    : {risk_scores.max():.2f}%")

    print("\nüìà DISTRIBUSI KATEGORI:")
    print(f"   üü¢ Tepat Waktu   : {tepat_waktu:3d} warga ({pct_tepat:5.1f}%)")
    print(f"   üü° Agak Telat    : {agak_telat:3d} warga ({pct_agak:5.1f}%)")
    print(f"   üî¥ Sangat Telat  : {sangat_telat:3d} warga ({pct_sangat:5.1f}%)")

    # Residents in either late bucket, and the amount that represents.
    potensi_terlambat = agak_telat + sangat_telat
    pct_potensi = (potensi_terlambat / total_warga) * 100
    nilai_berisiko = potensi_terlambat * nominal

    print("\nüí° INSIGHT:")
    print(f"   ‚Ä¢ Potensi Keterlambatan: {potensi_terlambat} warga ({pct_potensi:.1f}%)")
    print(f"   ‚Ä¢ Nilai Berisiko: Rp {nilai_berisiko:,.0f}")

    # Trend against the preceding 30 days, shown only when the history
    # carries a 'Risk_Score' column.
    # NOTE(review): assumes df_history['Timestamp'] is datetime-typed — confirm.
    bulan_lalu = tanggal_transaksi - timedelta(days=30)
    hist_bulan_lalu = df_history[
        (df_history['Timestamp'] >= bulan_lalu) &
        (df_history['Timestamp'] < tanggal_transaksi)
    ]

    if len(hist_bulan_lalu) > 0 and 'Risk_Score' in hist_bulan_lalu.columns:
        avg_risk_bulan_lalu = hist_bulan_lalu['Risk_Score'].mean()
        perubahan = avg_risk - avg_risk_bulan_lalu

        if perubahan > 0:
            trend = f"‚Üó Naik {abs(perubahan):.1f}%"
            emoji_trend = "‚ö†Ô∏è"
        elif perubahan < 0:
            trend = f"‚Üò Turun {abs(perubahan):.1f}%"
            emoji_trend = "‚úÖ"
        else:
            trend = "‚Üí Stabil"
            emoji_trend = "‚û°Ô∏è"

        print(f"   ‚Ä¢ Trend vs Bulan Lalu: {emoji_trend} {trend}")

    # --- Recommendations, keyed on the overall level computed above ---
    print("\nüíº REKOMENDASI STRATEGI:")

    if tingkat_risiko == "TINGGI":
        print("   üî¥ PRIORITAS TINGGI:")
        print("      ‚Ä¢ Lakukan reminder intensif mulai dari sekarang")
        print("      ‚Ä¢ Siapkan tim follow-up untuk warga berisiko tinggi")
        print("      ‚Ä¢ Pertimbangkan insentif pembayaran awal")
        print("      ‚Ä¢ Aktifkan sistem reminder otomatis harian")

    elif tingkat_risiko == "SEDANG":
        print("   üü° PRIORITAS SEDANG:")
        print("      ‚Ä¢ Kirim reminder rutin (3-5 hari sekali)")
        print("      ‚Ä¢ Monitor perkembangan pembayaran secara berkala")
        print("      ‚Ä¢ Siapkan daftar warga yang perlu di-follow up")

    else:
        print("   üü¢ PRIORITAS RENDAH:")
        print("      ‚Ä¢ Reminder standar sudah cukup")
        print("      ‚Ä¢ Tingkat kepatuhan pembayaran baik")
        print("      ‚Ä¢ Pertahankan komunikasi yang baik dengan warga")

    # --- Reminder timing, based on the day of month of the bill ---
    print("\n‚è∞ WAKTU OPTIMAL REMINDER:")
    if tanggal_transaksi.day >= 25:
        print("   ‚Ä¢ Mulai reminder: SEGERA (akhir bulan)")
    elif tanggal_transaksi.day <= 5:
        print("   ‚Ä¢ Mulai reminder: 2-3 hari sebelum deadline")
    else:
        # Seven days earlier, clamped to the first of the month.
        hari_optimal = max(tanggal_transaksi.day - 7, 1)
        print(f"   ‚Ä¢ Mulai reminder: Tanggal {hari_optimal} (H-7)")

    print("   ‚Ä¢ Frekuensi: Setiap 2-3 hari")
    print("   ‚Ä¢ Intensif: 2 hari sebelum deadline")

    print("\n" + "="*70)

    # --- Optional CSV export (scores and raw model labels, no identities) ---
    export = input("\nüíæ Simpan laporan detail ke CSV? (y/n): ")
    if export.lower() == 'y':
        df_hasil = pd.DataFrame({
            'Tanggal_Analisis': [tanggal_transaksi.strftime('%Y-%m-%d')] * total_warga,
            'Risk_Score': risk_scores,
            'Kategori': kategori_list
        })

        filename = f"analisis_risiko_{tanggal_transaksi.strftime('%Y%m%d')}.csv"
        df_hasil.to_csv(filename, index=False)
        print(f"‚úÖ Laporan disimpan ke: {filename}")

    return {
        'tanggal': tanggal_transaksi,
        'nominal': nominal,
        'jenis_transaksi': jenis_transaksi,
        'total_warga': total_warga,
        'avg_risk': avg_risk,
        'tingkat_risiko': tingkat_risiko,
        'distribusi': {
            'tepat_waktu': tepat_waktu,
            'agak_telat': agak_telat,
            'sangat_telat': sangat_telat
        }
    }

# =====================================================
# RUN MAIN PROGRAM
# =====================================================

if __name__ == "__main__":
    # Program banner, emitted in a single call.
    print(
        "\n" + "="*70,
        " " * 10 + "SISTEM ANALISIS RISIKO KETERLAMBATAN PEMBAYARAN",
        "="*70,
        sep="\n",
    )

    # Run one analysis per pass until the user answers anything but 'y'.
    while True:
        print("\n" + "‚îÄ"*70)
        input("Tekan ENTER untuk memulai analisis...")

        hasil = analisis_risiko_tagihan()

        lanjut = input("\nüîÑ Lakukan analisis lagi? (y/n): ")
        if lanjut.lower() == 'y':
            continue

        print("\n‚úÖ Terima kasih! Program selesai.")
        print("="*70)
        break

‚úÖ Models & metadata loaded!
   - Total features: 25
   - Target classes: ['Mendekati_Deadline' 'Tepat_Waktu' 'Terlambat']
‚úÖ Warga mapping loaded!
‚úÖ Historical data & mappings loaded!


          SISTEM ANALISIS RISIKO KETERLAMBATAN PEMBAYARAN

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
Tekan ENTER untuk memulai analisis...

               ANALISIS RISIKO KETERLAMBATAN PEMBAYARAN

üìÖ Tanggal Tagihan (YYYY-MM-DD) atau 'today': 2025
‚ùå Format salah! Gunakan YYYY-MM-DD (contoh: 2025-01-15)

üìÖ Tanggal Tagihan (YYYY-MM-DD) atau 'today': 2025-12-9
üí∞ Nominal Iuran per Warga (Rp): 25000

üì± Jenis Transaksi:
   1. TopUp
   2. QRIS
   3. Transfer
   Pilih (1/2/3): 3

‚è≥ Menganalisis data 156 warga...

                         HASIL ANALISIS

üìã INFORMASI TAGIHAN:
   Tanggal         : 09 December 2025
   Nominal 