In [None]:
# ===========================
# 1. IMPORT LIBRARIES
# ===========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Model Klasifikasi
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC

# Evaluasi Model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

import warnings
warnings.filterwarnings('ignore')  # suppress every warning category for the rest of the session

# Plot style shared by all figures in the notebook
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)  # default figure size (width, height) in inches

print("‚úì Library import successful!")


In [None]:
# ===========================
# UTILITIES: EVALUATION FUNCTION
# ===========================

def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted classifier on a hold-out set, print a report and return it.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : feature matrix handed to ``model.predict``.
    y_test : true labels the predictions are compared against.
    model_name : str
        Heading of the printed report; also stored under the ``'Model'`` key.

    Returns
    -------
    dict
        Keys ``'Model'``, ``'Accuracy'``, ``'Precision'``, ``'Recall'``,
        ``'F1-Score'`` and ``'Confusion_Matrix'``.
    """
    predictions = model.predict(X_test)

    # Compute the four scalar metrics first, in a fixed order, then report them.
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )
    summary = {'Model': model_name}
    for label, scorer in scorers:
        summary[label] = scorer(y_test, predictions)

    print(f"\n{model_name}")
    for label, _ in scorers:
        # Pad "<label>:" to ten characters so the values line up in one column.
        print(f"  {label + ':':<10} {summary[label]:.4f}")

    summary['Confusion_Matrix'] = confusion_matrix(y_test, predictions)
    print(f"  Confusion Matrix:\n{summary['Confusion_Matrix']}")

    return summary


In [None]:
# ===========================
# A.1 LOAD DATA DAN EKSPLORASI AWAL
# ===========================

# Download the Telco customer-churn dataset from Kaggle
import kagglehub
path = kagglehub.dataset_download("blastchar/telco-customer-churn")
csv_path = f"{path}/WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Read the CSV into the working frame used by every later cell
df = pd.read_csv(csv_path)

print("=" * 80)
print("EKSPLORASI AWAL DATASET")
print("=" * 80)

# 1. First five rows
print("\n1Ô∏è‚É£ Lima Baris Pertama Dataset:")
print(df.head())

# 2. Column dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() would append a stray "None" line after the report.
print("\n2Ô∏è‚É£ Informasi Dataset:")
df.info()

# 3. Descriptive statistics of the numeric columns
print("\n3Ô∏è‚É£ Statistik Deskriptif:")
print(df.describe())

print(f"\nüìä Dimensi Dataset: {df.shape[0]} baris, {df.shape[1]} kolom")


In [None]:
# ===========================
# A.2 IDENTIFIKASI MISSING VALUE
# ===========================

print("\n" + "=" * 80, "IDENTIFIKASI MISSING VALUE", "=" * 80, sep="\n")

# Per-column null counts, computed once and reused for the table and the chart
null_counts = df.isnull().sum()

missing_data = pd.DataFrame({
    'Kolom': df.columns,
    'Missing_Count': null_counts.values,
    'Missing_Percentage': (null_counts.values / len(df) * 100).round(2),
})
# Keep only the columns that actually contain nulls, worst first
missing_data = (
    missing_data.loc[missing_data['Missing_Count'] > 0]
    .sort_values('Missing_Percentage', ascending=False)
)

if missing_data.empty:
    print("\n‚úì Tidak ada missing value dalam dataset!")
else:
    print("\nMissing Value Ditemukan:")
    print(missing_data)

    # Horizontal bar chart of the null count per affected column
    fig, ax = plt.subplots(figsize=(10, 6))
    missing_data_plot = null_counts[null_counts > 0].sort_values(ascending=False)
    missing_data_plot.plot(kind='barh', ax=ax, color='coral')
    ax.set_xlabel('Jumlah Missing Value')
    ax.set_title('Distribusi Missing Value per Kolom')
    plt.tight_layout()
    plt.show()


In [None]:
# ===========================
# A.2B IDENTIFIKASI DAN HAPUS DUPLICATE DATA
# ===========================

print("\n" + "=" * 80, "IDENTIFIKASI DAN HAPUS DUPLICATE DATA", "=" * 80, sep="\n")

# Rows that are identical in every column
duplicate_all = df.duplicated().sum()
print(f"\n1Ô∏è‚É£ Duplikat (semua kolom): {duplicate_all} baris")

# Rows sharing a customerID (the unique identifier); every member of a group is flagged
if 'customerID' in df.columns:
    id_dup_mask = df.duplicated(subset=['customerID'], keep=False)
    duplicate_id = id_dup_mask.sum()
    print(f"2Ô∏è‚É£ Duplikat berdasarkan customerID: {duplicate_id} baris")

    if duplicate_id > 0:
        print("\n   Menampilkan duplikat customerID:")
        dup_customers = df[id_dup_mask].sort_values('customerID')
        print(dup_customers[['customerID', 'tenure', 'MonthlyCharges']].head(10))

# Drop the fully identical rows and report how many disappeared
print("\n3Ô∏è‚É£ Menghapus duplikat...")
df_before = len(df)
df = df.drop_duplicates()
df_after = len(df)
rows_removed = df_before - df_after

print(
    f"   Baris sebelum: {df_before}",
    f"   Baris sesudah: {df_after}",
    f"   ‚úì Duplikat yang dihapus: {rows_removed} baris",
    sep="\n",
)

# Second pass: keep only the first row of any customerID that still repeats
if 'customerID' in df.columns:
    df = df.drop_duplicates(subset=['customerID'], keep='first')
    print(f"   ‚úì Dataset setelah drop duplikat customerID: {len(df)} baris")

print(f"\n‚úì Dataset cleaning complete! Shape: {df.shape}")


In [None]:
# ===========================
# A.2C PEMBERSIHAN OUTLIER FITUR NUMERIK
# ===========================

print("\n" + "=" * 80)
print("PEMBERSIHAN OUTLIER FITUR NUMERIK")
print("=" * 80)

# Numeric-dtype columns; 'Churn' is excluded by name in case it is already numeric
numeric_cols = [col for col in df.select_dtypes(include=[np.number]).columns if col not in ['Churn']]

# Z-score method for outlier detection
from scipy.stats import zscore

outlier_indices = set()  # index LABELS of the rows to drop
z_thresh = 3  # |z| above this value marks a row as an outlier

for col in numeric_cols:
    z_scores = zscore(df[col])
    outlier_mask = np.asarray(np.abs(z_scores) > z_thresh)
    # Translate the boolean mask into index labels. DataFrame.drop(index=...)
    # works on labels, and after drop_duplicates() the labels are no longer
    # guaranteed to equal the row positions that np.where() would return.
    outliers = df.index[outlier_mask]
    if len(outliers) > 0:
        print(f"Fitur {col}: {len(outliers)} outlier")
        outlier_indices.update(outliers.tolist())

print(f"Total baris outlier yang terdeteksi: {len(outlier_indices)}")

# Drop the flagged rows, then renumber so later cells see a contiguous index.
# NOTE: this cell reassigns `df`, so running it twice filters the data twice.
df_before = len(df)
df = df.drop(index=list(outlier_indices)).reset_index(drop=True)
df_after = len(df)
print(f"Baris sebelum: {df_before}")
print(f"Baris sesudah: {df_after}")
print(f"‚úì Outlier berhasil dibersihkan!")

# Re-check the dataset shape
print(f"\nShape dataset setelah cleaning outlier: {df.shape}")

In [None]:
# ===========================
# A.3 VISUALISASI DISTRIBUSI TARGET (CHURN)
# ===========================

print("\n" + "=" * 80)
print("ANALISIS VARIABEL TARGET (CHURN)")
print("=" * 80)

# Class counts and percentages of the target column
churn_counts = df['Churn'].value_counts()
churn_percentage = df['Churn'].value_counts(normalize=True) * 100

print("\nDistribusi Target Churn:")
print(f"  No:  {churn_counts['No']} ({churn_percentage['No']:.2f}%)")
print(f"  Yes: {churn_counts['Yes']} ({churn_percentage['Yes']:.2f}%)")

# Two panels: absolute counts (bar) and proportions (pie)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# value_counts() orders the classes by frequency, not by name. Tick labels and
# colours are therefore looked up from the actual index instead of assuming the
# order ['No', 'Yes'], which would mislabel the bars whenever 'Yes' is the
# majority class.
bar_palette = {'No': 'green', 'Yes': 'red'}
bar_colors = [bar_palette.get(label, 'gray') for label in churn_counts.index]
churn_counts.plot(kind='bar', ax=axes[0], color=bar_colors, alpha=0.7)
axes[0].set_title('Distribusi Churn (Count)', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Jumlah Pelanggan')
axes[0].set_xlabel('Churn Status')
axes[0].set_xticklabels(churn_counts.index, rotation=0)

# Pie chart, coloured with the same label -> colour lookup
pie_palette = {'No': '#2ecc71', 'Yes': '#e74c3c'}
colors = [pie_palette.get(label, '#95a5a6') for label in churn_counts.index]
axes[1].pie(churn_counts.values, labels=churn_counts.index, autopct='%1.1f%%',
            colors=colors, startangle=90)
axes[1].set_title('Proporsi Churn', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

# Class-imbalance check: ratio of churners to non-churners
imbalance_ratio = churn_counts['Yes'] / churn_counts['No']
print(f"\n‚ö†Ô∏è Class Imbalance Ratio: {imbalance_ratio:.3f}")
if imbalance_ratio < 0.3:  # below this ratio the notebook reports significant imbalance
    print("   Status: Dataset memiliki imbalance yang signifikan")
else:
    print("   Status: Dataset cukup seimbang")


In [None]:
# ===========================
# A.4B VERSI DATASET: NORMAL, UNDERSAMPLING, OVERSAMPLING
# ===========================

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

print("\n" + "=" * 80)
print("MEMBUAT VERSI DATASET: NORMAL, UNDERSAMPLING, OVERSAMPLING")
print("=" * 80)

# 1. Normal dataset (no resampling)
df_normal = df.copy()
print(f"\n1Ô∏è‚É£ Dataset Normal: {df_normal.shape}")
print(f"   Churn distribution:\n{df_normal['Churn'].value_counts()}")

# The resamplers below are fed an all-numeric feature matrix, so encode first
df_temp = df_normal.copy()
y_temp = df_temp['Churn']
X_temp = df_temp.drop(['Churn', 'customerID'], axis=1, errors='ignore')

X_encoded = X_temp.copy()

# Convert TotalCharges to numbers BEFORE label-encoding the object columns.
# Done the other way round, the encoder turns the charge strings into arbitrary
# category codes and the dtype check below never sees an object column again.
# The filled Series is assigned back instead of using fillna(inplace=True) on a
# column selection, which does not update the frame under pandas Copy-on-Write.
if 'TotalCharges' in X_encoded.columns and X_encoded['TotalCharges'].dtype == 'object':
    total_charges = pd.to_numeric(X_encoded['TotalCharges'], errors='coerce')
    X_encoded['TotalCharges'] = total_charges.fillna(total_charges.median())

# Simple label encoding for the remaining categorical (object) columns
for col in X_encoded.columns:
    if X_encoded[col].dtype == 'object':
        X_encoded[col] = LabelEncoder().fit_transform(X_encoded[col])

# NOTE(review): both resampled frames are built from the full data and are only
# split into train/test by later cells, so resampled or synthetic rows can land
# in the test split. Consider resampling the training split only.

# 2. Undersampled dataset
print("\n2Ô∏è‚É£ Undersampling...")
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_encoded, y_temp)
df_undersample = pd.concat([X_rus.reset_index(drop=True), y_rus.reset_index(drop=True)], axis=1)
print(f"   Shape: {df_undersample.shape}")
print(f"   Churn distribution:\n{df_undersample['Churn'].value_counts()}")

# 3. Oversampled dataset (SMOTE)
print("\n3Ô∏è‚É£ Oversampling (SMOTE)...")
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_encoded, y_temp)
df_oversample = pd.concat([X_smote.reset_index(drop=True), y_smote.reset_index(drop=True)], axis=1)
print(f"   Shape: {df_oversample.shape}")
print(f"   Churn distribution:\n{df_oversample['Churn'].value_counts()}")

print("\n" + "=" * 80)
print("‚úì Semua versi dataset siap!")
print("=" * 80)
print(f"Normal:        {df_normal.shape[0]:,} rows")
print(f"Undersample:   {df_undersample.shape[0]:,} rows")
print(f"Oversample:    {df_oversample.shape[0]:,} rows")


In [None]:
# ===========================
# A.4 ANALISIS KORELASI
# ===========================

print("\n" + "=" * 80, "ANALISIS KORELASI FITUR NUMERIK", "=" * 80, sep="\n")

# Every numeric-dtype column takes part in the correlation analysis
numeric_features = list(df.select_dtypes(include=[np.number]).columns)
print(f"\nFitur Numerik yang Ditemukan: {numeric_features}")

# Pairwise correlation matrix of the numeric features
correlation_matrix = df[numeric_features].corr()

# Annotated heatmap of the matrix, colour scale centred on zero
heatmap_options = dict(
    annot=True, fmt='.2f', cmap='coolwarm',
    center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8},
)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Heatmap Korelasi Fitur Numerik', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Correlation of each numeric feature with the target, when the target exists
if 'Churn' in df.columns:
    # Work on a copy carrying a 0/1 version of the target ('Yes' -> 1)
    df_temp = df.copy()
    df_temp['Churn_encoded'] = df_temp['Churn'].eq('Yes').astype(int)

    corr_with_target = df_temp[numeric_features + ['Churn_encoded']].corr()
    target_correlation = corr_with_target['Churn_encoded'].sort_values(ascending=False)
    print("\nKorelasi dengan Churn (Target):")
    print(target_correlation)


In [None]:
# ===========================
# B.4 EVALUASI MODEL UNTUK SEMUA TIPE DATASET
# ===========================

# Helper untuk encode dan split dataset balancing
from sklearn.utils import shuffle

def prepare_balanced_data(df_bal):
    """Shuffle, encode and split one dataset variant for the "direct" models.

    Parameters
    ----------
    df_bal : pandas.DataFrame
        Frame containing a ``'Churn'`` column; ``'customerID'`` is dropped when
        present.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` from a stratified 80/20 split
        with ``random_state=42``.
    """
    df_bal = shuffle(df_bal, random_state=42).reset_index(drop=True)
    # Map 'Yes'/'No' to 1/0 while the target is still stored as strings
    y = (df_bal['Churn'] == 'Yes').astype(int) if df_bal['Churn'].dtype == 'object' else df_bal['Churn']
    X = df_bal.drop(['Churn', 'customerID'], axis=1, errors='ignore')
    X_encoded = X.copy()
    # Convert TotalCharges to numbers BEFORE label-encoding the object columns;
    # otherwise the encoder replaces the charge strings with arbitrary category
    # codes and this branch can never run. The filled Series is assigned back
    # rather than using fillna(inplace=True) on a column selection, which does
    # not update the frame under pandas Copy-on-Write.
    if 'TotalCharges' in X_encoded.columns and X_encoded['TotalCharges'].dtype == 'object':
        total_charges = pd.to_numeric(X_encoded['TotalCharges'], errors='coerce')
        X_encoded['TotalCharges'] = total_charges.fillna(total_charges.median())
    # Simple label encoding for the remaining categorical (object) columns
    for col in X_encoded.columns:
        if X_encoded[col].dtype == 'object':
            X_encoded[col] = LabelEncoder().fit_transform(X_encoded[col])
    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42, stratify=y)
    return X_train, X_test, y_train, y_test

# List dataset balancing
datasets = [
    ("Normal", df_normal),
    ("Undersampling", df_undersample),
    ("Oversampling", df_oversample),
]

# One result dict per (model, dataset) pair, in training order
results_balanced = []
for ds_name, ds in datasets:
    X_train, X_test, y_train, y_test = prepare_balanced_data(ds)

    # Logistic Regression on the label-encoded features
    lr = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
    res_lr = evaluate_model(lr, X_test, y_test, f"Logistic Regression (Direct) ({ds_name})")
    results_balanced.append(res_lr)

    # Random Forest
    rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
    res_rf = evaluate_model(rf, X_test, y_test, f"Random Forest (Direct) ({ds_name})")
    results_balanced.append(res_rf)

    # Soft-voting ensemble of Logistic Regression, RBF-kernel SVM and KNN
    voting = VotingClassifier(
        estimators=[
            ('lr', LogisticRegression(random_state=42, max_iter=1000)),
            ('svm', SVC(kernel='rbf', random_state=42, probability=True)),
            ('knn', KNeighborsClassifier(n_neighbors=5)),
        ],
        voting='soft',
    ).fit(X_train, y_train)
    res_voting = evaluate_model(voting, X_test, y_test, f"Voting Classifier (Direct) ({ds_name})")
    results_balanced.append(res_voting)

# Collect the results into one table and show the scalar metrics
all_results_balanced = pd.DataFrame(results_balanced)
print("\n\nüìä RINGKASAN HASIL SEMUA DATASET BALANCING:")
summary_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score']
print(all_results_balanced[summary_columns])

In [None]:
# ===========================
# B.5 EVALUASI MODEL PREPROCESSING & TUNED UNTUK SEMUA TIPE DATASET
# ===========================

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Hyperparameter grids
# Hyperparameter grids explored by GridSearchCV in the "Tuned" section below
lr_params = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear']
}
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

results_all = []
for ds_name, ds in datasets:
    # --- Preprocessing ---
    df_prep = ds.copy()
    # Map 'Yes'/'No' to 1/0 while the target is still stored as strings
    y = (df_prep['Churn'] == 'Yes').astype(int) if df_prep['Churn'].dtype == 'object' else df_prep['Churn']
    X = df_prep.drop(['Churn', 'customerID'], axis=1, errors='ignore')
    # Split the columns by dtype: object -> one-hot, int64/float64 -> scaled
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    if 'TotalCharges' in X.columns and X['TotalCharges'].dtype == 'object':
        # Assign the filled Series back instead of calling fillna(inplace=True)
        # on a column selection, which does not update the frame under pandas
        # Copy-on-Write.
        total_charges = pd.to_numeric(X['TotalCharges'], errors='coerce')
        X['TotalCharges'] = total_charges.fillna(total_charges.median())
        num_cols.append('TotalCharges')
        if 'TotalCharges' in cat_cols:
            cat_cols.remove('TotalCharges')
    # Preprocessing pipeline. Dense output is requested through
    # sparse_threshold=0 on the ColumnTransformer rather than through
    # OneHotEncoder(sparse=False): that keyword was renamed to sparse_output
    # and later removed from scikit-learn, so it raises TypeError on current
    # releases.
    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols)
    ], sparse_threshold=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Logistic Regression (Preprocessing)
    lr_pipe = make_pipeline(preprocessor, LogisticRegression(random_state=42, max_iter=1000))
    lr_pipe.fit(X_train, y_train)
    res_lr_prep = evaluate_model(lr_pipe, X_test, y_test, f"Logistic Regression (Preprocessing) ({ds_name})")
    results_all.append(res_lr_prep)
    # Random Forest (Preprocessing)
    rf_pipe = make_pipeline(preprocessor, RandomForestClassifier(n_estimators=100, random_state=42))
    rf_pipe.fit(X_train, y_train)
    res_rf_prep = evaluate_model(rf_pipe, X_test, y_test, f"Random Forest (Preprocessing) ({ds_name})")
    results_all.append(res_rf_prep)
    # Voting Classifier (Preprocessing)
    voting_pipe = make_pipeline(preprocessor, VotingClassifier([
        ('lr', LogisticRegression(random_state=42, max_iter=1000)),
        ('svm', SVC(kernel='rbf', random_state=42, probability=True)),
        ('knn', KNeighborsClassifier(n_neighbors=5))
    ], voting='soft'))
    voting_pipe.fit(X_train, y_train)
    res_voting_prep = evaluate_model(voting_pipe, X_test, y_test, f"Voting Classifier (Preprocessing) ({ds_name})")
    results_all.append(res_voting_prep)

    # --- Tuned ---
    # The grid search sits behind the preprocessor, so the preprocessor is
    # fitted once on the whole training split and the 5-fold CV (scored on F1)
    # runs on the transformed matrix. The winning estimator is then wrapped in
    # a fresh pipeline and refitted before evaluation.
    # Logistic Regression (Tuned)
    lr_grid = make_pipeline(preprocessor, GridSearchCV(LogisticRegression(random_state=42, max_iter=1000), lr_params, cv=5, scoring='f1', n_jobs=-1))
    lr_grid.fit(X_train, y_train)
    best_lr = lr_grid.named_steps['gridsearchcv'].best_estimator_
    tuned_lr_pipe = make_pipeline(preprocessor, best_lr)
    tuned_lr_pipe.fit(X_train, y_train)
    res_lr_tuned = evaluate_model(tuned_lr_pipe, X_test, y_test, f"Logistic Regression (Tuned) ({ds_name})")
    results_all.append(res_lr_tuned)
    # Random Forest (Tuned)
    rf_grid = make_pipeline(preprocessor, GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5, scoring='f1', n_jobs=-1))
    rf_grid.fit(X_train, y_train)
    best_rf = rf_grid.named_steps['gridsearchcv'].best_estimator_
    tuned_rf_pipe = make_pipeline(preprocessor, best_rf)
    tuned_rf_pipe.fit(X_train, y_train)
    res_rf_tuned = evaluate_model(tuned_rf_pipe, X_test, y_test, f"Random Forest (Tuned) ({ds_name})")
    results_all.append(res_rf_tuned)
    # Voting Classifier (Tuned): the base estimators use fixed settings here;
    # no grid search is run for the ensemble.
    voting_tuned = VotingClassifier([
        ('lr', LogisticRegression(C=1, random_state=42, max_iter=1000)),
        ('svm', SVC(kernel='rbf', C=1, gamma='scale', random_state=42, probability=True)),
        ('knn', KNeighborsClassifier(n_neighbors=7))
    ], voting='soft')
    voting_tuned_pipe = make_pipeline(preprocessor, voting_tuned)
    voting_tuned_pipe.fit(X_train, y_train)
    res_voting_tuned = evaluate_model(voting_tuned_pipe, X_test, y_test, f"Voting Classifier (Tuned) ({ds_name})")
    results_all.append(res_voting_tuned)

# Combine the "Direct" results with the preprocessing and tuned results
all_results_full = pd.concat([all_results_balanced, pd.DataFrame(results_all)], ignore_index=True)
print("\n\nüìä RINGKASAN HASIL SEMUA MODEL DAN DATASET:")
print(all_results_full[['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score']])

In [None]:
# ===========================
# C.1 DATA PREPROCESSING
# ===========================

print("\n" + "=" * 80)
print("PREPROCESSING DATA")
print("=" * 80)

df_prep = df.copy()

# 1. Handle TotalCharges (convert to numeric, fill unparseable entries with the median).
# The filled Series is assigned back instead of calling fillna(inplace=True) on
# a column selection, which does not update the frame under pandas Copy-on-Write.
print("\n1Ô∏è‚É£ Handling TotalCharges...")
df_prep['TotalCharges'] = pd.to_numeric(df_prep['TotalCharges'], errors='coerce')
df_prep['TotalCharges'] = df_prep['TotalCharges'].fillna(df_prep['TotalCharges'].median())
print("   ‚úì TotalCharges converted to numeric")

# 2. Drop irrelevant columns
print("\n2Ô∏è‚É£ Dropping irrelevant columns...")
df_prep = df_prep.drop(['customerID'], axis=1)
print("   ‚úì customerID dropped")

# 3. Encode the target ('Yes' -> 1, anything else -> 0)
print("\n3Ô∏è‚É£ Encoding target variable...")
df_prep['Churn'] = (df_prep['Churn'] == 'Yes').astype(int)
print("   ‚úì Churn encoded")

# 4. Separate numeric and categorical FEATURES. The encoded target is numeric
# by now, so it is excluded by name to keep it out of the feature list.
print("\n4Ô∏è‚É£ Separating numeric and categorical features...")
numeric_cols = [
    col for col in df_prep.select_dtypes(include=['int64', 'float64']).columns
    if col != 'Churn'
]
categorical_cols = df_prep.select_dtypes(include=['object']).columns.tolist()

print(f"   Numeric columns: {numeric_cols}")
print(f"   Categorical columns: {categorical_cols}")

# 5. One-hot encode the categorical columns
print("\n5Ô∏è‚É£ Applying One-Hot Encoding...")
df_prep_encoded = pd.get_dummies(df_prep, columns=categorical_cols, drop_first=True)
print(f"   ‚úì Features after encoding: {df_prep_encoded.shape[1]}")

# 6. Separate X and y
y_prep = df_prep_encoded['Churn']
X_prep = df_prep_encoded.drop(['Churn'], axis=1)

# Split BEFORE scaling so the scaler statistics come from the training rows
# only; fitting the scaler on every row leaks test-set information into training.
X_train_raw, X_test_raw, y_train_prep, y_test_prep = train_test_split(
    X_prep, y_prep, test_size=0.2, random_state=42, stratify=y_prep
)

# 7. Scaling: fit on the training split, apply to both splits. The original
# index is carried over so the features stay aligned with the targets.
print("\n6Ô∏è‚É£ Feature Scaling (StandardScaler)...")
scaler = StandardScaler()
X_train_prep = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X_prep.columns, index=X_train_raw.index
)
X_test_prep = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X_prep.columns, index=X_test_raw.index
)
# Full scaled matrix kept under its existing name, now using training-split statistics
X_prep_scaled = pd.DataFrame(scaler.transform(X_prep), columns=X_prep.columns, index=X_prep.index)
print("   ‚úì Features scaled")

print(f"\n‚úì Preprocessing complete!")
print(f"  Training set: {X_train_prep.shape}")
print(f"  Testing set: {X_test_prep.shape}")
print(f"  Total features: {X_train_prep.shape[1]}")


In [None]:
# ===========================
# C.2 TRAINING MODELS WITH PREPROCESSING (INTEGRATED)
# ===========================

# Informational cell: prints where training happens; it trains nothing itself.
print("\n".join([
    "\n" + "=" * 80,
    "TRAINING MODELS WITH PREPROCESSING (DIINTEGRASIKAN DI EVALUASI LENGKAP)",
    "=" * 80,
    "\nTraining untuk Logistic Regression, Random Forest, dan Voting Classifier",
    "dilakukan per-dataset (Normal/Undersampling/Oversampling) di bagian B.5",
    "menggunakan pipeline (ColumnTransformer + StandardScaler + OneHotEncoder).",
    "Cell ini tidak melakukan training langsung untuk menghindari duplikasi.",
    "\nReferensi konfigurasi base models:",
    " - Logistic Regression: random_state=42, max_iter=1000",
    " - Random Forest: n_estimators=100, random_state=42",
    " - Voting Classifier: LR + SVM(rbf) + KNN, voting='soft'",
    "\n‚úì Silakan lihat B.5 untuk proses training lintas dataset.",
]))


In [None]:
# ===========================
# C.3 EVALUASI MODEL (PREPROCESSING - SEMUA DATASET)
# ===========================

print("\n" + "=" * 80)
print("HASIL EVALUASI - PREPROCESSING ACROSS ALL DATASETS")
print("=" * 80)

# Reuse the complete results table when an earlier cell produced it; otherwise
# recompute only the "(Preprocessing)" rows for every dataset variant.
if 'all_results_full' in globals():
    prep_results = all_results_full[all_results_full['Model'].str.contains('(Preprocessing)', regex=False)].copy()
    print("\n‚úì Menggunakan hasil preprocessing dari evaluasi lengkap (27 kombinasi).")
else:
    print("\n‚ö†Ô∏è all_results_full belum tersedia; menghitung ulang hasil preprocessing untuk semua dataset...")
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler
    from sklearn.pipeline import make_pipeline

    # Prepare the dataset variants if they do not exist yet
    if 'df_normal' not in globals():
        df_normal = df.copy()
    if 'df_undersample' not in globals() or 'df_oversample' not in globals():
        # SMOTE needs an all-numeric, NaN-free matrix, so the raw frame is
        # encoded first: TotalCharges to numbers (median-filled), then label
        # codes for the remaining object columns. Feeding it the raw string
        # columns fails.
        y_raw = df_normal['Churn']
        X_raw = df_normal.drop(['Churn', 'customerID'], axis=1, errors='ignore').copy()
        if 'TotalCharges' in X_raw.columns and X_raw['TotalCharges'].dtype == 'object':
            raw_charges = pd.to_numeric(X_raw['TotalCharges'], errors='coerce')
            X_raw['TotalCharges'] = raw_charges.fillna(raw_charges.median())
        for col in X_raw.select_dtypes(include=['object']).columns:
            X_raw[col] = LabelEncoder().fit_transform(X_raw[col])

        rus = RandomUnderSampler(random_state=42)
        X_rus, y_rus = rus.fit_resample(X_raw, y_raw)
        df_undersample = pd.concat([X_rus, y_rus], axis=1)
        smote = SMOTE(random_state=42)
        X_smote, y_smote = smote.fit_resample(X_raw, y_raw)
        df_oversample = pd.concat([X_smote, y_smote], axis=1)

    datasets = [
        ("Normal", df_normal),
        ("Undersampling", df_undersample),
        ("Oversampling", df_oversample)
    ]

    prep_rows = []
    for ds_name, ds in datasets:
        df_prep = ds.copy()
        # Map 'Yes'/'No' to 1/0 while the target is still stored as strings
        y = (df_prep['Churn'] == 'Yes').astype(int) if df_prep['Churn'].dtype == 'object' else df_prep['Churn']
        X = df_prep.drop(['Churn', 'customerID'], axis=1, errors='ignore')

        cat_cols = X.select_dtypes(include=['object']).columns.tolist()
        num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
        if 'TotalCharges' in X.columns and X['TotalCharges'].dtype == 'object':
            # Assign the filled Series back; fillna(inplace=True) on a column
            # selection does not update the frame under pandas Copy-on-Write.
            total_charges = pd.to_numeric(X['TotalCharges'], errors='coerce')
            X['TotalCharges'] = total_charges.fillna(total_charges.median())
            if 'TotalCharges' not in num_cols:
                num_cols.append('TotalCharges')
            if 'TotalCharges' in cat_cols:
                cat_cols.remove('TotalCharges')

        # Dense output is requested through sparse_threshold=0 rather than
        # OneHotEncoder(sparse=False); that keyword was renamed to
        # sparse_output and later removed from scikit-learn.
        preprocessor = ColumnTransformer([
            ('num', StandardScaler(), num_cols),
            ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols)
        ], sparse_threshold=0)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

        # Logistic Regression (Preprocessing)
        lr_pipe = make_pipeline(preprocessor, LogisticRegression(random_state=42, max_iter=1000))
        lr_pipe.fit(X_train, y_train)
        prep_rows.append(evaluate_model(lr_pipe, X_test, y_test, f"Logistic Regression (Preprocessing) ({ds_name})"))

        # Random Forest (Preprocessing)
        rf_pipe = make_pipeline(preprocessor, RandomForestClassifier(n_estimators=100, random_state=42))
        rf_pipe.fit(X_train, y_train)
        prep_rows.append(evaluate_model(rf_pipe, X_test, y_test, f"Random Forest (Preprocessing) ({ds_name})"))

        # Voting Classifier (Preprocessing)
        voting_pipe = make_pipeline(preprocessor, VotingClassifier([
            ('lr', LogisticRegression(random_state=42, max_iter=1000)),
            ('svm', SVC(kernel='rbf', random_state=42, probability=True)),
            ('knn', KNeighborsClassifier(n_neighbors=5))
        ], voting='soft'))
        voting_pipe.fit(X_train, y_train)
        prep_rows.append(evaluate_model(voting_pipe, X_test, y_test, f"Voting Classifier (Preprocessing) ({ds_name})"))

    prep_results = pd.DataFrame(prep_rows)

# Summary of the preprocessing results, best F1 first
prep_sorted = prep_results.sort_values('F1-Score', ascending=False)
print("\n\nüìä RINGKASAN HASIL PREPROCESSING (3 model √ó 3 dataset):")
print(prep_sorted[['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score']])

# Comparison chart: one panel per metric on a 2x2 grid, filled row by row
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
for idx, (ax, metric) in enumerate(zip(axes.flat, metrics)):
    bar_colors = ['#3498db'] * len(prep_sorted)
    ax.bar(prep_sorted['Model'], prep_sorted[metric], color=bar_colors)
    ax.set_title(f'{metric} - Preprocessing (All Datasets)', fontweight='bold')
    ax.set_ylim([0, 1])
    ax.tick_params(axis='x', rotation=90)
    # Write each value just above its bar
    for i, v in enumerate(prep_sorted[metric]):
        ax.text(i, v + 0.02, f'{v:.3f}', ha='center', fontweight='bold', fontsize=8)

plt.tight_layout()
plt.show()


In [None]:
# ===========================
# D.1 HYPERPARAMETER TUNING (INTEGRATED)
# ===========================

# Informational header plus the fixed "tuned" settings used by the fallback path
print("\n".join([
    "\n" + "=" * 80,
    "HYPERPARAMETER TUNING (DIINTEGRASIKAN KE EVALUASI LENGKAP)",
    "=" * 80,
    "\nTuning dilakukan per-dataset di bagian evaluasi lengkap (B.5) ",
    "menggunakan pipeline + GridSearchCV untuk Logistic Regression dan Random Forest.",
    "Ringkasan hasil tuned lintas dataset tersedia di bagian D.2.",
    "\nKonfigurasi tuned yang digunakan (fallback cepat):",
    " - Logistic Regression: C=1, penalty='l2', solver='lbfgs'",
    " - Random Forest: n_estimators=200, max_depth=20, random_state=42",
    " - Voting Classifier: LR(C=1), SVM(rbf, C=1, gamma='scale'), KNN(n_neighbors=7)",
]))

# Unfitted model objects carrying the settings printed above (reusable later)
lr_tuned = LogisticRegression(C=1, random_state=42, max_iter=1000)
rf_tuned = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
voting_tuned = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(C=1, random_state=42, max_iter=1000)),
        ('svm', SVC(kernel='rbf', C=1, gamma='scale', random_state=42, probability=True)),
        ('knn', KNeighborsClassifier(n_neighbors=7)),
    ],
    voting='soft',
)

print("\n‚úì Konfigurasi tuned siap. Lanjutkan ke D.2 untuk ringkasan hasil.")


In [None]:
# ===========================
# D.2 EVALUASI MODEL TUNED (SEMUA DATASET)
# ===========================

print("\n" + "=" * 80)
print("HASIL EVALUASI - TUNED ACROSS ALL DATASETS")
print("=" * 80)

# Reuse the complete results table when an earlier cell produced it; otherwise
# recompute only the "(Tuned)" rows. The fallback uses fixed hyperparameters
# and runs no grid search.
if 'all_results_full' in globals():
    tuned_results = all_results_full[all_results_full['Model'].str.contains('(Tuned)', regex=False)].copy()
    print("\n‚úì Menggunakan hasil tuned dari evaluasi lengkap (27 kombinasi).")
else:
    print("\n‚ö†Ô∏è all_results_full belum tersedia; menghitung ulang hasil tuned untuk semua dataset...")
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler
    from sklearn.pipeline import make_pipeline

    # Prepare the dataset variants if they do not exist yet
    if 'df_normal' not in globals():
        df_normal = df.copy()
    if 'df_undersample' not in globals() or 'df_oversample' not in globals():
        # SMOTE needs an all-numeric, NaN-free matrix, so the raw frame is
        # encoded first: TotalCharges to numbers (median-filled), then label
        # codes for the remaining object columns. Feeding it the raw string
        # columns fails.
        y_raw = df_normal['Churn']
        X_raw = df_normal.drop(['Churn', 'customerID'], axis=1, errors='ignore').copy()
        if 'TotalCharges' in X_raw.columns and X_raw['TotalCharges'].dtype == 'object':
            raw_charges = pd.to_numeric(X_raw['TotalCharges'], errors='coerce')
            X_raw['TotalCharges'] = raw_charges.fillna(raw_charges.median())
        for col in X_raw.select_dtypes(include=['object']).columns:
            X_raw[col] = LabelEncoder().fit_transform(X_raw[col])

        rus = RandomUnderSampler(random_state=42)
        X_rus, y_rus = rus.fit_resample(X_raw, y_raw)
        df_undersample = pd.concat([X_rus, y_rus], axis=1)
        smote = SMOTE(random_state=42)
        X_smote, y_smote = smote.fit_resample(X_raw, y_raw)
        df_oversample = pd.concat([X_smote, y_smote], axis=1)

    datasets = [
        ("Normal", df_normal),
        ("Undersampling", df_undersample),
        ("Oversampling", df_oversample)
    ]

    tuned_rows = []
    for ds_name, ds in datasets:
        df_prep = ds.copy()
        # Map 'Yes'/'No' to 1/0 while the target is still stored as strings
        y = (df_prep['Churn'] == 'Yes').astype(int) if df_prep['Churn'].dtype == 'object' else df_prep['Churn']
        X = df_prep.drop(['Churn', 'customerID'], axis=1, errors='ignore')

        cat_cols = X.select_dtypes(include=['object']).columns.tolist()
        num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
        if 'TotalCharges' in X.columns and X['TotalCharges'].dtype == 'object':
            # Assign the filled Series back; fillna(inplace=True) on a column
            # selection does not update the frame under pandas Copy-on-Write.
            total_charges = pd.to_numeric(X['TotalCharges'], errors='coerce')
            X['TotalCharges'] = total_charges.fillna(total_charges.median())
            if 'TotalCharges' not in num_cols:
                num_cols.append('TotalCharges')
            if 'TotalCharges' in cat_cols:
                cat_cols.remove('TotalCharges')

        # Dense output is requested through sparse_threshold=0 rather than
        # OneHotEncoder(sparse=False); that keyword was renamed to
        # sparse_output and later removed from scikit-learn.
        preprocessor = ColumnTransformer([
            ('num', StandardScaler(), num_cols),
            ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols)
        ], sparse_threshold=0)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

        # Tuned Logistic Regression
        lr_tuned = LogisticRegression(C=1, random_state=42, max_iter=1000)
        lr_tuned_pipe = make_pipeline(preprocessor, lr_tuned)
        lr_tuned_pipe.fit(X_train, y_train)
        tuned_rows.append(evaluate_model(lr_tuned_pipe, X_test, y_test, f"Logistic Regression (Tuned) ({ds_name})"))

        # Tuned Random Forest
        rf_tuned = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
        rf_tuned_pipe = make_pipeline(preprocessor, rf_tuned)
        rf_tuned_pipe.fit(X_train, y_train)
        tuned_rows.append(evaluate_model(rf_tuned_pipe, X_test, y_test, f"Random Forest (Tuned) ({ds_name})"))

        # Tuned Voting Classifier
        voting_tuned = VotingClassifier([
            ('lr', LogisticRegression(C=1, random_state=42, max_iter=1000)),
            ('svm', SVC(kernel='rbf', C=1, gamma='scale', random_state=42, probability=True)),
            ('knn', KNeighborsClassifier(n_neighbors=7))
        ], voting='soft')
        voting_tuned_pipe = make_pipeline(preprocessor, voting_tuned)
        voting_tuned_pipe.fit(X_train, y_train)
        tuned_rows.append(evaluate_model(voting_tuned_pipe, X_test, y_test, f"Voting Classifier (Tuned) ({ds_name})"))

    tuned_results = pd.DataFrame(tuned_rows)

# Summary of the tuned results, best F1 first
tuned_sorted = tuned_results.sort_values('F1-Score', ascending=False)
print("\n\nüìä RINGKASAN HASIL TUNED (3 model √ó 3 dataset):")
print(tuned_sorted[['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score']])

# Comparison chart: one panel per metric on a 2x2 grid, filled row by row
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
for idx, (ax, metric) in enumerate(zip(axes.flat, metrics)):
    bar_colors = ['#9b59b6'] * len(tuned_sorted)
    ax.bar(tuned_sorted['Model'], tuned_sorted[metric], color=bar_colors)
    ax.set_title(f'{metric} - Tuned (All Datasets)', fontweight='bold')
    ax.set_ylim([0, 1])
    ax.tick_params(axis='x', rotation=90)
    # Write each value just above its bar
    for i, v in enumerate(tuned_sorted[metric]):
        ax.text(i, v + 0.02, f'{v:.3f}', ha='center', fontweight='bold', fontsize=8)

plt.tight_layout()
plt.show()


In [None]:
# ===========================
# E. COMPREHENSIVE COMPARISON
# ===========================

print("\n" + "=" * 80)
print("PERBANDINGAN SEMUA 27 MODEL (3 model √ó 3 tipe √ó 3 dataset)")
print("=" * 80)

# Reuse the complete evaluation table when it already exists in the kernel;
# otherwise rebuild all 27 results (3 models x 3 processing types x 3 datasets).
try:
    # Looking the name up raises NameError when no earlier cell created it.
    all_results = all_results_full.copy()
    print("\n‚úì Menggunakan hasil evaluasi lengkap yang sudah ada.")
except NameError:
    print("\n‚ö†Ô∏è all_results_full belum tersedia; menghitung ulang semua 27 kombinasi...")
    from sklearn.utils import shuffle
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler
    from sklearn.pipeline import make_pipeline

    def _total_charges_to_numeric(frame):
        """Parse a text-typed 'TotalCharges' column of `frame` to floats, in place.

        Values that cannot be parsed become NaN and are replaced by the column
        median. The result is assigned back to the column instead of calling
        ``fillna(inplace=True)`` on a column selection, which no longer updates
        the frame when pandas Copy-on-Write is active.

        Returns True when the column was converted, False when it is missing
        or not object-typed.
        """
        if 'TotalCharges' not in frame.columns or frame['TotalCharges'].dtype != 'object':
            return False
        parsed = pd.to_numeric(frame['TotalCharges'], errors='coerce')
        frame['TotalCharges'] = parsed.fillna(parsed.median())
        return True

    def _one_hot_encoder():
        """Build the dense, drop-first OneHotEncoder used by every pipeline below.

        scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4 removed
        the old keyword, so the new name is tried first and the old one is the
        fallback for older installations.
        """
        try:
            return OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
        except TypeError:
            return OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')

    # Prepare the datasets if earlier cells did not leave them in the kernel.
    # Bare name lookups serve as existence checks so IPython's `_` is not overwritten.
    try:
        df_normal
    except NameError:
        df_normal = df.copy()
    try:
        df_undersample
        df_oversample
    except NameError:
        rus = RandomUnderSampler(random_state=42)
        X_rus, y_rus = rus.fit_resample(df_normal.drop('Churn', axis=1), df_normal['Churn'])
        df_undersample = pd.concat([X_rus, y_rus], axis=1)
        # NOTE(review): SMOTE interpolates between samples, so it presumably needs
        # purely numeric features; if df_normal still holds string columns here this
        # call is expected to fail -- confirm what the earlier resampling cell does.
        smote = SMOTE(random_state=42)
        X_smote, y_smote = smote.fit_resample(df_normal.drop('Churn', axis=1), df_normal['Churn'])
        df_oversample = pd.concat([X_smote, y_smote], axis=1)

    datasets = [
        ("Normal", df_normal),
        ("Undersampling", df_undersample),
        ("Oversampling", df_oversample)
    ]

    def prepare_balanced_data(df_bal):
        """Shuffle, encode and split one dataset for the "Direct" models.

        The target is mapped to 0/1 when 'Churn' is text. 'customerID' is dropped
        when present. A text-typed 'TotalCharges' is parsed to numbers *before*
        the remaining object columns are label-encoded; previously the label
        encoding ran first, which turned the amounts into ordinal codes and made
        the numeric conversion unreachable.

        Returns the 4-tuple from ``train_test_split`` (80/20, stratified on the
        target, random_state=42): X_train, X_test, y_train, y_test.
        """
        df_bal = shuffle(df_bal, random_state=42).reset_index(drop=True)
        y = (df_bal['Churn'] == 'Yes').astype(int) if df_bal['Churn'].dtype == 'object' else df_bal['Churn']
        X = df_bal.drop(['Churn', 'customerID'], axis=1, errors='ignore')
        X_encoded = X.copy()
        _total_charges_to_numeric(X_encoded)
        for col in X_encoded.columns:
            if X_encoded[col].dtype == 'object':
                X_encoded[col] = LabelEncoder().fit_transform(X_encoded[col])
        return train_test_split(X_encoded, y, test_size=0.2, random_state=42, stratify=y)

    # 9 results: the "Direct" models on each dataset
    results_balanced = []
    for ds_name, ds in datasets:
        X_train, X_test, y_train, y_test = prepare_balanced_data(ds)
        lr = LogisticRegression(random_state=42, max_iter=1000)
        lr.fit(X_train, y_train)
        results_balanced.append(evaluate_model(lr, X_test, y_test, f"Logistic Regression (Direct) ({ds_name})"))
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(X_train, y_train)
        results_balanced.append(evaluate_model(rf, X_test, y_test, f"Random Forest (Direct) ({ds_name})"))
        voting = VotingClassifier([
            ('lr', LogisticRegression(random_state=42, max_iter=1000)),
            ('svm', SVC(kernel='rbf', random_state=42, probability=True)),
            ('knn', KNeighborsClassifier(n_neighbors=5))
        ], voting='soft')
        voting.fit(X_train, y_train)
        results_balanced.append(evaluate_model(voting, X_test, y_test, f"Voting Classifier (Direct) ({ds_name})"))

    all_results_balanced = pd.DataFrame(results_balanced)

    # 18 results: "Preprocessing" and "Tuned" pipelines on each dataset
    results_all = []
    for ds_name, ds in datasets:
        df_prep = ds.copy()
        y = (df_prep['Churn'] == 'Yes').astype(int) if df_prep['Churn'].dtype == 'object' else df_prep['Churn']
        X = df_prep.drop(['Churn', 'customerID'], axis=1, errors='ignore')
        # Column lists are taken before the conversion so that a converted
        # TotalCharges is appended last among the numeric columns, as before.
        cat_cols = X.select_dtypes(include=['object']).columns.tolist()
        num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
        if _total_charges_to_numeric(X):
            if 'TotalCharges' not in num_cols:
                num_cols.append('TotalCharges')
            if 'TotalCharges' in cat_cols:
                cat_cols.remove('TotalCharges')
        preprocessor = ColumnTransformer([
            ('num', StandardScaler(), num_cols),
            ('cat', _one_hot_encoder(), cat_cols)
        ])
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

        lr_pipe = make_pipeline(preprocessor, LogisticRegression(random_state=42, max_iter=1000))
        lr_pipe.fit(X_train, y_train)
        results_all.append(evaluate_model(lr_pipe, X_test, y_test, f"Logistic Regression (Preprocessing) ({ds_name})"))

        rf_pipe = make_pipeline(preprocessor, RandomForestClassifier(n_estimators=100, random_state=42))
        rf_pipe.fit(X_train, y_train)
        results_all.append(evaluate_model(rf_pipe, X_test, y_test, f"Random Forest (Preprocessing) ({ds_name})"))

        voting_pipe = make_pipeline(preprocessor, VotingClassifier([
            ('lr', LogisticRegression(random_state=42, max_iter=1000)),
            ('svm', SVC(kernel='rbf', random_state=42, probability=True)),
            ('knn', KNeighborsClassifier(n_neighbors=5))
        ], voting='soft'))
        voting_pipe.fit(X_train, y_train)
        results_all.append(evaluate_model(voting_pipe, X_test, y_test, f"Voting Classifier (Preprocessing) ({ds_name})"))

        # Tuned variants (fixed "best" parameters instead of a search, for speed)
        lr_tuned = LogisticRegression(C=1, random_state=42, max_iter=1000)
        lr_tuned_pipe = make_pipeline(preprocessor, lr_tuned)
        lr_tuned_pipe.fit(X_train, y_train)
        results_all.append(evaluate_model(lr_tuned_pipe, X_test, y_test, f"Logistic Regression (Tuned) ({ds_name})"))

        rf_tuned = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
        rf_tuned_pipe = make_pipeline(preprocessor, rf_tuned)
        rf_tuned_pipe.fit(X_train, y_train)
        results_all.append(evaluate_model(rf_tuned_pipe, X_test, y_test, f"Random Forest (Tuned) ({ds_name})"))

        voting_tuned = VotingClassifier([
            ('lr', LogisticRegression(C=1, random_state=42, max_iter=1000)),
            ('svm', SVC(kernel='rbf', C=1, gamma='scale', random_state=42, probability=True)),
            ('knn', KNeighborsClassifier(n_neighbors=7))
        ], voting='soft')
        voting_tuned_pipe = make_pipeline(preprocessor, voting_tuned)
        voting_tuned_pipe.fit(X_train, y_train)
        results_all.append(evaluate_model(voting_tuned_pipe, X_test, y_test, f"Voting Classifier (Tuned) ({ds_name})"))

    all_results = pd.concat([all_results_balanced, pd.DataFrame(results_all)], ignore_index=True)
    # Keep a copy under all_results_full so other cells can use it
    all_results_full = all_results.copy()
    print("‚úì Selesai menghitung ulang semua 27 kombinasi.")

# Rank every evaluated configuration, highest F1-Score first
all_results_sorted = all_results.sort_values('F1-Score', ascending=False)

print("\nüìä RANKING SEMUA MODEL:")
print(all_results_sorted[['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score']])

# The winner is looked up on the unsorted table via the index label of the maximum F1-Score
best_model_idx = all_results['F1-Score'].idxmax()
best_model_info = all_results.loc[best_model_idx]

# Metrics and their bar colours, shared by the report and the charts below
metrics_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
colors_plot = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']

print("\n" + "=" * 80)
print("üèÜ BEST MODEL")
print("=" * 80)
print(f"Model: {best_model_info['Model']}")
for metric in metrics_plot:
    # Labels are padded to a common width so the scores line up in one column
    label = f"{metric}:"
    print(f"{label:<11}{best_model_info[metric]:.4f}")

# Grouped bar chart: four bars (one per metric) for every model
fig, ax = plt.subplots(figsize=(18, 10))

width = 0.2
x = np.arange(len(all_results_sorted))

for i, metric in enumerate(metrics_plot):
    ax.bar(
        x + i * width,
        all_results_sorted[metric],
        width,
        label=metric,
        color=colors_plot[i],
    )

ax.set_title('Perbandingan Performa Semua 27 Model', fontweight='bold', fontsize=14)
ax.set_xlabel('Model', fontweight='bold')
ax.set_ylabel('Score', fontweight='bold')
# Centre each tick under its group of four bars
ax.set_xticks(x + width * 1.5)
ax.set_xticklabels(all_results_sorted['Model'], rotation=90, ha='right', fontsize=8)
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Heatmap of the same scores, one row per model in ranking order
fig, ax = plt.subplots(figsize=(12, 16))
comparison_data = all_results_sorted.set_index('Model')[metrics_plot]
sns.heatmap(
    comparison_data,
    annot=True,
    fmt='.3f',
    cmap='RdYlGn',
    vmin=0,
    vmax=1,
    ax=ax,
    cbar_kws={'label': 'Score'},
)
ax.set_title('Heatmap Performa Semua Model', fontweight='bold', fontsize=14)
plt.tight_layout()
plt.show()


In [None]:
# ===========================
# F.1 SAVE BEST MODEL
# ===========================

import pickle
import joblib
from datetime import datetime
from sklearn.utils import shuffle  # the Direct evaluation shuffles before splitting

print("\n" + "=" * 80)
print("SAVING BEST MODEL FOR DEPLOYMENT")
print("=" * 80)

# Pick the configuration with the highest F1-Score in all_results_full
best_idx = all_results_full['F1-Score'].idxmax()
best_model_info = all_results_full.loc[best_idx]
best_model_name = best_model_info['Model']

print(f"\nüèÜ Model terbaik berdasarkan F1-Score: {best_model_name}")
print(f"   F1-Score: {best_model_info['F1-Score']:.4f}")
print(f"   Accuracy: {best_model_info['Accuracy']:.4f}")
print(f"   Precision: {best_model_info['Precision']:.4f}")
print(f"   Recall: {best_model_info['Recall']:.4f}")

# Model names follow "Model Name (Type) (Dataset)", e.g. "Random Forest (Tuned) (Normal)".
# Each part is recovered by substring match; the first candidate found wins.
model_type = next(
    (name for name in ("Logistic Regression", "Random Forest", "Voting Classifier")
     if name in best_model_name),
    "Unknown",
)
processing_type = next(
    (name for name in ("Direct", "Preprocessing", "Tuned")
     if f"({name})" in best_model_name),
    "Unknown",
)
dataset_type = next(
    (name for name in ("Normal", "Undersampling", "Oversampling")
     if f"({name})" in best_model_name),
    "Normal",  # names without a dataset tag are treated as the normal dataset
)

print(f"\nüìã Konfigurasi Model:")
print(f"   Model: {model_type}")
print(f"   Processing: {processing_type}")
print(f"   Dataset: {dataset_type}")

# Fail early with a clear message; previously an unrecognized name only surfaced
# later as a NameError on best_model_final.
if model_type == "Unknown":
    raise ValueError(
        f"Cannot rebuild the best model: no known model type in name {best_model_name!r}"
    )

# Select the matching dataset (names are looked up lazily, only the one needed)
if dataset_type == "Undersampling":
    df_selected = df_undersample
elif dataset_type == "Oversampling":
    df_selected = df_oversample
else:
    df_selected = df_normal

# Retrain the best configuration
print(f"\nüîß Retraining model dengan konfigurasi terbaik...")

if processing_type == "Direct":
    # Direct: label encoding only. The frame is shuffled exactly like
    # prepare_balanced_data does, so the 80/20 split matches the evaluated one.
    df_train = shuffle(df_selected, random_state=42).reset_index(drop=True)
    y = (df_train['Churn'] == 'Yes').astype(int) if df_train['Churn'].dtype == 'object' else df_train['Churn']
    X = df_train.drop(['Churn', 'customerID'], axis=1, errors='ignore')
    X_encoded = X.copy()
    # Parse TotalCharges before label encoding; the encoder would otherwise turn the
    # amounts into ordinal codes and the numeric conversion would never run.
    if 'TotalCharges' in X_encoded.columns and X_encoded['TotalCharges'].dtype == 'object':
        total_charges = pd.to_numeric(X_encoded['TotalCharges'], errors='coerce')
        X_encoded['TotalCharges'] = total_charges.fillna(total_charges.median())
    # NOTE(review): the fitted LabelEncoders are not persisted, so a consumer of the
    # saved model has to reproduce this encoding on its own -- confirm that it does.
    for col in X_encoded.columns:
        if X_encoded[col].dtype == 'object':
            X_encoded[col] = LabelEncoder().fit_transform(X_encoded[col])
    X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
        X_encoded, y, test_size=0.2, random_state=42, stratify=y
    )
    best_scaler = None
    feature_names_final = X_train_final.columns.tolist()

else:
    # Preprocessing or Tuned: scaling + one-hot encoding through a ColumnTransformer
    df_train = df_selected.copy()
    y = (df_train['Churn'] == 'Yes').astype(int) if df_train['Churn'].dtype == 'object' else df_train['Churn']
    X = df_train.drop(['Churn', 'customerID'], axis=1, errors='ignore')

    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

    if 'TotalCharges' in X.columns and X['TotalCharges'].dtype == 'object':
        # Assign the filled series back: fillna(inplace=True) on a column selection
        # does not update the frame when pandas Copy-on-Write is active.
        total_charges = pd.to_numeric(X['TotalCharges'], errors='coerce')
        X['TotalCharges'] = total_charges.fillna(total_charges.median())
        if 'TotalCharges' not in num_cols:
            num_cols.append('TotalCharges')
        if 'TotalCharges' in cat_cols:
            cat_cols.remove('TotalCharges')

    # scikit-learn 1.2 renamed `sparse` to `sparse_output` (old keyword removed in 1.4)
    try:
        one_hot = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
    except TypeError:
        one_hot = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')

    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), num_cols),
        ('cat', one_hot, cat_cols)
    ])

    X_train_split, X_test_split, y_train_final, y_test_final = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit the preprocessor on the training part only, then apply it to the test part
    X_train_final = preprocessor.fit_transform(X_train_split)
    X_test_final = preprocessor.transform(X_test_split)

    best_scaler = preprocessor
    # Feature names after preprocessing: numeric columns first, then one-hot columns
    num_feature_names = num_cols
    cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(cat_cols).tolist()
    feature_names_final = num_feature_names + cat_feature_names

# Build the final estimator with the same hyperparameters used during evaluation
is_tuned = processing_type == "Tuned"
if model_type == "Logistic Regression":
    if is_tuned:
        best_model_final = LogisticRegression(C=1, random_state=42, max_iter=1000)
    else:
        best_model_final = LogisticRegression(random_state=42, max_iter=1000)
elif model_type == "Random Forest":
    if is_tuned:
        # 200 trees, matching the tuned Random Forest that produced the reported scores
        # (this cell previously retrained with only 100).
        best_model_final = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
    else:
        best_model_final = RandomForestClassifier(n_estimators=100, random_state=42)
else:  # "Voting Classifier"; "Unknown" was rejected above
    if is_tuned:
        best_model_final = VotingClassifier([
            ('lr', LogisticRegression(C=1, random_state=42, max_iter=1000)),
            ('svm', SVC(kernel='rbf', C=1, gamma='scale', random_state=42, probability=True)),
            ('knn', KNeighborsClassifier(n_neighbors=7))
        ], voting='soft')
    else:
        best_model_final = VotingClassifier([
            ('lr', LogisticRegression(random_state=42, max_iter=1000)),
            ('svm', SVC(kernel='rbf', random_state=42, probability=True)),
            ('knn', KNeighborsClassifier(n_neighbors=5))
        ], voting='soft')

best_model_final.fit(X_train_final, y_train_final)
print(f"   ‚úì Model retrained successfully")

# Verify the retrained model on the held-out part
y_pred_final = best_model_final.predict(X_test_final)
final_f1 = f1_score(y_test_final, y_pred_final)
print(f"   ‚úì Verification F1-Score: {final_f1:.4f}")

# Files actually written by this run, in the order they are listed at the end
saved_files = []

# Save model
model_path = "best_churn_model.pkl"
joblib.dump(best_model_final, model_path)
saved_files.append(model_path)
print(f"\n‚úì Model saved: {model_path}")

# Save scaler/preprocessor (only the pipeline-based configurations have one)
scaler_path = "scaler.pkl"
if best_scaler is not None:
    joblib.dump(best_scaler, scaler_path)
    saved_files.append(scaler_path)
    print(f"‚úì Preprocessor saved: {scaler_path}")
else:
    print(f"‚úì No preprocessor needed (Direct encoding)")

# Save feature names
feature_names_path = "feature_names.pkl"
joblib.dump(feature_names_final, feature_names_path)
saved_files.append(feature_names_path)
print(f"‚úì Feature names saved: {feature_names_path}")

# Save model metadata. The first four metrics come from the evaluation table;
# 'verified_f1_score' is the F1-Score of the model that was actually saved.
metadata = {
    'model_name': best_model_name,
    'model_type': model_type,
    'processing_type': processing_type,
    'dataset_type': dataset_type,
    'f1_score': best_model_info['F1-Score'],
    'accuracy': best_model_info['Accuracy'],
    'precision': best_model_info['Precision'],
    'recall': best_model_info['Recall'],
    'verified_f1_score': float(final_f1),
    'trained_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
}
metadata_path = "model_metadata.pkl"
joblib.dump(metadata, metadata_path)
saved_files.append(metadata_path)
print(f"‚úì Metadata saved: {metadata_path}")

print("\n" + "=" * 80)
print("‚úÖ DEPLOYMENT FILES READY!")
print("=" * 80)
print(f"\nBest Model Configuration:")
print(f"  üìå {best_model_name}")
print(f"  üìä F1-Score: {best_model_info['F1-Score']:.4f}")
print(f"\nFiles untuk Streamlit deployment:")
# Only files written by this run are listed, so a leftover scaler.pkl from an
# earlier run is not advertised next to a Direct model.
for position, file_path in enumerate(saved_files, start=1):
    print(f"  {position}. {file_path}")
