In [None]:
# ===========================
# 1. IMPORT LIBRARIES
# ===========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Model Klasifikasi
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC

# Evaluasi Model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

import warnings
# Silence every library warning so the cell output stays readable.
warnings.filterwarnings(action='ignore')

# Plot defaults shared by all figures in this notebook.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("‚úì Library import successful!")


In [None]:
# ===========================
# A.1 LOAD DATA AND INITIAL EXPLORATION
# ===========================

# Download the Telco customer-churn dataset from Kaggle; `path` is the local
# directory returned by kagglehub.
import kagglehub
path = kagglehub.dataset_download("blastchar/telco-customer-churn")
csv_path = f"{path}/WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Read the CSV into `df`, the working frame used by the rest of the notebook.
df = pd.read_csv(csv_path)

print("=" * 80)
print("EKSPLORASI AWAL DATASET")
print("=" * 80)

# 1. First five rows
print("\n1Ô∏è‚É£ Lima Baris Pertama Dataset:")
print(df.head())

# 2. Column dtypes and non-null counts.
#    DataFrame.info() writes its report itself and returns None, so it is
#    called directly; wrapping it in print() would emit a stray "None".
print("\n2Ô∏è‚É£ Informasi Dataset:")
df.info()

# 3. Descriptive statistics of the numeric columns
print("\n3Ô∏è‚É£ Statistik Deskriptif:")
print(df.describe())

print(f"\nüìä Dimensi Dataset: {df.shape[0]} baris, {df.shape[1]} kolom")


In [None]:
# ===========================
# A.2 IDENTIFY MISSING VALUES
# ===========================

print("\n" + "=" * 80)
print("IDENTIFIKASI MISSING VALUE")
print("=" * 80)

# A cell counts as missing when it is NaN/None OR a blank / whitespace-only
# string. A plain isnull() check does not see the latter, yet later cells have
# to coerce TotalCharges with errors='coerce', i.e. text columns can carry
# entries that are effectively missing.
is_missing = df.isna() | df.astype(str).apply(lambda col: col.str.strip().eq(''))

# Computed once and reused for both the table and the chart.
missing_counts = is_missing.sum()

missing_data = pd.DataFrame({
    'Kolom': missing_counts.index,
    'Missing_Count': missing_counts.values,
    'Missing_Percentage': (missing_counts.values / len(df) * 100).round(2)
})
missing_data = missing_data[missing_data['Missing_Count'] > 0].sort_values('Missing_Percentage', ascending=False)

if len(missing_data) == 0:
    print("\n‚úì Tidak ada missing value dalam dataset!")
else:
    print("\nMissing Value Ditemukan:")
    print(missing_data)

    # Horizontal bar chart of the affected columns, largest count first.
    fig, ax = plt.subplots(figsize=(10, 6))
    missing_data_plot = missing_counts[missing_counts > 0].sort_values(ascending=False)
    missing_data_plot.plot(kind='barh', ax=ax, color='coral')
    ax.set_xlabel('Jumlah Missing Value')
    ax.set_title('Distribusi Missing Value per Kolom')
    plt.tight_layout()
    plt.show()


In [None]:
# ===========================
# A.2B IDENTIFY AND REMOVE DUPLICATE ROWS
# ===========================

print("\n" + "=" * 80)
print("IDENTIFIKASI DAN HAPUS DUPLICATE DATA")
print("=" * 80)

# Rows that repeat an earlier row in every column.
duplicate_all = df.duplicated().sum()
print(f"\n1Ô∏è‚É£ Duplikat (semua kolom): {duplicate_all} baris")

# customerID is treated as the unique key. keep=False flags every row that
# shares an ID with another row, not only the later occurrences.
has_customer_id = 'customerID' in df.columns
if has_customer_id:
    repeated_id_mask = df.duplicated(subset=['customerID'], keep=False)
    duplicate_id = repeated_id_mask.sum()
    print(f"2Ô∏è‚É£ Duplikat berdasarkan customerID: {duplicate_id} baris")

    if duplicate_id > 0:
        print("\n   Menampilkan duplikat customerID:")
        dup_customers = df[repeated_id_mask].sort_values('customerID')
        print(dup_customers[['customerID', 'tenure', 'MonthlyCharges']].head(10))

# Drop exact duplicates and report how many rows disappeared.
print("\n3Ô∏è‚É£ Menghapus duplikat...")
df_before = df.shape[0]
df = df.drop_duplicates()
df_after = df.shape[0]
rows_removed = df_before - df_after

print(f"   Baris sebelum: {df_before}")
print(f"   Baris sesudah: {df_after}")
print(f"   ‚úì Duplikat yang dihapus: {rows_removed} baris")

# Rows that still share a customerID keep only their first occurrence.
if has_customer_id:
    df = df.drop_duplicates(subset=['customerID'], keep='first')
    print(f"   ‚úì Dataset setelah drop duplikat customerID: {len(df)} baris")

print(f"\n‚úì Dataset cleaning complete! Shape: {df.shape}")


In [None]:
# ===========================
# A.3 VISUALISE THE TARGET DISTRIBUTION (CHURN)
# ===========================

print("\n" + "=" * 80)
print("ANALISIS VARIABEL TARGET (CHURN)")
print("=" * 80)

# value_counts() orders classes by frequency, so the class order is pinned
# explicitly. Bars, tick labels, colours and pie slices then always refer to
# the same class, whichever one happens to be the majority.
churn_order = ['No', 'Yes']
churn_counts = df['Churn'].value_counts().reindex(churn_order, fill_value=0)
churn_percentage = (df['Churn'].value_counts(normalize=True) * 100).reindex(churn_order, fill_value=0.0)

print("\nDistribusi Target Churn:")
print(f"  No:  {churn_counts['No']} ({churn_percentage['No']:.2f}%)")
print(f"  Yes: {churn_counts['Yes']} ({churn_percentage['Yes']:.2f}%)")

# Side-by-side count bar chart and proportion pie chart.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot (green = "No", red = "Yes", matching churn_order)
churn_counts.plot(kind='bar', ax=axes[0], color=['green', 'red'], alpha=0.7)
axes[0].set_title('Distribusi Churn (Count)', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Jumlah Pelanggan')
axes[0].set_xlabel('Churn Status')
# Labels come from the series itself instead of a hard-coded list.
axes[0].set_xticklabels(churn_counts.index, rotation=0)

# Pie chart
colors = ['#2ecc71', '#e74c3c']
axes[1].pie(churn_counts.values, labels=churn_counts.index, autopct='%1.1f%%',
            colors=colors, startangle=90)
axes[1].set_title('Proporsi Churn', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

# Class-imbalance check: ratio of churned to retained customers.
# Guard against a dataset with no "No" rows instead of dividing by zero.
imbalance_ratio = churn_counts['Yes'] / churn_counts['No'] if churn_counts['No'] else float('inf')
print(f"\n‚ö†Ô∏è Class Imbalance Ratio: {imbalance_ratio:.3f}")
if imbalance_ratio < 0.3:
    print("   Status: Dataset memiliki imbalance yang signifikan")
else:
    print("   Status: Dataset cukup seimbang")


In [None]:
# ===========================
# A.4 CORRELATION ANALYSIS
# ===========================

print("\n" + "=" * 80)
print("ANALISIS KORELASI FITUR NUMERIK")
print("=" * 80)

# Keep only the columns pandas already stores with a numeric dtype.
# NOTE(review): a later cell coerces TotalCharges from text, so it is
# presumably read as a string column and left out here - confirm whether it
# should be part of this analysis.
numeric_frame = df.select_dtypes(include=[np.number])
numeric_features = list(numeric_frame.columns)
print(f"\nFitur Numerik yang Ditemukan: {numeric_features}")

# Pairwise correlation between the numeric features.
correlation_matrix = numeric_frame.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(10, 8))
heatmap_options = dict(annot=True, fmt='.2f', cmap='coolwarm', center=0,
                       square=True, linewidths=1, cbar_kws={"shrink": 0.8})
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Heatmap Korelasi Fitur Numerik', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Correlation of each numeric feature with the target, which is encoded as
# 1 for "Yes" and 0 otherwise on a copy so `df` itself stays untouched.
if 'Churn' in df.columns:
    df_temp = df.assign(Churn_encoded=(df['Churn'] == 'Yes').astype(int))

    with_target = df_temp[[*numeric_features, 'Churn_encoded']].corr()
    target_correlation = with_target['Churn_encoded'].sort_values(ascending=False)
    print("\nKorelasi dengan Churn (Target):")
    print(target_correlation)


In [None]:
# ===========================
# B.1 DATA PREPARATION FOR DIRECT MODELING
# ===========================

print("\n" + "=" * 80)
print("DIRECT MODELING - TANPA PREPROCESSING")
print("=" * 80)

# Work on a copy so the cleaned frame `df` is left untouched.
df_direct = df.copy()

# Encode the target: 1 for "Yes", 0 otherwise.
df_direct['Churn'] = (df_direct['Churn'] == 'Yes').astype(int)

# Split features and target; customerID is an identifier, not a feature.
y = df_direct['Churn']
X = df_direct.drop(['Churn', 'customerID'], axis=1)

X_encoded = X.copy()

# TotalCharges has to be parsed BEFORE the label-encoding step below.
# If it is still text when that step runs, every distinct amount is turned
# into an arbitrary category code and the numeric meaning is lost.
# Unparsable entries become NaN and are filled with the column median.
# The result is assigned back rather than using fillna(inplace=True) on a
# column selection, which is chained assignment.
total_charges = pd.to_numeric(X_encoded['TotalCharges'], errors='coerce')
X_encoded['TotalCharges'] = total_charges.fillna(total_charges.median())

# Simple integer encoding (no scaling) for every remaining non-numeric column.
for col in X_encoded.select_dtypes(exclude='number').columns:
    X_encoded[col] = LabelEncoder().fit_transform(X_encoded[col])

# Stratified 80/20 train-test split.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\n‚úì Data siap untuk modeling")
print(f"  Training set: {X_train.shape}")
print(f"  Testing set: {X_test.shape}")
print(f"  Features: {X_train.shape[1]}")


In [None]:
# ===========================
# B.2 BUILD & TRAIN MODELS (DIRECT)
# ===========================

print("\n" + "=" * 80)
print("TRAINING 3 MODEL CATEGORIES (DIRECT - NO PREPROCESSING)")
print("=" * 80)

# Category 1 - conventional model: logistic regression.
print("\n1Ô∏è‚É£ KONVENSIONAL MODEL: Logistic Regression")
lr_direct = LogisticRegression(max_iter=1000, random_state=42)
lr_direct = lr_direct.fit(X_train, y_train)  # fit() returns the estimator itself
print("   ‚úì Model trained")

# Category 2 - bagging ensemble: random forest with 100 trees.
print("\n2Ô∏è‚É£ ENSEMBLE BAGGING: Random Forest")
rf_direct = RandomForestClassifier(random_state=42, n_estimators=100)
rf_direct = rf_direct.fit(X_train, y_train)
print("   ‚úì Model trained")

# Category 3 - voting ensemble: soft vote over LR, an RBF SVM and 5-NN.
# probability=True lets the SVM take part in the soft vote.
print("\n3Ô∏è‚É£ ENSEMBLE VOTING: Kombinasi LR, SVM, KNN")
lr_vote = LogisticRegression(max_iter=1000, random_state=42)
svm_vote = SVC(kernel='rbf', probability=True, random_state=42)
knn_vote = KNeighborsClassifier(n_neighbors=5)
voting_members = [('lr', lr_vote), ('svm', svm_vote), ('knn', knn_vote)]

voting_direct = VotingClassifier(estimators=voting_members, voting='soft')
voting_direct = voting_direct.fit(X_train, y_train)
print("   ‚úì Model trained")

print("\n‚úì Semua model berhasil ditraining!")


In [None]:
# ===========================
# B.3 EVALUASI MODEL (DIRECT)
# ===========================

def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted classifier on a held-out set and print a short report.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True labels compared against the predictions.
        model_name: Label printed above the report and stored under ``'Model'``.

    Returns:
        dict with keys ``'Model'``, ``'Accuracy'``, ``'Precision'``,
        ``'Recall'``, ``'F1-Score'`` and ``'Confusion_Matrix'``, in that order.
    """
    predictions = model.predict(X_test)

    # The metric functions share the (y_true, y_pred) signature, so they are
    # evaluated in one pass; the tuple order fixes the key order of the result.
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )
    scores = {label: scorer(y_test, predictions) for label, scorer in scorers}
    cm = confusion_matrix(y_test, predictions)

    print(f"\n{model_name}")
    print(f"  Accuracy:  {scores['Accuracy']:.4f}")
    print(f"  Precision: {scores['Precision']:.4f}")
    print(f"  Recall:    {scores['Recall']:.4f}")
    print(f"  F1-Score:  {scores['F1-Score']:.4f}")
    print(f"  Confusion Matrix:\n{cm}")

    return {'Model': model_name, **scores, 'Confusion_Matrix': cm}

print("\n" + "=" * 80)
print("HASIL EVALUASI - DIRECT MODELING")
print("=" * 80)

# Evaluate the three direct models in a fixed order; each call prints its own
# report and returns one result record.
direct_models = [
    (lr_direct, "Logistic Regression (Direct)"),
    (rf_direct, "Random Forest (Direct)"),
    (voting_direct, "Voting Classifier (Direct)"),
]
results_direct = [
    evaluate_model(fitted, X_test, y_test, label) for fitted, label in direct_models
]

# Collect the records in a frame and print the scalar metrics.
df_results_direct = pd.DataFrame(results_direct)
print("\n\nüìä RINGKASAN HASIL DIRECT MODELING:")
print(df_results_direct[['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score']])

# One panel per metric on a 2x2 grid, filled row by row.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
bar_colors = ['#3498db', '#e74c3c', '#2ecc71']
for ax, metric in zip(axes.flat, metrics):
    scores = df_results_direct[metric]
    ax.bar(df_results_direct['Model'], scores, color=bar_colors)
    ax.set_title(f'{metric} - Direct Modeling', fontweight='bold')
    ax.set_ylim([0, 1])
    ax.tick_params(axis='x', rotation=45)
    # Write each score just above its bar.
    for position, score in enumerate(scores):
        ax.text(position, score + 0.02, f'{score:.3f}', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()


## C. MODELING DENGAN PREPROCESSING

In [None]:
# ===========================
# C.1 DATA PREPROCESSING
# ===========================
# The train/test split happens BEFORE anything is fitted. The imputation
# median and the StandardScaler statistics are learned from the training rows
# only and then applied to the test rows, so no test-set information leaks
# into the models.

print("\n" + "=" * 80)
print("PREPROCESSING DATA")
print("=" * 80)

df_prep = df.copy()

# 1. TotalCharges -> numeric. Unparsable entries become NaN here and are
#    imputed after the split (step 7).
print("\n1Ô∏è‚É£ Handling TotalCharges...")
df_prep['TotalCharges'] = pd.to_numeric(df_prep['TotalCharges'], errors='coerce')
print("   ‚úì TotalCharges converted to numeric")

# 2. Drop columns that carry no predictive information.
print("\n2Ô∏è‚É£ Dropping irrelevant columns...")
df_prep = df_prep.drop(['customerID'], axis=1)
print("   ‚úì customerID dropped")

# 3. Encode the target: 1 for "Yes", 0 otherwise.
print("\n3Ô∏è‚É£ Encoding target variable...")
df_prep['Churn'] = (df_prep['Churn'] == 'Yes').astype(int)
print("   ‚úì Churn encoded")

# 4. Separate numeric and categorical FEATURES. The target is numeric by now,
#    so it is excluded explicitly.
print("\n4Ô∏è‚É£ Separating numeric and categorical features...")
numeric_cols = [
    col for col in df_prep.select_dtypes(include='number').columns if col != 'Churn'
]
categorical_cols = df_prep.select_dtypes(exclude='number').columns.tolist()

print(f"   Numeric columns: {numeric_cols}")
print(f"   Categorical columns: {categorical_cols}")

# 5. One-hot encode the categorical columns, dropping the first level of each.
print("\n5Ô∏è‚É£ Applying One-Hot Encoding...")
df_prep_encoded = pd.get_dummies(df_prep, columns=categorical_cols, drop_first=True)
print(f"   ‚úì Features after encoding: {df_prep_encoded.shape[1]}")

# 6. Split features and target, then hold out a stratified 20% test set.
y_prep = df_prep_encoded['Churn']
X_prep = df_prep_encoded.drop(['Churn'], axis=1)

X_train_prep, X_test_prep, y_train_prep, y_test_prep = train_test_split(
    X_prep, y_prep, test_size=0.2, random_state=42, stratify=y_prep
)

# 7. Impute TotalCharges with the TRAINING median. fillna returns a new frame;
#    the result is assigned back instead of using inplace=True on a column
#    selection (chained assignment).
total_charges_median = X_train_prep['TotalCharges'].median()
X_train_prep = X_train_prep.fillna({'TotalCharges': total_charges_median})
X_test_prep = X_test_prep.fillna({'TotalCharges': total_charges_median})
X_prep = X_prep.fillna({'TotalCharges': total_charges_median})

# 8. Scaling: fit on the training rows only, then transform both partitions.
print("\n6Ô∏è‚É£ Feature Scaling (StandardScaler)...")
scaler = StandardScaler()
X_train_prep = pd.DataFrame(
    scaler.fit_transform(X_train_prep), columns=X_prep.columns, index=X_train_prep.index
)
X_test_prep = pd.DataFrame(
    scaler.transform(X_test_prep), columns=X_prep.columns, index=X_test_prep.index
)
# Full scaled frame, kept because a later cell reads its column names.
X_prep_scaled = pd.DataFrame(
    scaler.transform(X_prep), columns=X_prep.columns, index=X_prep.index
)
print("   ‚úì Features scaled")

print(f"\n‚úì Preprocessing complete!")
print(f"  Training set: {X_train_prep.shape}")
print(f"  Testing set: {X_test_prep.shape}")
print(f"  Total features: {X_train_prep.shape[1]}")


In [None]:
# ===========================
# C.2 BUILD & TRAIN MODELS (WITH PREPROCESSING)
# ===========================

print("\n" + "=" * 80)
print("TRAINING MODELS WITH PREPROCESSING")
print("=" * 80)

# Model 1: logistic regression on the preprocessed training features.
print("\n1Ô∏è‚É£ Logistic Regression (with preprocessing)")
lr_prep = LogisticRegression(max_iter=1000, random_state=42)
lr_prep = lr_prep.fit(X_train_prep, y_train_prep)  # fit() returns the estimator itself
print("   ‚úì Model trained")

# Model 2: random forest with 100 trees.
print("\n2Ô∏è‚É£ Random Forest (with preprocessing)")
rf_prep = RandomForestClassifier(random_state=42, n_estimators=100)
rf_prep = rf_prep.fit(X_train_prep, y_train_prep)
print("   ‚úì Model trained")

# Model 3: soft-voting ensemble of LR, an RBF SVM and 5-NN.
# probability=True lets the SVM take part in the soft vote.
print("\n3Ô∏è‚É£ Voting Classifier (with preprocessing)")
prep_voting_members = [
    ('lr', LogisticRegression(max_iter=1000, random_state=42)),
    ('svm', SVC(kernel='rbf', probability=True, random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
]
voting_prep = VotingClassifier(estimators=prep_voting_members, voting='soft')
voting_prep = voting_prep.fit(X_train_prep, y_train_prep)
print("   ‚úì Model trained")

print("\n‚úì Semua model dengan preprocessing berhasil ditraining!")


In [None]:
# ===========================
# C.3 MODEL EVALUATION (WITH PREPROCESSING)
# ===========================

print("\n" + "=" * 80)
print("HASIL EVALUASI - WITH PREPROCESSING")
print("=" * 80)

# Evaluate the three preprocessed models in a fixed order; each call prints
# its own report and returns one result record.
prep_models = [
    (lr_prep, "Logistic Regression (Preprocessing)"),
    (rf_prep, "Random Forest (Preprocessing)"),
    (voting_prep, "Voting Classifier (Preprocessing)"),
]
results_prep = [
    evaluate_model(fitted, X_test_prep, y_test_prep, label) for fitted, label in prep_models
]

# Collect the records in a frame and print the scalar metrics.
df_results_prep = pd.DataFrame(results_prep)
print("\n\nüìä RINGKASAN HASIL WITH PREPROCESSING:")
print(df_results_prep[['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score']])

# One panel per metric on a 2x2 grid, filled row by row.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
bar_colors = ['#3498db', '#e74c3c', '#2ecc71']
for ax, metric in zip(axes.flat, metrics):
    scores = df_results_prep[metric]
    ax.bar(df_results_prep['Model'], scores, color=bar_colors)
    ax.set_title(f'{metric} - With Preprocessing', fontweight='bold')
    ax.set_ylim([0, 1])
    ax.tick_params(axis='x', rotation=45)
    # Write each score just above its bar.
    for position, score in enumerate(scores):
        ax.text(position, score + 0.02, f'{score:.3f}', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()


## D. HYPERPARAMETER TUNING

In [None]:
# ===========================
# D.1 HYPERPARAMETER TUNING
# ===========================
# Every search uses 5-fold cross-validation on the training split and
# optimises F1.

print("\n" + "=" * 80)
print("HYPERPARAMETER TUNING")
print("=" * 80)

# Model 1: Logistic Regression tuning
print("\n1Ô∏è‚É£ Tuning Logistic Regression...")
lr_params = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear']
}

lr_grid = GridSearchCV(LogisticRegression(random_state=42, max_iter=1000),
                       lr_params, cv=5, scoring='f1', n_jobs=-1)
lr_grid.fit(X_train_prep, y_train_prep)
lr_tuned = lr_grid.best_estimator_

print(f"   Best params: {lr_grid.best_params_}")
print(f"   Best CV score: {lr_grid.best_score_:.4f}")

# Model 2: Random Forest tuning
print("\n2Ô∏è‚É£ Tuning Random Forest...")
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_grid = GridSearchCV(RandomForestClassifier(random_state=42),
                       rf_params, cv=5, scoring='f1', n_jobs=-1)
rf_grid.fit(X_train_prep, y_train_prep)
rf_tuned = rf_grid.best_estimator_

print(f"   Best params: {rf_grid.best_params_}")
print(f"   Best CV score: {rf_grid.best_score_:.4f}")

# Model 3: Voting Classifier tuning.
# Each base estimator is tuned separately and the winners are combined.
# Previously the members were built from fixed (mostly default) values, so no
# tuning took place even though the output said so. Both grids contain the
# formerly hard-coded values (C=1, gamma='scale', n_neighbors=7).
print("\n3Ô∏è‚É£ Tuning Voting Classifier...")

# SVM member: searched with probability=False, because probability
# calibration multiplies the fit cost and F1 scoring only needs predict().
# It is switched on for the final member so the soft vote can use it.
svm_params = {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']}
svm_grid = GridSearchCV(SVC(kernel='rbf', random_state=42),
                        svm_params, cv=5, scoring='f1', n_jobs=-1)
svm_grid.fit(X_train_prep, y_train_prep)
print(f"   Best SVM params: {svm_grid.best_params_}")

# KNN member
knn_params = {'n_neighbors': [3, 5, 7, 9, 11, 15], 'weights': ['uniform', 'distance']}
knn_grid = GridSearchCV(KNeighborsClassifier(),
                        knn_params, cv=5, scoring='f1', n_jobs=-1)
knn_grid.fit(X_train_prep, y_train_prep)
print(f"   Best KNN params: {knn_grid.best_params_}")

# The LR member reuses the winner of the logistic-regression search above.
voting_tuned = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=42, max_iter=1000, **lr_grid.best_params_)),
        ('svm', SVC(kernel='rbf', random_state=42, probability=True, **svm_grid.best_params_)),
        ('knn', KNeighborsClassifier(**knn_grid.best_params_))
    ],
    voting='soft'
)
voting_tuned.fit(X_train_prep, y_train_prep)
print(f"   ‚úì Voting Classifier tuned")

print("\n‚úì Hyperparameter tuning complete!")


In [None]:
# ===========================
# D.2 MODEL EVALUATION AFTER TUNING
# ===========================

print("\n" + "=" * 80)
print("HASIL EVALUASI - AFTER HYPERPARAMETER TUNING")
print("=" * 80)

# Evaluate the three tuned models in a fixed order; each call prints its own
# report and returns one result record.
tuned_models = [
    (lr_tuned, "Logistic Regression (Tuned)"),
    (rf_tuned, "Random Forest (Tuned)"),
    (voting_tuned, "Voting Classifier (Tuned)"),
]
results_tuned = [
    evaluate_model(fitted, X_test_prep, y_test_prep, label) for fitted, label in tuned_models
]

# Collect the records in a frame and print the scalar metrics.
df_results_tuned = pd.DataFrame(results_tuned)
print("\n\nüìä RINGKASAN HASIL AFTER TUNING:")
print(df_results_tuned[['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score']])

# One panel per metric on a 2x2 grid, filled row by row.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
bar_colors = ['#9b59b6', '#f39c12', '#1abc9c']
for ax, metric in zip(axes.flat, metrics):
    scores = df_results_tuned[metric]
    ax.bar(df_results_tuned['Model'], scores, color=bar_colors)
    ax.set_title(f'{metric} - After Tuning', fontweight='bold')
    ax.set_ylim([0, 1])
    ax.tick_params(axis='x', rotation=45)
    # Write each score just above its bar.
    for position, score in enumerate(scores):
        ax.text(position, score + 0.02, f'{score:.3f}', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()


## E. PERBANDINGAN SEMUA SCENARIO

In [None]:
# ===========================
# E. COMPREHENSIVE COMPARISON
# ===========================

print("\n" + "=" * 80)
print("PERBANDINGAN SEMUA 9 MODEL (3 kategori √ó 3 scenario)")
print("=" * 80)

# Stack the result tables of the three scenarios into one frame.
scenario_tables = [df_results_direct, df_results_prep, df_results_tuned]
all_results = pd.concat(scenario_tables, ignore_index=True)

# Ranking: highest F1-score first.
all_results_sorted = all_results.sort_values(by='F1-Score', ascending=False)

print("\nüìä RANKING SEMUA MODEL:")
print(all_results_sorted[['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score']])

# The best model is the row with the largest F1-score.
best_model_idx = all_results['F1-Score'].idxmax()
best_model_info = all_results.loc[best_model_idx]

print("\n" + "=" * 80)
print("üèÜ BEST MODEL")
print("=" * 80)
print(f"Model: {best_model_info['Model']}")
print(f"Accuracy:  {best_model_info['Accuracy']:.4f}")
print(f"Precision: {best_model_info['Precision']:.4f}")
print(f"Recall:    {best_model_info['Recall']:.4f}")
print(f"F1-Score:  {best_model_info['F1-Score']:.4f}")

# Grouped bar chart: one group per model, one bar per metric.
fig, ax = plt.subplots(figsize=(14, 8))

x = np.arange(len(all_results_sorted))
width = 0.2

metrics_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
colors_plot = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']

for slot, (metric, bar_color) in enumerate(zip(metrics_plot, colors_plot)):
    # Each metric is shifted right by one bar width within the group.
    ax.bar(x + slot * width, all_results_sorted[metric], width, label=metric, color=bar_color)

ax.set_xlabel('Model', fontweight='bold')
ax.set_ylabel('Score', fontweight='bold')
ax.set_title('Perbandingan Performa Semua 9 Model', fontweight='bold', fontsize=14)
ax.set_xticks(x + width * 1.5)  # centre the tick under the four bars
ax.set_xticklabels(all_results_sorted['Model'], rotation=45, ha='right')
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Heatmap of the same scores: models as rows, metrics as columns.
fig, ax = plt.subplots(figsize=(10, 6))
score_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score']
comparison_data = all_results_sorted[score_columns].set_index('Model')
sns.heatmap(comparison_data, annot=True, fmt='.3f', cmap='RdYlGn', vmin=0, vmax=1, ax=ax, cbar_kws={'label': 'Score'})
ax.set_title('Heatmap Performa Semua Model', fontweight='bold', fontsize=14)
plt.tight_layout()
plt.show()


## F. DEPLOYMENT MODEL (Save & Streamlit Prep)

In [None]:
# ===========================
# F.1 SAVE BEST MODEL
# ===========================

import pickle
import joblib
from datetime import datetime

print("\n" + "=" * 80)
print("SAVING BEST MODEL FOR DEPLOYMENT")
print("=" * 80)

# The tuned Random Forest is the model exported for the Streamlit app.
# NOTE(review): this choice is hard-coded and not derived from the comparison
# cell (`best_model_info`) - confirm it matches the model that ranked best.
model_name_deploy = "Random Forest (Tuned)"
best_model_deploy = rf_tuned

print(f"\n‚úì Model untuk deployment: {model_name_deploy}")

# Output file names for the three artifacts.
model_path = "best_churn_model.pkl"
scaler_path = "scaler.pkl"
feature_names_path = "feature_names.pkl"

# Fitted model.
joblib.dump(best_model_deploy, model_path)
print(f"‚úì Model saved: {model_path}")

# Fitted scaler, so new inputs can be scaled the same way.
joblib.dump(scaler, scaler_path)
print(f"‚úì Scaler saved: {scaler_path}")

# Column names (and order) of the scaled feature matrix.
feature_names = list(X_prep_scaled.columns)
joblib.dump(feature_names, feature_names_path)
print(f"‚úì Feature names saved: {feature_names_path}")

print("\n‚úì Semua file untuk deployment sudah tersimpan!")
print("\nFile yang dibutuhkan untuk Streamlit:")
print(f"  1. {model_path}")
print(f"  2. {scaler_path}")
print(f"  3. {feature_names_path}")
print(f"  4. Dataset original (untuk reference encoding)")