# Import Libraries dan Setup Lingkungan

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, roc_auc_score, accuracy_score,
                            f1_score, confusion_matrix, recall_score, precision_score)
import joblib
import warnings
warnings.filterwarnings('ignore')

# Styling untuk visualisasi
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("Semua library berhasil diimport!")

Semua library berhasil diimport!



# Load Dataset & Create Churn Label (Target)


In [17]:
print("\n[1/6] Loading Dataset...")
df = pd.read_csv('/content/ac-01_telco_customer_behavior_mock_data.csv', sep=';')
print(f"‚úì Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

print("\n[2/6] Creating Churn Label (LOGIKA DENGAN NOISE)...")

df['churn'] = 0
avg_monthly_spend = df['monthly_spend'].mean()

# 1. Tentukan KANDIDAT Churn (Menggunakan Logika Anda)
churn_candidates = df.loc[(df['monthly_spend'] < avg_monthly_spend) &
                          ((df['sms_freq'] <= 1) | (df['avg_call_duration'] < 2))]

# 2. Hanya 80% dari kandidat ini yang dianggap Churn (memperkenalkan 'noise')
churn_index = churn_candidates.sample(frac=0.8, random_state=42).index
df.loc[churn_index, 'churn'] = 1

# 3. Churn tambahan acak (20% dari kandidat yang tersisa) untuk menambah ketidaksempurnaan
random_churn_index = churn_candidates.drop(churn_index).sample(frac=0.2, random_state=42).index
df.loc[random_churn_index, 'churn'] = 1


print(f"‚úì Churn label created (WITH NOISE)")
print(f"  - Non-Churn: {(df['churn'] == 0).sum()} ({(df['churn'] == 0).sum()/len(df)*100:.1f}%)")
print(f"  - Churn: {(df['churn'] == 1).sum()} ({(df['churn'] == 1).sum()/len(df)*100:.1f}%)")

leakage_features = []
print(f"\n‚úÖ TIDAK ADA leakage features untuk logika baru.")


[1/6] Loading Dataset...
‚úì Dataset loaded: 10000 rows, 12 columns

[2/6] Creating Churn Label (LOGIKA DENGAN NOISE)...
‚úì Churn label created (WITH NOISE)
  - Non-Churn: 9734 (97.3%)
  - Churn: 266 (2.7%)

‚úÖ TIDAK ADA leakage features untuk logika baru.


## DATA PREPROCESSING & FEATURE ENGINEERING

In [18]:
print("\n[3/6] Preprocessing Data (STRICT ANTI-LEAKAGE)...")

# Daftar kolom yang harus di-DROP untuk MENCEGAH LEAKAGE:
# 1. Identifier/Target: customer_id, target_offer, churn
# 2. Fitur yang Sangat Terkait dengan Logika Churn (Proksi Spend/Usage):
features_to_exclude = ['avg_data_usage_gb', 'topup_freq', 'avg_call_duration', 'sms_freq']

cols_to_drop = ['customer_id', 'target_offer', 'churn'] + features_to_exclude
X = df.drop(cols_to_drop, axis=1)
# ----------------------------------------------------------------------

# Definisikan features dan target
X = df.drop(cols_to_drop, axis=1)
y = df['churn']

print(f"‚úì Features for training (Strict): {X.columns.tolist()}")
print(f"  X shape: {X.shape}, y shape: {y.shape}")

# Encode categorical variables
X_encoded = X.copy()
label_encoders = {}
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col])
    label_encoders[col] = le

# Scale features
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_encoded),
    columns=X_encoded.columns,
    index=X_encoded.index
)

# Train-test split (stratified to maintain class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print(f"‚úì Data split completed")
print(f"  - Training set: {X_train.shape[0]} samples")
print(f"  - Test set: {X_test.shape[0]} samples")


[3/6] Preprocessing Data (STRICT ANTI-LEAKAGE)...
‚úì Features for training (Strict): ['plan_type', 'device_brand', 'pct_video_usage', 'monthly_spend', 'travel_score', 'complaint_count']
  X shape: (10000, 6), y shape: (10000,)
‚úì Data split completed
  - Training set: 8000 samples
  - Test set: 2000 samples


###Persistensi Objek Preprocessing (Joblib Dump)

In [19]:
import joblib

print("--- Menyimpan Objek Preprocessing untuk Backend ---")

# 1. Menyimpan Scaler (Objek StandardScaler)
joblib.dump(scaler, 'scaler.pkl')
print("‚úÖ Scaler object saved to scaler.pkl")

# 2. Menyimpan Label Encoders (Kamus yang berisi semua LabelEncoder)
joblib.dump(label_encoders, 'label_encoders.pkl')
print("‚úÖ Label Encoders saved to label_encoders.pkl")

--- Menyimpan Objek Preprocessing untuk Backend ---
‚úÖ Scaler object saved to scaler.pkl
‚úÖ Label Encoders saved to label_encoders.pkl


## MODEL TRAINING & PREDICTION GENERATION

In [20]:
print("\n[4/6] Training Random Forest Model...")

# üöÄ PERBAIKAN: class_weight='balanced'
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=3,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

# Train the model
rf_model.fit(X_train, y_train)
print("‚úì Model training completed")

# Generate churn risk scores for entire dataset
churn_risk_score = rf_model.predict_proba(X_scaled)[:, 1]
df['churn_risk'] = churn_risk_score

print(f"‚úì Churn risk scores generated")
print(f"  - Mean risk: {churn_risk_score.mean():.3f}")
print(f"  - Risk range: [{churn_risk_score.min():.3f}, {churn_risk_score.max():.3f}]")


[4/6] Training Random Forest Model...
‚úì Model training completed
‚úì Churn risk scores generated
  - Mean risk: 0.231
  - Risk range: [0.000, 0.784]


## 5. VALIDASI KINERJA & PENYESUAIAN THRESHOLD

In [21]:
print("\n[5/6] Evaluating Model Performance...")

# Dapatkan Probabilitas untuk Class 1 (Churn)
y_pred_proba_test = rf_model.predict_proba(X_test)[:, 1]

# üöÄ PERBAIKAN: SESUAIKAN THRESHOLD
threshold = 0.4 # Coba nilai yang lebih rendah dari 0.5 untuk meningkatkan Recall
# Terapkan Threshold Baru untuk mendapatkan prediksi biner
y_pred_test = (y_pred_proba_test >= threshold).astype(int)

# Calculate metrics (menggunakan y_pred_test yang baru)
accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test, average='weighted')
recall = recall_score(y_test, y_pred_test, average='weighted')
f1 = f1_score(y_test, y_pred_test, average='weighted')
auc_roc = roc_auc_score(y_test, y_pred_proba_test) # AUC-ROC tetap dihitung dari probabilitas

# Display metrics
print("\n" + "=" * 80)
print(f"MODEL PERFORMANCE METRICS (THRESHOLD {threshold:.2f})")
print("=" * 80)
print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)\n(Perhatian: Akurasi menurun karena Recall diprioritaskan)")
print(f"F1-Score:  {f1:.4f}")
print(f"AUC-ROC:   {auc_roc:.4f} (Kualitas Skor Probabilitas)")
print("=" * 80)

# Detailed classification report
print("\n" + "-" * 80)
print("CLASSIFICATION REPORT")
print("-" * 80)
report = classification_report(y_test, y_pred_test,
                               target_names=['No Churn (0)', 'Churn (1)'],
                               digits=4, output_dict=True)
print(classification_report(y_test, y_pred_test,
                          target_names=['No Churn (0)', 'Churn (1)'],
                          digits=4))

# üéØ FOKUS: RECALL KELAS CHURN (1)
recall_churn = report['Churn (1)']['recall']
print(f"\n‚úÖ RECALL KHUSUS KELAS CHURN (1): {recall_churn:.4f} (TARGET >= 0.70)")
print("-" * 80)

# Confusion Matrix (hanya perhitungan)
cm = confusion_matrix(y_test, y_pred_test)
print("\n" + "-" * 80)
print("CONFUSION MATRIX")
print("-" * 80)
print(f"True Negatives:  {cm[0,0]:>5d} | False Positives: {cm[0,1]:>5d}")
print(f"False Negatives: {cm[1,0]:>5d} | True Positives:  {cm[1,1]:>5d}")
print("-" * 80)


[5/6] Evaluating Model Performance...

MODEL PERFORMANCE METRICS (THRESHOLD 0.40)
Accuracy:  0.7045 (70.45%)
(Perhatian: Akurasi menurun karena Recall diprioritaskan)
F1-Score:  0.8041
AUC-ROC:   0.7304 (Kualitas Skor Probabilitas)

--------------------------------------------------------------------------------
CLASSIFICATION REPORT
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

No Churn (0)     0.9850    0.7072    0.8233      1947
   Churn (1)     0.0532    0.6038    0.0977        53

    accuracy                         0.7045      2000
   macro avg     0.5191    0.6555    0.4605      2000
weighted avg     0.9603    0.7045    0.8041      2000


‚úÖ RECALL KHUSUS KELAS CHURN (1): 0.6038 (TARGET >= 0.70)
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
CONFUSION MATRIX
---------------

## ARTEFAK

In [22]:
print("\n[6/6] Saving Model...")
model_filename = 'rf_churn_risk_model.pkl'

try:
    joblib.dump(rf_model, model_filename)
    print(f"‚úì Model saved to: {model_filename}")

    # Verify model can be loaded
    loaded_model = joblib.load(model_filename)
    print(f"‚úì Model successfully loaded and verified")
    model_saved = True
except Exception as e:
    print(f"‚ö†Ô∏è  Warning: Could not save model - {str(e)}")
    model_saved = False


[6/6] Saving Model...
‚úì Model saved to: rf_churn_risk_model.pkl
‚úì Model successfully loaded and verified



# Evaluation dan Final Output



In [23]:
print("\n" + "=" * 80)
print("PIPELINE COMPLETED SUCCESSFULLY!")
print("=" * 80)
print(f"‚úì Dataset processed: {df.shape[0]} customers")
print(f"‚úì Churn risk scores added to dataframe")
if model_saved:
    print(f"‚úì Model saved: {model_filename}")
print(f"‚úì Test Accuracy: {accuracy:.4f}")
print(f"‚úì Test AUC-ROC: {auc_roc:.4f}")
print(f"‚úì Test F1-Score: {f1:.4f}")
print(f"‚úì Test Recall: {recall:.4f}")
print(f"‚úì Test Precision: {precision:.4f}")
print("=" * 80)

# Display sample results
print("\nüìä SAMPLE CHURN RISK PREDICTIONS:")
print("-" * 80)
sample_df = df[['customer_id', 'churn', 'churn_risk']].head(10)
sample_df['risk_category'] = pd.cut(sample_df['churn_risk'],
                                     bins=[0, 0.3, 0.7, 1.0],
                                     labels=['Low', 'Medium', 'High'])
print(sample_df.to_string(index=False))
print("=" * 80)


PIPELINE COMPLETED SUCCESSFULLY!
‚úì Dataset processed: 10000 customers
‚úì Churn risk scores added to dataframe
‚úì Model saved: rf_churn_risk_model.pkl
‚úì Test Accuracy: 0.7045
‚úì Test AUC-ROC: 0.7304
‚úì Test F1-Score: 0.8041
‚úì Test Recall: 0.7045
‚úì Test Precision: 0.9603

üìä SAMPLE CHURN RISK PREDICTIONS:
--------------------------------------------------------------------------------
customer_id  churn  churn_risk risk_category
     C00001      0    0.068903           Low
     C00002      0    0.358329        Medium
     C00003      0    0.534791        Medium
     C00004      0    0.523274        Medium
     C00005      0    0.387922        Medium
     C00006      0    0.456309        Medium
     C00007      0    0.006380           Low
     C00008      0    0.003255           Low
     C00009      0    0.590301        Medium
     C00010      0    0.378490        Medium
