In [None]:
import sys

from google.colab import drive

# Where Google Drive is mounted inside the Colab VM, and the project folder on
# that Drive which holds the local helper modules imported by later cells
# (`error_analysis`, `split`).
DRIVE_MOUNT_POINT = '/content/drive'
PROJECT_DIR = '/content/drive/MyDrive/Customer_Churn_Prediction'

# Mount Drive first; the project folder is only reachable once the mount exists.
drive.mount(DRIVE_MOUNT_POINT)

# Make the project folder importable for the rest of the notebook.
sys.path.append(PROJECT_DIR)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from error_analysis import classification_summary
from sklearn.ensemble import RandomForestClassifier
from split import split_data
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

In [None]:
# Load the prepared churn dataset from the mounted Drive project folder.
DATA_PATH = "/content/drive/MyDrive/Customer_Churn_Prediction/Telco-Customer-Churn-Final.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Separate the label from the feature matrix.
TARGET_COLUMN = "Churn"

# Target vector, cast to int so downstream estimators/metrics see integer labels.
y = df[TARGET_COLUMN].astype(int)

# Feature matrix: every column except the target.
x = df.drop(columns=[TARGET_COLUMN], errors="ignore")

In [None]:
# `split_data` is a project helper imported from `split`; it must return six
# objects, unpacked here as train/validation/test features followed by the
# matching targets.
# NOTE(review): the validation summary further down shows nearly balanced
# classes (502 vs 533) — if the CSV was resampled before this split, validation
# metrics may be optimistic. Confirm against the data-preparation step.
partitions = split_data(x, y)
x_train, x_val, x_test, y_train, y_val, y_test = partitions

In [None]:
# Display the training feature frame as a quick visual check of the split
# (bare last expression, so the notebook renders it as the cell output).
x_train

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,InternetService,OnlineSecurity,TechSupport,PaperlessBilling,MonthlyCharges,Contract_Risk,PaymentMethod_Electronic check,PaymentMethod_Mailed check,PaymentMethod_Automatic
4541,1,0,0,1,1,0,0,0,50.550000,2,0,1,0
5709,0,0,0,12,0,0,0,0,19.300000,1,0,1,0
7393,1,0,0,1,2,0,0,1,74.469722,2,1,0,0
10026,0,0,0,4,2,0,0,1,93.753903,2,1,0,0
3578,0,0,0,40,1,1,0,0,65.100000,2,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,0,1,1,72,1,1,1,1,89.400000,0,0,1,0
5191,0,1,1,23,1,1,1,1,91.100000,0,0,0,1
5390,1,0,0,12,2,0,0,1,99.450000,2,1,0,0
860,0,0,0,26,0,0,0,0,19.800000,1,0,0,1


In [None]:
def to_array(X):
    """Return `X` converted through whichever array-export methods it offers.

    The conversions are attempted in order: first `to_numpy()`, then
    `toarray()`, each applied to the result of the previous step. Objects that
    expose neither method are returned unchanged.

    Args:
        X: Any object; typically a DataFrame-like, a sparse matrix, or an array.

    Returns:
        The result of the applicable conversions, or `X` itself if none apply.
    """
    for method_name in ("to_numpy", "toarray"):
        # Re-check on the current value: an earlier conversion may have
        # produced an object that supports (or no longer supports) the next one.
        if hasattr(X, method_name):
            X = getattr(X, method_name)()
    return X

# Array versions of the train/validation features used for fitting and scoring.
xtr, xva = (to_array(features) for features in (x_train, x_val))

# Array versions of the matching targets: use the object's own `to_numpy()`
# when it has one, otherwise fall back to a generic NumPy conversion.
if hasattr(y_train, "to_numpy"):
    ytr = y_train.to_numpy()
else:
    ytr = np.asarray(y_train)

if hasattr(y_val, "to_numpy"):
    yva = y_val.to_numpy()
else:
    yva = np.asarray(y_val)

In [None]:
# Hyperparameters for the forest. Tree growth is constrained on several axes at
# once (depth, split/leaf sample minimums, leaf-node count), and class weights
# are set to "balanced_subsample".
RF_PARAMS = {
    "n_estimators": 100,
    "random_state": 42,  # fixed seed so re-running the cell gives the same forest
    "max_depth": 5,
    "min_samples_split": 70,
    "min_samples_leaf": 50,
    "max_features": 'sqrt',
    "max_leaf_nodes": 20,
    "class_weight": "balanced_subsample",
    "n_jobs": -1,  # use all available cores
}

rf = RandomForestClassifier(**RF_PARAMS)

# `fit` returns the estimator; binding it to `_` keeps the cell from echoing it.
_ = rf.fit(xtr, ytr)

In [None]:
# Hard label predictions on the validation features.
val_pred = rf.predict(xva)

# Scores from the second probability column — presumably the positive (churn)
# class, given the 0/1 integer target; verify against `rf.classes_`.
val_scores = rf.predict_proba(xva)
val_prob = val_scores[:, 1]

# `classification_summary` is a project helper from `error_analysis`; print
# whatever report it builds from the true labels, predictions and scores.
val_summary = classification_summary(y_val, val_pred, val_prob)
print(val_summary)

{'confusion_matrix':           Pred_0  Pred_1
Actual_0     380     122
Actual_1      82     451, 'metrics': {'accuracy': 0.8028985507246377, 'precision': 0.787085514834206, 'recall': 0.8461538461538461, 'f1': 0.8155515370705244, 'roc_auc': np.float64(0.8764715995305831)}, 'error_breakdown': {'TP': np.int64(451), 'FP': np.int64(122), 'FN': np.int64(82), 'TN': np.int64(380)}, 'classification_report': '              precision    recall  f1-score   support\n\n           0       0.82      0.76      0.79       502\n           1       0.79      0.85      0.82       533\n\n    accuracy                           0.80      1035\n   macro avg       0.80      0.80      0.80      1035\nweighted avg       0.80      0.80      0.80      1035\n'}


In [None]:
from sklearn.metrics import precision_recall_curve, f1_score

# Tune the decision threshold on the validation split by maximising F1.
#
# Score the same NumPy array (`xva`) used for the earlier validation
# predictions: the forest was fitted on a plain array, so passing the DataFrame
# here would be inconsistent with how the model was trained and scored.
probs = rf.predict_proba(xva)[:, 1]

prec, rec, thr = precision_recall_curve(y_val, probs)

# `precision_recall_curve` returns len(thr) + 1 precision/recall values, where
# element i corresponds to predicting positive when score >= thr[i]. The final
# point (precision=1, recall=0) has no threshold, so it is excluded; after the
# slice, f1s[i] lines up exactly with thr[i].
prec_at_thr = prec[:-1]
rec_at_thr = rec[:-1]
# The small epsilon avoids 0/0 when precision and recall are both zero.
f1s = 2 * prec_at_thr * rec_at_thr / (prec_at_thr + rec_at_thr + 1e-12)

best_idx = int(np.argmax(f1s))
# Index the thresholds directly: using best_idx - 1 would select a neighbouring
# threshold and report an F1 that the chosen cut-off does not actually achieve.
best_thr = thr[best_idx]

pred_best = (probs >= best_thr).astype(int)

print("Best threshold:", best_thr)
print("Best F1:", f1s[best_idx])
print("F1 @ best threshold:", f1_score(y_val, pred_best))

Best threshold: 0.4685330305900662
Best F1: 0.8250883392221163
F1 @ best threshold: 0.824360105913504




In [None]:
# Report the forest's feature importances, labelled by column name. Names are
# only available when the training features are still a DataFrame, so fall
# back to a hint otherwise.
if not isinstance(x_train, pd.DataFrame):
    print("\nTip: If you keep X_train as a DataFrame with column names, you can print feature importances.")
else:
    importances = pd.Series(rf.feature_importances_, index=x_train.columns)
    # Rank from most to least important and show at most the first 20.
    ranked_importances = importances.sort_values(ascending=False)
    print("\nTop 20 feature importances:")
    print(ranked_importances.head(20))


Top 20 feature importances:
Contract_Risk                     0.245918
tenure                            0.154780
InternetService                   0.143870
PaymentMethod_Automatic           0.117603
OnlineSecurity                    0.082816
MonthlyCharges                    0.075257
TechSupport                       0.050103
Partner                           0.044606
Dependents                        0.042898
PaymentMethod_Mailed check        0.028142
PaymentMethod_Electronic check    0.012328
PaperlessBilling                  0.001146
SeniorCitizen                     0.000532
dtype: float64
