In [1]:
# Step 1:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

# Step 2:
df = pd.read_csv("/kaggle/input/customer-churn/Churn_Modelling.csv")

# remove extra cols
df = df.drop(["RowNumber", "CustomerId", "Surname"], axis=1)

# Step 3:
le = LabelEncoder()
df["Geography"] = le.fit_transform(df["Geography"])
df["Gender"] = le.fit_transform(df["Gender"])

# Step 4:
X = df.drop("Exited", axis=1)
y = df["Exited"]

# train-test split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

# scale
scaler = StandardScaler()
X_tr_s = scaler.fit_transform(X_tr)
X_te_s = scaler.transform(X_te)

# Step 5:
log_clf = LogisticRegression(max_iter=500)
log_clf.fit(X_tr_s, y_tr)
log_pred = log_clf.predict(X_te_s)
log_prob = log_clf.predict_proba(X_te_s)[:,1]

print("Logistic Regression")
print("F1:", f1_score(y_te, log_pred))
print("Acc:", accuracy_score(y_te, log_pred))
print("AUC:", roc_auc_score(y_te, log_prob))
print(classification_report(y_te, log_pred))

# Step 6:
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_tr, y_tr)
rf_pred = rf_clf.predict(X_te)
rf_prob = rf_clf.predict_proba(X_te)[:,1]

print("\n Random Forest")
print("F1:", f1_score(y_te, rf_pred))
print("Acc:", accuracy_score(y_te, rf_pred))
print("AUC:", roc_auc_score(y_te, rf_prob))
print(classification_report(y_te, rf_pred))

# Step 7:
gb_clf = GradientBoostingClassifier(random_state=42)
gb_clf.fit(X_tr, y_tr)
gb_pred = gb_clf.predict(X_te)
gb_prob = gb_clf.predict_proba(X_te)[:,1]

print("\nGradient Boosting")
print("F1:", f1_score(y_te, gb_pred))
print("Acc:", accuracy_score(y_te, gb_pred))
print("AUC:", roc_auc_score(y_te, gb_prob))
print(classification_report(y_te, gb_pred))

# Step 8:
results = pd.DataFrame({
    "Model": ["LogReg", "RandForest", "GradBoost"],
    "F1": [
        f1_score(y_te, log_pred),
        f1_score(y_te, rf_pred),
        f1_score(y_te, gb_pred)
    ],
    "Acc": [
        accuracy_score(y_te, log_pred),
        accuracy_score(y_te, rf_pred),
        accuracy_score(y_te, gb_pred)
    ],
    "AUC": [
        roc_auc_score(y_te, log_prob),
        roc_auc_score(y_te, rf_prob),
        roc_auc_score(y_te, gb_prob)
    ]
})

print("\nModel Comparison")
display(results)

Logistic Regression
F1: 0.27734375
Acc: 0.815
AUC: 0.763555120647422
              precision    recall  f1-score   support

           0       0.83      0.97      0.89      1607
           1       0.60      0.18      0.28       393

    accuracy                           0.81      2000
   macro avg       0.71      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000


 Random Forest
F1: 0.5745682888540031
Acc: 0.8645
AUC: 0.8569648373607199
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.47      0.57       393

    accuracy                           0.86      2000
   macro avg       0.82      0.71      0.75      2000
weighted avg       0.85      0.86      0.85      2000


Gradient Boosting
F1: 0.5777080062794349
Acc: 0.8655
AUC: 0.8707420303348422
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1   

Unnamed: 0,Model,F1,Acc,AUC
0,LogReg,0.277344,0.815,0.763555
1,RandForest,0.574568,0.8645,0.856965
2,GradBoost,0.577708,0.8655,0.870742
