# 1. Packages


In [18]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss
from sklearn.preprocessing import StandardScaler

# 2. Dataset

In [3]:
df = pd.read_csv(r'C:\Users\valen\OneDrive\Escritorio\Juano_VS\Beta-Bank\Data\Churn.csv')
df.columns =df.columns.str.lower()
df = df.drop (['rownumber', 'customerid', 'surname'], axis=1)
print (df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   creditscore      10000 non-null  int64  
 1   geography        10000 non-null  object 
 2   gender           10000 non-null  object 
 3   age              10000 non-null  int64  
 4   tenure           9091 non-null   float64
 5   balance          10000 non-null  float64
 6   numofproducts    10000 non-null  int64  
 7   hascrcard        10000 non-null  int64  
 8   isactivemember   10000 non-null  int64  
 9   estimatedsalary  10000 non-null  float64
 10  exited           10000 non-null  int64  
dtypes: float64(3), int64(6), object(2)
memory usage: 859.5+ KB
None


In [4]:
df[df['tenure']==0].shape

(382, 11)

In [5]:
median = df['tenure'].median()
print(median)

5.0


In [6]:
df['tenure'] = df['tenure'].fillna(median)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   creditscore      10000 non-null  int64  
 1   geography        10000 non-null  object 
 2   gender           10000 non-null  object 
 3   age              10000 non-null  int64  
 4   tenure           10000 non-null  float64
 5   balance          10000 non-null  float64
 6   numofproducts    10000 non-null  int64  
 7   hascrcard        10000 non-null  int64  
 8   isactivemember   10000 non-null  int64  
 9   estimatedsalary  10000 non-null  float64
 10  exited           10000 non-null  int64  
dtypes: float64(3), int64(6), object(2)
memory usage: 859.5+ KB


In [7]:
print (df.duplicated().sum())

0


In [8]:
df_ohe = pd.get_dummies (df, columns = ['geography', 'gender'], drop_first=True, dtype = int)
df_ohe

Unnamed: 0,creditscore,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited,geography_Germany,geography_Spain,gender_Male
0,619,42,2.0,0.00,1,1,1,101348.88,1,0,0,0
1,608,41,1.0,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8.0,159660.80,3,1,0,113931.57,1,0,0,0
3,699,39,1.0,0.00,2,0,0,93826.63,0,0,0,0
4,850,43,2.0,125510.82,1,1,1,79084.10,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,39,5.0,0.00,2,1,0,96270.64,0,0,0,1
9996,516,35,10.0,57369.61,1,1,1,101699.77,0,0,0,1
9997,709,36,7.0,0.00,1,0,1,42085.58,1,0,0,0
9998,772,42,3.0,75075.31,2,1,0,92888.52,1,1,0,1


In [9]:
X = df_ohe.drop('exited', axis=1)
y = df_ohe['exited']

x_train, x_test, y_train, y_test = train_test_split (X, y, test_size = 0.2, random_state = 42, stratify = y)

print (pd.Series(y_train).value_counts(1))
print (pd.Series(y_test).value_counts(1))

exited
0    0.79625
1    0.20375
Name: proportion, dtype: float64
exited
0    0.7965
1    0.2035
Name: proportion, dtype: float64


In [10]:
def model_select (estimator, param, features_train, target_train, features_test, target_test): 
    cv = StratifiedKFold (n_splits=5, shuffle=True, random_state=42)
    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=param,
        cv=cv, 
        scoring='roc_auc',
        refit=True,
        n_jobs=-1)
    grid_search.fit (features_train, target_train)
    print (f'Best Hyperparameters Cross-Validation: {grid_search.best_params_}')
    print (f'Best Score Cross-Validation (ROC AUC): {grid_search.best_score_:4f}')
    best_model = grid_search.best_estimator_
    predictions = best_model.predict (features_test)
    probs = best_model.predict_proba (features_test)[:,1]
    print (f'F1 Score Test: {f1_score (target_test, predictions):.4f}')
    print (f'ROC AUC Score Test: {roc_auc_score (target_test, probs):.4f}')
    return best_model

In [11]:
tree = DecisionTreeClassifier (random_state=42)
param_grid = {
    'max_depth': [3,5,7,10], # Controls complexity
    'min_samples_leaf': [20,50,100], # Specific Rules vs General Rules
    'criterion': ['gini', 'entropy'] # Math behind splits
} 

In [12]:
model_baseline = model_select (tree, param_grid, x_train, y_train, x_test, y_test)

Best Hyperparameters Cross-Validation: {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 20}
Best Score Cross-Validation (ROC AUC): 0.839260
F1 Score Test: 0.6020
ROC AUC Score Test: 0.8441


In [13]:
nm = NearMiss(version=1)
x_train_nm, y_train_nm = nm.fit_resample(x_train, y_train)
tree_nm = model_select (tree, param_grid, x_train_nm, y_train_nm, x_test, y_test)

Best Hyperparameters Cross-Validation: {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 50}
Best Score Cross-Validation (ROC AUC): 0.970800
F1 Score Test: 0.3995
ROC AUC Score Test: 0.6374


In [14]:
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

model_smote = model_select (tree, param_grid, x_train_smote, y_train_smote, x_test, y_test)

Best Hyperparameters Cross-Validation: {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 20}
Best Score Cross-Validation (ROC AUC): 0.887121
F1 Score Test: 0.5753
ROC AUC Score Test: 0.8172


In [15]:
forest = RandomForestClassifier(random_state=42)
param_grid_forest = {
    'n_estimators': [20, 50,100,200,300,400],
    'max_depth': [10, 20, 30, 40],
    'min_samples_split': [2,5,10]
}

best_rf_base = model_select(forest, param_grid_forest, x_train, y_train, x_test, y_test)

Best Hyperparameters Cross-Validation: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 400}
Best Score Cross-Validation (ROC AUC): 0.859483
F1 Score Test: 0.5877
ROC AUC Score Test: 0.8650


In [16]:
best_rf_nm = model_select(forest, param_grid_forest,x_train_nm, y_train_nm, x_test, y_test)

Best Hyperparameters Cross-Validation: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
Best Score Cross-Validation (ROC AUC): 0.973573
F1 Score Test: 0.3989
ROC AUC Score Test: 0.6912


In [17]:
best_rf_smote = model_select(forest, param_grid_forest, x_train_smote, y_train_smote, x_test, y_test)

Best Hyperparameters Cross-Validation: {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 400}
Best Score Cross-Validation (ROC AUC): 0.936938
F1 Score Test: 0.5780
ROC AUC Score Test: 0.8405


In [None]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
x_train_nm_scaled = scaler.transform(x_train_nm)
x_train_smote_scaled = scaler.transform (x_train_smote)

In [24]:
log_reg = LogisticRegression(random_state=42, max_iter=4000)
param_grid_lr = [
    # Option A: L2 penalty (Standard) - Works with default lbfgs
    {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10, 100],
        'class_weight': ['balanced', None],
        'solver': ['lbfgs'] 
    },
    # Option B: L1 penalty (Feature Selection) - Needs liblinear
    {
        'penalty': ['l1'],
        'C': [0.01, 0.1, 1, 10, 100],
        'class_weight': ['balanced', None],
        'solver': ['liblinear'] 
    }
]

In [27]:
best_lr_base = model_select (log_reg, param_grid_lr, x_train_scaled,y_train, x_test_scaled,y_test)

Best Hyperparameters Cross-Validation: {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}
Best Score Cross-Validation (ROC AUC): 0.767278
F1 Score Test: 0.5042
ROC AUC Score Test: 0.7805


In [29]:
best_lr_nm = model_select (log_reg, param_grid_lr, x_train_nm_scaled, y_train_nm, x_test_scaled, y_test)

Best Hyperparameters Cross-Validation: {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}
Best Score Cross-Validation (ROC AUC): 0.940032
F1 Score Test: 0.3748
ROC AUC Score Test: 0.6429


In [31]:
best_lr_smote = model_select(log_reg, param_grid_lr, x_train_smote_scaled, y_train_smote, x_test_scaled, y_test)

Best Hyperparameters Cross-Validation: {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score Cross-Validation (ROC AUC): 0.855555
F1 Score Test: 0.4632
ROC AUC Score Test: 0.7408
