In [1]:
# Show which Python interpreter backs this kernel, so the reader can confirm
# that the intended environment is active before anything else runs.
import sys

active_interpreter = sys.executable
print(active_interpreter)

/opt/anaconda3/envs/churn_rf/bin/python


In [2]:
### import libraries
import os

import numpy as np; np.set_printoptions(precision=2)
import pandas as pd; pd.options.display.float_format = "{:,.2f}".format
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

In [3]:
### load data
# The CSV location defaults to the original author's local path, but it can be
# overridden by setting the CHURN_DATA_PATH environment variable before the
# kernel starts. This lets the notebook run on another machine without
# editing this cell.
churn_csv_path = os.environ.get(
    "CHURN_DATA_PATH",
    "/Users/weijiaying/Desktop/IMT574-Project/Churn_Modelling.csv",
)
churn = pd.read_csv(churn_csv_path)

# Last expression of the cell: preview the first five rows.
churn.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:

# Separate the label from the predictors.
# The target is the "Exited" column; the feature matrix is everything else
# except the columns this analysis treats as non-predictive.
churn_y = churn["Exited"]

non_feature_cols = ["RowNumber", "CustomerId", "Surname", "Exited"]
churn_X = churn.drop(columns=non_feature_cols)

In [5]:
# Replace the two categorical predictors with indicator (dummy) columns.
# drop_first=True leaves out the first level of each variable, so k levels
# become k-1 indicator columns.
categorical_cols = ["Geography", "Gender"]
churn_X = pd.get_dummies(churn_X, columns=categorical_cols, drop_first=True)

In [6]:
# Hold out 20% of the rows as a validation set; the fixed random_state makes
# the shuffle reproducible. No stratification is requested, so the class
# proportions in each part are whatever the shuffle happens to produce.
split_parts = train_test_split(churn_X, churn_y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [7]:
# Fit a random forest on the training split, then score the validation split.

# Seeds NumPy's global RNG. The forest's own randomness is controlled
# separately by the random_state entry in forest_params below.
np.random.seed(574)

forest_params = dict(
    n_estimators=1000,        # number of trees in the ensemble
    max_depth=10,             # maximum depth allowed for each tree
    min_samples_leaf=20,      # every leaf must hold at least this many samples
    max_features="sqrt",      # features considered when searching for a split
    class_weight="balanced",  # weight classes inversely to their frequency in y_train
    random_state=42,          # fixed seed so the fitted forest is reproducible
)

# fit() returns the estimator itself, so construction and fitting can be chained.
rf = RandomForestClassifier(**forest_params).fit(X_train, y_train)

# Hard class labels for the validation rows.
y_pred = rf.predict(X_val)

# Column 1 of predict_proba corresponds to the second entry of rf.classes_,
# i.e. class 1 (Exited) for the 0/1 labels used here.
y_prob = rf.predict_proba(X_val)[:, 1]

In [8]:
# Per-class precision / recall / F1 on the validation set, from the hard labels.
report_text = classification_report(y_val, y_pred)
print(report_text)

# ROC-AUC is computed from the predicted probabilities, not the hard labels.
validation_auc = roc_auc_score(y_val, y_prob)
print("ROC-AUC:", validation_auc)

              precision    recall  f1-score   support

           0       0.94      0.84      0.89      1607
           1       0.54      0.77      0.64       393

    accuracy                           0.83      2000
   macro avg       0.74      0.80      0.76      2000
weighted avg       0.86      0.83      0.84      2000

ROC-AUC: 0.8679663241765115


### References

Random Forest Algorithm in Machine Learning: https://www.geeksforgeeks.org/machine-learning/random-forest-algorithm-in-machine-learning/




AUC ROC Curve in Machine Learning: https://www.geeksforgeeks.org/machine-learning/auc-roc-curve/