0) Libraries

In [64]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

1) Download and read the data from this link:  
https://www.kaggle.com/datasets/mathchi/churn-for-bank-customers

*Main function: pd.read_csv*


In [65]:
dataset = pd.read_csv('churn.csv')
dataset.head(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [66]:
dataset.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

2) Convert categorical variables (Geography,Gender) to Numerical 

*Main function: label encoding*

In [67]:
label_encoder = preprocessing.LabelEncoder()

In [68]:
for i in ['Geography', 'Gender']:
    dataset[i]= label_encoder.fit_transform(dataset[i])
dataset.loc[:5 , ['Geography', 'Gender']]

Unnamed: 0,Geography,Gender
0,0,0
1,2,0
2,0,0
3,0,0
4,2,0
5,2,1


3) Split X και y, train και test

X_Columns: 
CreditScore, Geography, Gender, Tenure, Balance, NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary 

Y_columns: 
Exited

and add stratify = y στα parameters από το train_test_split

*Main function: (train_test_split)*

In [69]:
X_Columns = ['CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary']

In [70]:
Y_columns= ['Exited']

In [71]:
X = dataset.copy()[X_Columns]
y = dataset.copy()[Y_columns]
print(X.shape)
print(y.shape)

(10000, 10)
(10000, 1)


In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    stratify=y)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(8000, 10)
(2000, 10)
(8000, 1)
(2000, 1)


4) Train with these characteristics:

* Random Forest 
* Without PCA
* Without clusters
* With Precision scoring, 
* Try with grid search 200, 300 και 500 tress, with max depth 3,5 and 10, as well as the two avaible criterion. 

*Main function: grid_search, RandomForestClassifier*

In [73]:
model_rf_class =RandomForestClassifier()

In [74]:
param_grid_class = {'n_estimators': [200,300,500],
              'max_depth' : [3,5,10],
              'criterion' :['gini', 'entropy']
             }

In [75]:
grid_search_rf_class = GridSearchCV(estimator=model_rf_class, 
                              param_grid=param_grid_class, 
                              scoring= 'accuracy',
                              cv=5,
                             verbose=1)

In [76]:
grid_search_rf_class.fit(X_train,y_train.values.ravel())

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [77]:
grid_search_rf_class.best_params_

{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 300}

5) Export from the model random forest a data frame with the feature importance with the best model on each cluster. 

*Main function: feature_importance_*

In [78]:
pd.DataFrame(data = grid_search_rf_class.best_estimator_.feature_importances_,
            index = X.columns,
            columns=['feature_importance']).sort_values(by='feature_importance',
                                                       ascending =False)

Unnamed: 0,feature_importance
Age,0.315458
NumOfProducts,0.230743
Balance,0.103257
EstimatedSalary,0.083847
CreditScore,0.083247
IsActiveMember,0.069857
Tenure,0.045136
Geography,0.037941
Gender,0.019484
HasCrCard,0.011029


6) Train with these characteristics:
* Algorithm Support Vector Machine
* With Standard Scaler
* With scoring Balanced Accuracy
* Try only with the kernel Radial basis function, to find gamma automatically,  while  the parameter C takes the values 0.001, 0.01, 1, 10 ,100 ,1000

*Main function: grid_search, SVC(class_weight= 'balanced'), StandarScaler, balanced_accuracy*


In [79]:
scaler = StandardScaler()

In [80]:
scaler.fit(X_train)

In [81]:
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [82]:
X_train_scaled = pd.DataFrame(X_train_scaled,columns = X.columns)
X_test_scaled = pd.DataFrame(X_test_scaled,columns = X.columns) 

In [83]:
model_SVC = SVC(class_weight='balanced')

In [84]:
tuned_parameters = [{'kernel': ['rbf'], 
                     'gamma': ["auto"],
                     'C': [0.001, 0.01, 1, 10, 100, 1000]}
                   ]

In [85]:
model_SVC_grid = GridSearchCV(estimator  =  model_SVC,
                              param_grid = tuned_parameters,
                              scoring="balanced_accuracy",
                              cv=5,
                              verbose = True
                             )

In [89]:
model_SVC_grid = model_SVC_grid.fit(X_train_scaled, y_train.values.ravel())  #DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [90]:
model_SVC_grid.best_params_

{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}

In [91]:
pd.DataFrame(model_SVC_grid.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,1.677643,0.190438,0.724687,0.089539,1.0,auto,rbf,"{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}",0.772977,0.782148,0.793525,0.751789,0.770194,0.774127,0.01383,1
3,2.037553,0.1689,0.599947,0.046443,10.0,auto,rbf,"{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}",0.76613,0.756111,0.7748,0.755037,0.766057,0.763627,0.007311,2
1,2.292081,0.069032,1.068798,0.061419,0.01,auto,rbf,"{'C': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}",0.748428,0.730565,0.736194,0.72207,0.720464,0.731544,0.0102,3
4,3.764014,0.096309,0.510827,0.059396,100.0,auto,rbf,"{'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}",0.735491,0.720014,0.71659,0.724579,0.741094,0.727553,0.009298,4
5,10.639448,0.816354,0.467025,0.082836,1000.0,auto,rbf,"{'C': 1000, 'gamma': 'auto', 'kernel': 'rbf'}",0.700468,0.711673,0.669581,0.710636,0.706783,0.699828,0.015626,5
0,2.752023,0.403727,1.279013,0.154224,0.001,auto,rbf,"{'C': 0.001, 'gamma': 'auto', 'kernel': 'rbf'}",0.5,0.5,0.5,0.5,0.5,0.5,0.0,6


7) Predict the two models from the test sample, then export Confusion Matrix, Accuracy and Balanced_Accuracy.

*Main function: predict, confusion_matrix, accuracy, balanced_accuracy*


**Random Forest**

In [92]:
predictions_rf_class = grid_search_rf_class.predict(X_test)

In [93]:
confusion_matrix(y_test,predictions_rf_class)

array([[1540,   53],
       [ 229,  178]], dtype=int64)

In [94]:
accuracy_score(y_test,predictions_rf_class)

0.859

In [95]:
balanced_accuracy_score(y_test,predictions_rf_class)

0.702037939326075

**Support Vector Machine**

In [96]:
y_predictions_SVC = model_SVC_grid.predict(X_test_scaled)

In [97]:
confusion_matrix(y_test, y_predictions_SVC)

array([[1253,  340],
       [ 110,  297]], dtype=int64)

In [98]:
accuracy_score(y_test, y_predictions_SVC)

0.775

In [99]:
balanced_accuracy_score(y_test, y_predictions_SVC)

0.7581479784869616