0) Libraries

In [19]:
import pickle

import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

1) Download and read the data from this link:  
https://www.kaggle.com/datasets/mathchi/churn-for-bank-customers

*Main function: pd.read_csv*


In [20]:
# Load the bank-churn CSV (relative path, resolved against the working
# directory) and preview its first ten rows.
dataset = pd.read_csv(filepath_or_buffer='churn.csv')
dataset.head(n=10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [21]:
# Count missing values per column (isnull is an alias of isna).
dataset.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

2) Convert categorical variables (Geography,Gender) to Numerical 

*Main function: label encoding*

In [22]:
# LabelEncoder instance for converting categorical labels to integer codes.
label_encoder = preprocessing.LabelEncoder()

In [23]:
# Encode each categorical column with its OWN LabelEncoder and keep them in a
# dict. Re-fitting a single shared encoder overwrites its classes_, so after
# the loop the Geography mapping could no longer be inverted or re-applied to
# new data. `label_encoder` still ends up bound to the Gender encoder, exactly
# as it did with the shared instance.
label_encoders = {}
for column in ['Geography', 'Gender']:
    label_encoder = preprocessing.LabelEncoder()
    dataset[column] = label_encoder.fit_transform(dataset[column])
    label_encoders[column] = label_encoder
# .loc slicing is label-based and inclusive, so this previews rows 0..5.
dataset.loc[:5 , ['Geography', 'Gender']]

Unnamed: 0,Geography,Gender
0,0,0
1,2,0
2,0,0
3,0,0
4,2,0
5,2,1


3) Split X and y, train and test

X_Columns: 
CreditScore, Geography, Gender, Age, Tenure, Balance, NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary

Y_columns: 
Exited

and add stratify=y to the parameters of train_test_split

*Main function: (train_test_split)*

In [24]:
# Feature columns fed to the models; the target column is listed separately.
X_Columns = [
    'CreditScore',
    'Geography',
    'Gender',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
]

In [25]:
# Target column, kept as a one-element list so selecting with it yields a
# single-column DataFrame rather than a Series.
Y_columns= ['Exited']

In [26]:
# Independent copies of the feature matrix and the single-column target frame,
# followed by their shapes (one per line).
X = dataset[X_Columns].copy()
y = dataset[Y_columns].copy()
print(X.shape, y.shape, sep='\n')

(10000, 10)
(10000, 1)


In [27]:
# 80/20 split, stratified on the target so both parts keep the same class
# ratio. random_state is fixed so that a Restart & Run All reproduces the same
# split (and therefore the same downstream scores).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(8000, 10)
(2000, 10)
(8000, 1)
(2000, 1)


4) Train with these characteristics:

* Random Forest 
* Without PCA
* Without clusters
* With Precision scoring
* Try with grid search 200, 300 and 500 trees, with max depth 3, 5 and 10, as well as the two available criteria (gini and entropy).

*Main function: grid_search, RandomForestClassifier*

In [28]:
# Base estimator handed to the grid search. The seed makes the bootstrap
# sampling and feature sub-sampling repeatable between runs.
model_rf_class = RandomForestClassifier(random_state=42)

In [29]:
# Search space for the forest: 3 tree counts x 3 depths x 2 split criteria.
param_grid_class = dict(
    n_estimators=[200, 300, 500],
    max_depth=[3, 5, 10],
    criterion=['gini', 'entropy'],
)

In [30]:
# 5-fold grid search over the forest hyper-parameters.
# - scoring is 'precision', as required by the task description for this
#   section (the cell previously optimised plain accuracy).
# - n_jobs=-1 runs the fits on all available cores; it does not change which
#   candidate wins, only how long the search takes.
grid_search_rf_class = GridSearchCV(
    estimator=model_rf_class,
    param_grid=param_grid_class,
    scoring='precision',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [49]:
# Run the search on the training split. y_train is a single-column DataFrame,
# so it is flattened to the 1-D array scikit-learn expects.
grid_search_rf_class.fit(
    X_train,
    y_train.to_numpy().ravel(),
)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


KeyboardInterrupt: 

In [None]:
# Hyper-parameter combination with the best mean cross-validated score.
grid_search_rf_class.best_params_

5) From the random forest model, export a data frame with the feature importances of the best model (no clusters are used here, so there is a single best model).

*Main function: feature_importances_*

In [None]:
# Feature importances of the refitted best forest, labelled with the feature
# names and listed from most to least important.
(
    pd.Series(
        grid_search_rf_class.best_estimator_.feature_importances_,
        index=X.columns,
        name='feature_importance',
    )
    .sort_values(ascending=False)
    .to_frame()
)


In [None]:
# Stand-alone forest with hand-picked hyper-parameters, trained on the full
# training split. random_state is fixed so the fitted (and later pickled)
# model is the same on every run.
model_RFC = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    criterion='gini',
    random_state=42,
)
model_RFC.fit(X_train, y_train.values.ravel())

In [None]:
# Persist the fitted forest (model_RFC). model_rf_class is only the unfitted
# template handed to GridSearchCV (which fits clones of it), so pickling it
# would store an untrained model that cannot predict.
# The `with` block guarantees the file is flushed and closed.
file_name = 'model.pkl'
with open(file_name, 'wb') as model_file:
    pickle.dump(model_RFC, model_file)

In [None]:
# Reload the pickled random forest. pickle.load can execute arbitrary code,
# so only load files written by this notebook or another trusted source.
# The `with` block closes the file handle once the model is read.
with open('model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
#predictions_rf_class_pickled = loaded_model.predict(X_test)
#balanced_accuracy_score(y_test, predictions_rf_class_pickled)

6) Train with these characteristics:
* Algorithm Support Vector Machine
* With Standard Scaler
* With scoring Balanced Accuracy
* Try only the Radial Basis Function (RBF) kernel, with gamma set automatically, while the parameter C takes the values 0.001, 0.01, 1, 10, 100, 1000

*Main function: grid_search, SVC(class_weight= 'balanced'), StandardScaler, balanced_accuracy*


In [35]:
# Standardising scaler (default settings) used for the SVM features.
scaler = StandardScaler()

In [36]:
# Fit the scaler on the training split only; the test split is transformed
# later with these same statistics.
scaler.fit(X_train)

In [37]:
# Apply the already-fitted scaler to both splits in one pass.
X_train_scaled, X_test_scaled = [
    scaler.transform(split) for split in (X_train, X_test)
]

In [38]:
# Wrap the scaled values in DataFrames again so the feature names are kept.
# No index is passed, so the original row labels are not carried over.
X_train_scaled = pd.DataFrame(data=X_train_scaled, columns=X.columns)
X_test_scaled = pd.DataFrame(data=X_test_scaled, columns=X.columns)

In [39]:
# Base SVM for the grid search; class_weight='balanced' re-weights the classes
# inversely to their frequency in the training labels.
model_SVC = SVC(class_weight='balanced')

In [40]:
# Single RBF grid: gamma fixed to 'auto', only C is varied.
tuned_parameters = [
    dict(
        kernel=['rbf'],
        gamma=['auto'],
        C=[0.001, 0.01, 1, 10, 100, 1000],
    ),
]

In [41]:
# 5-fold grid search for the SVM, ranked by balanced accuracy.
model_SVC_grid = GridSearchCV(
    model_SVC,
    tuned_parameters,
    cv=5,
    scoring="balanced_accuracy",
    verbose=True,
)

In [42]:
# Run the search over C on the scaled training features. The target is
# flattened to 1-D because y_train is a single-column DataFrame and
# scikit-learn otherwise emits a DataConversionWarning.
model_SVC_grid = model_SVC_grid.fit(
    X_train_scaled,
    y_train.to_numpy().ravel(),
)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [43]:
# Parameter combination with the best mean cross-validated balanced accuracy.
model_SVC_grid.best_params_

{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}

In [44]:
# Per-candidate cross-validation details, best-ranked candidate first.
(
    pd.DataFrame(model_SVC_grid.cv_results_)
    .sort_values(by='rank_test_score')
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,0.878271,0.016656,0.44964,0.018951,1.0,auto,rbf,"{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}",0.770018,0.76613,0.762742,0.76552,0.765809,0.766044,0.002325,1
3,1.062484,0.026927,0.414374,0.016843,10.0,auto,rbf,"{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}",0.760532,0.749331,0.743304,0.751469,0.750152,0.750957,0.005546,2
1,1.286306,0.01569,0.724156,0.013936,0.01,auto,rbf,"{'C': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}",0.736086,0.734944,0.733095,0.720035,0.735017,0.731835,0.005978,3
4,2.314275,0.085452,0.352429,0.014028,100.0,auto,rbf,"{'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}",0.703499,0.719012,0.705281,0.702781,0.713058,0.708726,0.00631,4
5,8.760769,1.021839,0.373863,0.097716,1000.0,auto,rbf,"{'C': 1000, 'gamma': 'auto', 'kernel': 'rbf'}",0.684061,0.673289,0.709778,0.676532,0.690733,0.686879,0.012953,5
0,1.730292,0.380611,1.056104,0.291034,0.001,auto,rbf,"{'C': 0.001, 'gamma': 'auto', 'kernel': 'rbf'}",0.5,0.5,0.5,0.5,0.5,0.5,0.0,6


In [45]:
# Final SVM with explicitly written-out settings, trained on the whole scaled
# training split.
model_SVC_best = SVC(
    class_weight='balanced',
    kernel='rbf',
    gamma='auto',
    C=1,
)
model_SVC_best.fit(X_train_scaled, y_train.to_numpy().ravel())

In [46]:
# Persist the fitted SVM. The `with` block guarantees the file is flushed and
# closed; a bare open() passed to pickle.dump leaves the handle dangling.
file_name = 'modelSVC.pkl'
with open(file_name, 'wb') as model_file:
    pickle.dump(model_SVC_best, model_file)

In [47]:
# Optional round-trip check, left disabled: reload the pickled SVM and score
# it on the scaled test split.
#loaded_model2 = pickle.load(open('modelSVC.pkl', 'rb'))
#y_predictions_SVC_pickled = loaded_model2.predict(X_test_scaled)
#balanced_accuracy_score(y_test, y_predictions_SVC_pickled)

7) Predict with the two models on the test sample, then report the Confusion Matrix, Accuracy and Balanced Accuracy.

*Main function: predict, confusion_matrix, accuracy, balanced_accuracy*


**Random Forest**

In [None]:
# Predict the test split with the grid search object (it delegates to the
# refitted best estimator).
predictions_rf_class = grid_search_rf_class.predict(X_test)

In [None]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=predictions_rf_class)

In [None]:
# Share of test rows the random forest classified correctly.
accuracy_score(y_true=y_test, y_pred=predictions_rf_class)

In [None]:
# Mean of the per-class recalls for the random forest predictions.
balanced_accuracy_score(y_true=y_test, y_pred=predictions_rf_class)

**Support Vector Machine**

In [None]:
# Predict the scaled test split with the SVM grid search object (it delegates
# to the refitted best estimator).
y_predictions_SVC = model_SVC_grid.predict(X_test_scaled)

In [None]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_predictions_SVC)

In [None]:
# Share of test rows the SVM classified correctly.
accuracy_score(y_true=y_test, y_pred=y_predictions_SVC)

In [None]:
# Mean of the per-class recalls for the SVM predictions.
balanced_accuracy_score(y_true=y_test, y_pred=y_predictions_SVC)