In [81]:
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import os
import pickle
import json
import warnings
warnings.filterwarnings("ignore")

### Data Gathering

In [82]:
df = pd.read_csv("static/csv_files/bank.csv")
df

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   credit_score      10000 non-null  int64  
 1   country           10000 non-null  object 
 2   gender            10000 non-null  object 
 3   age               10000 non-null  int64  
 4   tenure            10000 non-null  int64  
 5   balance           10000 non-null  float64
 6   products_number   10000 non-null  int64  
 7   credit_card       10000 non-null  int64  
 8   active_member     10000 non-null  int64  
 9   estimated_salary  10000 non-null  float64
 10  churn             10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


### EDA

In [84]:
df["country"].value_counts().to_dict()

{'France': 5014, 'Germany': 2509, 'Spain': 2477}

In [85]:
df = pd.get_dummies(df,columns = ["country"],dtype = int)
df

Unnamed: 0,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn,country_France,country_Germany,country_Spain
0,619,Female,42,2,0.00,1,1,1,101348.88,1,1,0,0
1,608,Female,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,502,Female,42,8,159660.80,3,1,0,113931.57,1,1,0,0
3,699,Female,39,1,0.00,2,0,0,93826.63,0,1,0,0
4,850,Female,43,2,125510.82,1,1,1,79084.10,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,Male,39,5,0.00,2,1,0,96270.64,0,1,0,0
9996,516,Male,35,10,57369.61,1,1,1,101699.77,0,1,0,0
9997,709,Female,36,7,0.00,1,0,1,42085.58,1,1,0,0
9998,772,Male,42,3,75075.31,2,1,0,92888.52,1,0,1,0


In [86]:
df["gender"].value_counts().to_dict()

{'Male': 5457, 'Female': 4543}

In [87]:
df.replace({'Male': 0, 'Female': 1},inplace = True)

In [88]:
gender_data = {'Male': 0, 'Female': 1}

### Scaling

In [89]:
x = df.drop("churn",axis = 1)
y = df["churn"]
std_scaler = StandardScaler()

std_scaler.fit(x)
array = std_scaler.transform(x)
x_df = pd.DataFrame(array, columns= x.columns)
x_df

Unnamed: 0,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,country_France,country_Germany,country_Spain
0,-0.326221,1.095988,0.293517,-1.041760,-1.225848,-0.911583,0.646092,0.970243,0.021886,0.997204,-0.578736,-0.573809
1,-0.440036,1.095988,0.198164,-1.387538,0.117350,-0.911583,-1.547768,0.970243,0.216534,-1.002804,-0.578736,1.742740
2,-1.536794,1.095988,0.293517,1.032908,1.333053,2.527057,0.646092,-1.030670,0.240687,0.997204,-0.578736,-0.573809
3,0.501521,1.095988,0.007457,-1.387538,-1.225848,0.807737,-1.547768,-1.030670,-0.108918,0.997204,-0.578736,-0.573809
4,2.063884,1.095988,0.388871,-1.041760,0.785728,-0.911583,0.646092,0.970243,-0.365276,-1.002804,-0.578736,1.742740
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.246488,-0.912419,0.007457,-0.004426,-1.225848,0.807737,0.646092,-1.030670,-0.066419,0.997204,-0.578736,-0.573809
9996,-1.391939,-0.912419,-0.373958,1.724464,-0.306379,-0.911583,0.646092,0.970243,0.027988,0.997204,-0.578736,-0.573809
9997,0.604988,1.095988,-0.278604,0.687130,-1.225848,-0.911583,-1.547768,0.970243,-1.008643,0.997204,-0.578736,-0.573809
9998,1.256835,-0.912419,0.293517,-0.695982,-0.022608,0.807737,0.646092,-1.030670,-0.125231,-1.002804,1.727904,-0.573809


### Model Training

### Knn classifier

In [90]:
y = df['churn']
x = df.drop("churn",axis = 1)
x_train, x_test, y_train, y_test = train_test_split(x_df,y, test_size=0.2, random_state=1, stratify=y)

In [91]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(x_train, y_train)

### Model Evaluation

In [92]:
y_pred = knn_clf.predict(x_test)

acc = accuracy_score(y_pred,y_test)
print("accuracy :", acc)


accuracy : 0.829


In [93]:
y_pred_train = knn_clf.predict(x_train)

acc = accuracy_score(y_pred_train,y_train)
print("accuracy :", acc)


accuracy : 0.875125


In [94]:
knn_clf = KNeighborsClassifier()
param_grid = {"n_neighbors" : np.arange(3,10),
              "p" : [1,2] }

gscv_knn_clf = GridSearchCV(knn_clf, param_grid, cv = 5)
gscv_knn_clf.fit(x_train, y_train)

In [72]:
gscv_knn_clf.best_estimator_

#### Hyperparameter Tuning

In [96]:
knn_clf = KNeighborsClassifier(n_neighbors=8, p=1)
knn_clf.fit(x_train, y_train)

In [97]:
y_pred_train = knn_clf.predict(x_train)

acc = accuracy_score(y_pred_train,y_train)
print("accuracy :", acc)


accuracy : 0.854625


In [98]:
y_pred = knn_clf.predict(x_test)

acc = accuracy_score(y_pred,y_test)
print("accuracy :", acc)

accuracy : 0.8295


### Decision tree

In [99]:
y = df['churn']
x_train, x_test, y_train, y_test = train_test_split(x_df,y, test_size=0.2, random_state=1, stratify=y)

In [100]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(x_train, y_train)

In [101]:
y_pred = dt_clf.predict(x_test)

acc = accuracy_score(y_pred,y_test)
print("accuracy :", acc)

accuracy : 0.7865


In [102]:
y_pred_train = dt_clf.predict(x_train)

acc = accuracy_score(y_pred_train,y_train)
print("accuracy :", acc)

accuracy : 1.0


#### Hyperparameter Tuning

In [103]:
hyp_grid = {"criterion" : ['gini','entropy'],
            "max_depth" : np.arange(3,10),
            "min_samples_split" : np.arange(5,20),
            "min_samples_leaf" : np.arange(2,10)}

dt_clf = DecisionTreeClassifier(random_state=10)

rscv_dt_clf = RandomizedSearchCV(dt_clf, hyp_grid, scoring=None,cv = 4, n_jobs=-1) 
rscv_dt_clf.fit(x_train, y_train)
rscv_dt_clf.best_estimator_

In [104]:
dt_clf = DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=9,
                       min_samples_split=18, random_state=10)
dt_clf.fit(x_train, y_train)

In [105]:
y_pred = dt_clf.predict(x_test)

acc = accuracy_score(y_pred,y_test)
print("accuracy :", acc)


accuracy : 0.8585


In [106]:
y_pred_train= dt_clf.predict(x_train)

acc = accuracy_score(y_pred_train,y_train)
print("accuracy :", acc)


accuracy : 0.867125


### Logistic Regression

In [107]:
y = df['churn']
x_train, x_test, y_train, y_test = train_test_split(x_df,y, test_size=0.2, random_state=1, stratify=y)

In [117]:
lr = LogisticRegression(random_state = 10)
lr.fit(x_train, y_train)

In [118]:
y_pred = lr.predict(x_test)

acc = accuracy_score(y_pred,y_test)
print("accuracy :", acc)

accuracy : 0.8095


In [119]:
y_pred_train = lr.predict(x_train)

acc = accuracy_score(y_pred_train,y_train)
print("accuracy :", acc)

accuracy : 0.81125


### Random Forest

In [111]:
y = df['churn']
x_train, x_test, y_train, y_test = train_test_split(x_df,y, test_size=0.2, random_state=1, stratify=y)

In [112]:
rf_clf = RandomForestClassifier()
rf_clf.fit(x_train, y_train)

In [120]:
y_pred = rf_clf.predict(x_test)

acc = accuracy_score(y_pred,y_test)
print("accuracy :", acc)

accuracy : 0.863


In [121]:
y_pred_train = rf_clf.predict(x_train)

acc = accuracy_score(y_pred_train,y_train)
print("accuracy :", acc)

accuracy : 1.0


In [126]:
rf_clf = RandomForestClassifier(random_state=2, oob_score=False, n_jobs=-1 )
hyp_grid = {"n_estimators" : np.arange(20,150),
        "criterion"   : ['gini','entropy'],
        "max_depth"   : np.arange(3,10),
        "min_samples_split" : np.arange(5,15),
        "min_samples_leaf"  : np.arange(3,10),
        "max_features"      : ['sqrt']}


rscv_rf_clf = RandomizedSearchCV(rf_clf,hyp_grid, cv =5, n_jobs=-1 )
rscv_rf_clf.fit(x_train, y_train)
rscv_rf_clf.best_estimator_

In [127]:
rf_clf = RandomForestClassifier(criterion='entropy', max_depth=9, min_samples_leaf=3,
                       min_samples_split=8, n_estimators=109, n_jobs=-1,
                       random_state=2)
rf_clf.fit(x_train, y_train)

In [128]:
y_pred = rf_clf.predict(x_test)

acc = accuracy_score(y_pred,y_test)
print("accuracy :", acc)

accuracy : 0.867


In [129]:
y_pred_train = rf_clf.predict(x_train)

acc = accuracy_score(y_pred_train,y_train)
print("accuracy :", acc)

accuracy : 0.881875


In [115]:
column_names = x.columns.tolist()
column_names

['credit_score',
 'gender',
 'age',
 'tenure',
 'balance',
 'products_number',
 'credit_card',
 'active_member',
 'estimated_salary',
 'country_France',
 'country_Germany',
 'country_Spain']

In [132]:
credit_score    = 608
gender          = "Female"
age             = 41
tenure          = 1
balance         = 83807.86
products_number = 1
credit_card     = 0
active_member   = 1
estimated_salary= 112542.58
country         = "Spain"

gender = gender_data[gender]
country = 'country_'+country

country_index = column_names.index(country)
country_index

test_array = np.zeros([1,rf_clf.n_features_in_])
test_array[0,0] = credit_score
test_array[0,1] = gender
test_array[0,2] = age
test_array[0,3] = tenure
test_array[0,4] = balance
test_array[0,5] = products_number
test_array[0,6] = credit_card
test_array[0,7] = active_member
test_array[0,8] = estimated_salary
test_array[0,9] = gender
test_array[0,country_index] = 1

std_test_array = std_scaler.transform(test_array)


rf_clf.predict(std_test_array)[0]

0

In [133]:
with open('Random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_clf, f)

In [134]:
with open('std_scaler.pkl', 'wb') as f:
    pickle.dump(std_scaler, f)

In [136]:
project_data = {
                "Gender" : gender_data,
                "Column Names" : column_names}

with open('proj_data.json','w') as f:
    json.dump(project_data, f)