In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import minmax_scale, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
bank_df = pd.read_csv('/Users/navyatrilok/Documents/DATASCIENCE/Datasets/Bank_churn.csv')
bank_df.shape

(165034, 14)

In [3]:
bank_df

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,,Male,33.0,3,0.00,2,1,0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1,1,49503.50,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1,0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1,1,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1,1,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,165029,15667085,Meng,667,Spain,Female,33.0,2,0.00,1,1,1,131834.75,0
165030,165030,15665521,Okechukwu,792,France,Male,35.0,3,0.00,1,0,0,131834.45,0
165031,165031,15664752,Hsia,565,France,Male,31.0,5,0.00,1,1,1,127429.56,0
165032,165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.00,1,0,1,71173.03,0


# Building Preprocessing pipeline

In [4]:
bank_df.drop(labels=['id','CustomerId','Surname'],axis=1,inplace=True)
bank_df.shape

(165034, 11)

In [5]:
bank_df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,,Male,33.0,3,0.00,2,1,0,181449.97,0
1,627,France,Male,33.0,1,0.00,2,1,1,49503.50,0
2,678,France,Male,40.0,10,0.00,2,1,0,184866.69,0
3,581,France,Male,34.0,2,148882.54,1,1,1,84560.88,0
4,716,Spain,Male,33.0,5,0.00,2,1,1,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...
165029,667,Spain,Female,33.0,2,0.00,1,1,1,131834.75,0
165030,792,France,Male,35.0,3,0.00,1,0,0,131834.45,0
165031,565,France,Male,31.0,5,0.00,1,1,1,127429.56,0
165032,554,Spain,Female,30.0,7,161533.00,1,0,1,71173.03,0


In [6]:
cat_col = [1,2]
numeric_cols = [0,3,4,5,6,7,8,9] #leave target column

1. Imputing the missing values with mean for continues data

2. Imputing the misisng values with mode for categorical data

3. Encoding categorical values with ordinal encoder

4. Scaling values with minmax scale

In [7]:
numeric_transformers = Pipeline(steps=[("imputer",SimpleImputer(strategy='mean')),
                                      ("scaling",MinMaxScaler())])
cat_transformers = Pipeline(steps = [("imputer",SimpleImputer(strategy="most_frequent")),
                                    ("encoding",OrdinalEncoder())])

preproc_pipe = ColumnTransformer(transformers=[("num",numeric_transformers,numeric_cols),
                                              ("cat",cat_transformers,cat_col)],
                                remainder='passthrough')

In [8]:
preproc_pipe

# Building Training Pipeline

In [9]:
bank_df = pd.DataFrame(preproc_pipe.fit_transform(bank_df),columns=bank_df.columns)
bank_df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0.636,0.202703,0.3,0.000000,0.333333,1.0,0.0,0.907279,0.0,1.0,0.0
1,0.554,0.202703,0.1,0.000000,0.333333,1.0,1.0,0.247483,0.0,1.0,0.0
2,0.656,0.297297,1.0,0.000000,0.333333,1.0,0.0,0.924364,0.0,1.0,0.0
3,0.462,0.216216,0.2,0.593398,0.000000,1.0,1.0,0.422787,0.0,1.0,0.0
4,0.732,0.202703,0.5,0.000000,0.333333,1.0,1.0,0.075293,2.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
165029,0.634,0.202703,0.2,0.000000,0.000000,1.0,1.0,0.659179,2.0,0.0,0.0
165030,0.884,0.229730,0.3,0.000000,0.000000,0.0,0.0,0.659177,0.0,1.0,0.0
165031,0.430,0.175676,0.5,0.000000,0.000000,1.0,1.0,0.637151,0.0,1.0,0.0
165032,0.408,0.162162,0.7,0.643819,0.000000,0.0,1.0,0.355841,2.0,0.0,0.0


In [10]:
X = bank_df.iloc[:,:-1]
X.shape

(165034, 10)

In [11]:
y = bank_df.iloc[:,-1]
y.shape

(165034,)

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=10)
X_train.shape,X_test.shape

((115523, 10), (49511, 10))

In [13]:
kbest = SelectKBest(chi2,k='all')

In [14]:
model = RandomForestClassifier(n_estimators=10,criterion='entropy',max_samples=75000)

In [15]:
train_pipe = Pipeline([("kbest",kbest),
                      ("rfmodel",model)
                      ])

train_pipe.fit(X_train,y_train)
train_pipe.score(X_test,y_test)

0.8513057704348529

In [16]:
train_pipe

In [17]:
preproc_pipe

In [19]:
preproc_train_pipe = Pipeline([("pre_proc",preproc_pipe),
                         ("train",train_pipe)])
preproc_train_pipe

In [20]:
complete_pipe.fit(X_train,y_train)

  mode = stats.mode(array)


In [21]:
complete_pipe.score(X_test,y_test)

0.8515077457534689

# Building model evaluation pipe

In [22]:
y_test

32039     0.0
103384    0.0
90532     0.0
142621    0.0
71986     0.0
         ... 
76622     0.0
80693     0.0
118068    0.0
54233     0.0
113362    0.0
Name: Exited, Length: 49511, dtype: float64

In [23]:
y_predict = model.predict(X_test)



In [24]:

metrics = {
    
"accuracy_score":accuracy_score(y_test,y_predict),
"precision_score":precision_score(y_test,y_predict,pos_label=1),
"recall_score":recall_score(y_test,y_predict,pos_label=1),
"cohen_kappa_score":cohen_kappa_score(y_test,y_predict),
"f1_score": f1_score(y_test,y_predict,pos_label=1)
    
}

In [25]:
evaluation_pipe = Pipeline([("metrics",metrics)])

In [26]:
complete_pipe = Pipeline([("pre_proc",preproc_pipe),
                         ("train",train_pipe),
                         ("evaluation",evaluation_pipe)])
complete_pipe