In [16]:
# import the datafile 
import pandas as pd
bank = pd.read_csv('./bank_churn.csv')
bank.shape

(10000, 14)

In [17]:
print(bank.head(20))

    RowNumber  CustomerId    Surname  CreditScore Geography  Gender  Age  \
0           1    15634602   Hargrave          619    France  Female   42   
1           2    15647311       Hill          608     Spain  Female   41   
2           3    15619304       Onio          502    France  Female   42   
3           4    15701354       Boni          699    France  Female   39   
4           5    15737888   Mitchell          850     Spain  Female   43   
5           6    15574012        Chu          645     Spain    Male   44   
6           7    15592531   Bartlett          822    France    Male   50   
7           8    15656148     Obinna          376   Germany  Female   29   
8           9    15792365         He          501    France    Male   44   
9          10    15592389         H?          684    France    Male   27   
10         11    15767821     Bearce          528    France    Male   31   
11         12    15737173    Andrews          497     Spain    Male   24   
12         1

In [18]:
bank.groupby(bank.Exited).count()

Unnamed: 0_level_0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
Exited,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,7963,7963,7963,7963,7963,7963,7963,7963,7963,7963,7963,7963,7963
1,2037,2037,2037,2037,2037,2037,2037,2037,2037,2037,2037,2037,2037


In [19]:
# drop features that seem irrelevant
bank = bank.drop('RowNumber',axis=1).drop('CustomerId',axis=1).drop('Surname',axis=1)

In [20]:
# drop lines with unknown (NaN) values
bank = bank.dropna()
bank.shape

(10000, 11)

In [21]:
# use one-hot encoding for gender and geography
import numpy as np
bank = pd.get_dummies(bank,columns=['Gender','Geography'])

In [22]:
bank.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Gender_Female,Gender_Male,Geography_France,Geography_Germany,Geography_Spain
0,619,42,2,0.0,1,1,1,101348.88,1,1,0,1,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0,0,0,1
2,502,42,8,159660.8,3,1,0,113931.57,1,1,0,1,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,1,0,1,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,1,0,0,0,1


In [32]:
# determine training set, test set and label
from sklearn.model_selection import train_test_split
X = bank.drop('Exited',axis=1)
y = bank['Exited']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30)

In [33]:
# build model
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [34]:
# predict label for test set
y_test2 = model.predict(X_test)

In [35]:
# determine accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_test2)

0.7976666666666666

In [27]:
# determine feature importances
print(X_train.columns)
print(model.feature_importances_)

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Gender_Female', 'Gender_Male',
       'Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype='object')
[0.14189846 0.24525899 0.08368351 0.13988044 0.12383877 0.01798132
 0.04281104 0.14774467 0.00856338 0.00897691 0.01121407 0.01886766
 0.00928079]


In [28]:
# we now combine those two collections into a dataframe
imp = pd.DataFrame(model.feature_importances_,columns=['Importance'],index=X_train.columns). \
sort_values(by='Importance',ascending=False)

In [14]:
imp

Unnamed: 0,Importance
Age,0.236765
CreditScore,0.142251
EstimatedSalary,0.141392
Balance,0.139053
NumOfProducts,0.127299
Tenure,0.087609
IsActiveMember,0.044512
Geography_Germany,0.021731
HasCrCard,0.018381
Geography_France,0.011291


In [15]:
# import numpy as np
bank['Agecat'] = np.floor(bank['Age']/10)*10
print(bank.head())

   CreditScore  Age  Tenure    Balance  NumOfProducts  HasCrCard  \
0          619   42       2       0.00              1          1   
1          608   41       1   83807.86              1          0   
2          502   42       8  159660.80              3          1   
3          699   39       1       0.00              2          0   
4          850   43       2  125510.82              1          1   

   IsActiveMember  EstimatedSalary  Exited  Gender_Female  Gender_Male  \
0               1        101348.88       1              1            0   
1               1        112542.58       0              1            0   
2               0        113931.57       1              1            0   
3               0         93826.63       0              1            0   
4               1         79084.10       0              1            0   

   Geography_France  Geography_Germany  Geography_Spain  Agecat  
0                 1                  0                0    40.0  
1             

In [22]:
bank.groupby('Agecat')['Exited'].mean()

Agecat
10.0    0.061224
20.0    0.076005
30.0    0.108836
40.0    0.307869
50.0    0.560414
60.0    0.352000
70.0    0.102941
80.0    0.076923
90.0    0.000000
Name: Exited, dtype: float64