In [35]:
# import the datafile 
import pandas as pd
url = 'https://raw.githubusercontent.com/HOGENT-Databases/BI-BigData/master/data/bank_churn.csv'
bank = pd.read_csv(url,sep=',')
bank.shape

(10000, 14)

In [36]:
bank.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [20]:
print(bank.head(20))

    RowNumber  CustomerId    Surname  CreditScore Geography  Gender  Age  \
0           1    15634602   Hargrave          619    France  Female   42   
1           2    15647311       Hill          608     Spain  Female   41   
2           3    15619304       Onio          502    France  Female   42   
3           4    15701354       Boni          699    France  Female   39   
4           5    15737888   Mitchell          850     Spain  Female   43   
5           6    15574012        Chu          645     Spain    Male   44   
6           7    15592531   Bartlett          822    France    Male   50   
7           8    15656148     Obinna          376   Germany  Female   29   
8           9    15792365         He          501    France    Male   44   
9          10    15592389         H?          684    France    Male   27   
10         11    15767821     Bearce          528    France    Male   31   
11         12    15737173    Andrews          497     Spain    Male   24   
12         1

In [37]:
bank.groupby(bank.Exited).count()

Unnamed: 0_level_0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
Exited,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,7963,7963,7963,7963,7963,7963,7963,7963,7963,7963,7963,7963,7963
1,2037,2037,2037,2037,2037,2037,2037,2037,2037,2037,2037,2037,2037


In [38]:
# drop features that seem irrelevant
bank = bank.drop(['RowNumber','CustomerId','Surname'],axis=1)

In [39]:
# Count missing values as proportion of total number of rows
bank.isna().sum()/len(bank)

CreditScore        0.0
Geography          0.0
Gender             0.0
Age                0.0
Tenure             0.0
Balance            0.0
NumOfProducts      0.0
HasCrCard          0.0
IsActiveMember     0.0
EstimatedSalary    0.0
Exited             0.0
dtype: float64

In [24]:
# use one-hot encoding for gender and geography
bank = pd.get_dummies(bank,columns=['Gender','Geography'])

In [25]:
bank.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Gender_Female,Gender_Male,Geography_France,Geography_Germany,Geography_Spain
0,619,42,2,0.0,1,1,1,101348.88,1,1,0,1,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0,0,0,1
2,502,42,8,159660.8,3,1,0,113931.57,1,1,0,1,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,1,0,1,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,1,0,0,0,1


In [26]:
# determine feature set and label

X = bank.drop('Exited',axis=1)
y = bank['Exited']

In [27]:
# build model (see course Databases III)
from sklearn.model_selection import train_test_split
X_remainder, X_test, y_remainder, y_test = train_test_split(X,y,test_size=0.30)

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

best_accuracy = 0
best_trees = 0

for trees in range(50,550,50):
    X_train, X_validation, y_train, y_validation = train_test_split(X_remainder,y_remainder,test_size=0.30)
    model = RandomForestClassifier(n_estimators=trees)
    model.fit(X_train, y_train)    
    y_validation2 = model.predict(X_validation)
    accuracy = accuracy_score(y_validation, y_validation2)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_trees = trees
        best_validation = model.predict(X_test)
        
print('Optimal number of trees = % s' %(best_trees))
print('Accuracy on validation set = % 3.2f' % (best_accuracy)) 
accuracyOnTestSet = accuracy_score(y_test, best_validation)
print('Accuracy on test set = % 3.2f' % (accuracyOnTestSet))

Optimal number of trees = 450
Accuracy on validation set =  0.87
Accuracy on test set =  0.87


In [28]:
# determine feature importances
print(X_train.columns)
print(model.feature_importances_)

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Gender_Female', 'Gender_Male',
       'Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype='object')
[0.14258281 0.22987379 0.08557601 0.13819806 0.1298737  0.02011649
 0.04373948 0.14540538 0.01104715 0.01050101 0.01122268 0.02209685
 0.0097666 ]


In [29]:
# we now combine those two collections into a dataframe
importances = pd.DataFrame(model.feature_importances_,columns=['Importance'],index=X_train.columns). \
sort_values(by='Importance',ascending=False).reset_index()

In [30]:
importances

Unnamed: 0,index,Importance
0,Age,0.229874
1,EstimatedSalary,0.145405
2,CreditScore,0.142583
3,Balance,0.138198
4,NumOfProducts,0.129874
5,Tenure,0.085576
6,IsActiveMember,0.043739
7,Geography_Germany,0.022097
8,HasCrCard,0.020116
9,Geography_France,0.011223


In [31]:
# We can group the relative importances for one-hot encoded features together and make the sum of there values: 
importances['index'] = np.where(importances['index'].str.startswith ('Gender'),'Gender',importances['index'])
importances['index'] = np.where(importances['index'].str.startswith ('Geography'),'Geography',importances['index'])
imp = importances.groupby(['index'])['Importance'].sum().reset_index().sort_values(by='Importance',ascending=False).reset_index()
imp

Unnamed: 0,level_0,index,Importance
0,0,Age,0.229874
1,3,EstimatedSalary,0.145405
2,2,CreditScore,0.142583
3,1,Balance,0.138198
4,8,NumOfProducts,0.129874
5,9,Tenure,0.085576
6,7,IsActiveMember,0.043739
7,5,Geography,0.043086
8,4,Gender,0.021548
9,6,HasCrCard,0.020116


In [32]:
bank.groupby(bank.Exited).mean()

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Gender_Female,Gender_Male,Geography_France,Geography_Germany,Geography_Spain
Exited,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,651.853196,37.408389,5.033279,72745.296779,1.544267,0.707146,0.554565,99738.391772,0.427477,0.572523,0.527942,0.212859,0.259199
1,645.351497,44.837997,4.932744,91108.539337,1.475209,0.699067,0.360825,101465.677531,0.559156,0.440844,0.397644,0.399607,0.202749


In [33]:
# We will now use this model to predict wether or not some current customers might leave or not.
# This will typically be part of a end-user application and run e.g. weekly on the customer database

def PredictSwitch(model,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,
                  HasCrCard,IsActiveMember, EstimatedSalary):
    import pandas as pd
    customer=pd.DataFrame(columns=['CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary'])

    new_customer = {'CreditScore':CreditScore, 'Geography':Geography,
       'Gender':Gender, 'Age':Age, 'Tenure':Tenure, 'Balance':Balance, 
       'NumOfProducts':NumOfProducts, 'HasCrCard':HasCrCard,
       'IsActiveMember':IsActiveMember, 'EstimatedSalary':EstimatedSalary}
    
    customer = customer.append(new_customer,ignore_index=True)

    if Gender == 'Male':
        customer['Gender_Male'] = 1
        customer['Gender_Female'] = 0
    else:
        customer['Gender_Male'] = 0
        customer['Gender_Female'] = 1        
    customer.drop(columns=['Gender'],axis=1,inplace=True)

    customer['Geography_France'] = 0
    customer['Geography_Germany'] = 0
    customer['Geography_Spain'] = 0

    customer['Geography' +'_'+Geography] = 1

    customer.drop(['Geography'],axis=1,inplace=True)

    
    # In practice the model will be saved to a file after building and fine-tuning 
    # and loaded from that file in this function
    Exited = model.predict(customer)
    
    # most sklearn algorithms also offer a predict_proba method that returns an array 
    # of probabilities per class:
    Exited_proba = model.predict_proba(customer)
    return Exited[0],Exited_proba[0].max()


Exited = PredictSwitch(model,CreditScore=502, Geography='Spain',Gender='Male',Age=20,Tenure=1,Balance=10000,NumOfProducts=3,HasCrCard=1,IsActiveMember=0, EstimatedSalary=100000)

print(Exited)

Exited = PredictSwitch(model,CreditScore=502, Geography='Spain',Gender='Female',Age=41,Tenure=1,Balance=50000,NumOfProducts=1,HasCrCard=1,IsActiveMember=0, EstimatedSalary=200000)

print(Exited)



(1, 0.728)
(0, 0.56)
