In [1]:
# Importing the necessary packages
import numpy as np                                  # "Scientific computing"
import scipy.stats as stats                         # Statistical tests

import pandas as pd                                 # Data Frame
from pandas.api.types import CategoricalDtype

import matplotlib.pyplot as plt                     # Basic visualisation

from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

![](img/penguins.png)

Churn prediction is one of the classic machine learning applications. Companies want to predict the likelihood of a customer or employee leaving. Customers or employees that are "in danger" can then get a special treatment. The dataset we use in this exercise contains historical data from bank customers. We know for each customer wether he/she left ("Exited") or not. 

In [2]:
churn = pd.read_csv('https://raw.githubusercontent.com/HOGENT-ML/course/main/datasets/churn.csv')
churn.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


Get some general info about the dataset (type of each column, null values, ...)

In [3]:
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


Perform basic data cleaning and preparation. 

Tip: use the solution of the exercise "demographic student score" as a source of inspiration. 

Remove the columns you don't need

In [4]:
churn.drop(columns=['RowNumber','CustomerId','Surname'], inplace=True)

Is this a skewed dataset?

In [5]:
churn['Exited'].value_counts(normalize=True)

Exited
0    0.7963
1    0.2037
Name: proportion, dtype: float64

What is X and what is y?

In [6]:
X = churn.drop(columns=['Exited'])
y = churn['Exited']

Define the data preparation for the categorical and numerical columns. 
Setting remainder='passthrough' will mean that all columns not specified in the list of "transformers" will be passed through without transformation, instead of being dropped.

In [7]:
churn.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [8]:
from sklearn.preprocessing import OneHotEncoder
categorical_ix = X.select_dtypes(include=['object']).columns
numerical_ix = X.select_dtypes(exclude=['object']).columns
categorical_ix, numerical_ix

(Index(['Geography', 'Gender'], dtype='object'),
 Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
        'IsActiveMember', 'EstimatedSalary'],
       dtype='object'))

In [10]:
col_transformer = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_ix),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix)
    ],
    remainder='passthrough'
)

What is X_train, y_train, X_test, y_test?

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Find a model for LogisticRegression, Support Vector Machines with 3d degree polynomial kernel, Decision Trees and Random Forest each with their default parameters. Which one gives the best accuracy?

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

classifiers = [('lr',LogisticRegression(random_state=42)),
               ('rf',RandomForestClassifier(random_state=42)),
               ('svc',SVC(kernel='poly',degree=3,random_state=42)),
               ('dt',DecisionTreeClassifier(random_state=42))]

for name, clf in classifiers:
    print(name)
    pipe = Pipeline(steps=[('preprocessor', col_transformer),
                           ('classifier', clf)])
    scores = cross_val_score(pipe, X_train, y_train, cv=3, scoring='accuracy')
    print(f'Accuracy: {scores.mean():.3f} +/- {scores.std():.3f}')
    print('---')

lr
Accuracy: 0.811 +/- 0.005
---
rf
Accuracy: 0.861 +/- 0.002
---
svc
Accuracy: 0.850 +/- 0.001
---
dt
Accuracy: 0.783 +/- 0.004
---


Does a soft voting classifier using the above classifiers perform better?

In [13]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(estimators=classifiers, voting='soft')
voting_clf.named_estimators['svc'].probability = True
pipeline = Pipeline(steps=[('preprocessor', col_transformer),
                           ('classifier', voting_clf)])
pipeline.fit(X_train, y_train)
acc = np.mean(cross_val_score(pipeline, X_train, y_train, cv=3, scoring='accuracy'))
print(f'Accuracy: {acc:.3f} +/- {scores.std():.3f}')

Accuracy: 0.849 +/- 0.004


Continue with the best model from  the 4 individual classifiers above and apply grid search to find the best parameter combination. 

What's the best parameter combination and the corresponding accuracy?

In [None]:
from sklearn.model_selection import GridSearchCV
pipeline = Pipeline(steps=[('preprocessor', col_transformer),
                           ('rf', RandomForestClassifier(random_state=42))])
param_grid = [
    {
        'rf__n_estimators': [10, 50, 100],
        'rf__max_depth': [None, 5, 10],
        'rf__min_samples_split': [2, 5, 10],
        'rf__bootstrap': [True, False]    }
]

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)


In [15]:
grid_search.best_params_


{'rf__bootstrap': True,
 'rf__max_depth': 10,
 'rf__min_samples_split': 2,
 'rf__n_estimators': 100}

In [16]:
grid_search.best_score_

np.float64(0.8628743706361536)

What is the accuracy score on the test set and what are the most important features?

In [17]:
from sklearn.metrics import accuracy_score
y_pred = grid_search.predict(X_test)
accuracy_score(y_test, y_pred)

0.862

In [18]:
rf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=2, bootstrap=True, random_state=42)
pipeline = Pipeline(steps=[('preprocessor', col_transformer),('rf', rf)])
pipeline.fit(X_train, y_train)
# y_pred = pipeline.predict(X_test)
importances = rf.feature_importances_

In [19]:
importances

array([0.08092656, 0.31023209, 0.04507938, 0.10384781, 0.22567242,
       0.01156514, 0.06418065, 0.08005501, 0.01123205, 0.03614914,
       0.00794945, 0.01127524, 0.01183506])

In [20]:
col_transformer.get_feature_names_out()

array(['num__CreditScore', 'num__Age', 'num__Tenure', 'num__Balance',
       'num__NumOfProducts', 'num__HasCrCard', 'num__IsActiveMember',
       'num__EstimatedSalary', 'cat__Geography_France',
       'cat__Geography_Germany', 'cat__Geography_Spain',
       'cat__Gender_Female', 'cat__Gender_Male'], dtype=object)

In [30]:
for score,name in zip(rf.feature_importances_, col_transformer.get_feature_names_out()):
    print(f'{name}: {score:.3f}')

num__CreditScore: 0.081
num__Age: 0.310
num__Tenure: 0.045
num__Balance: 0.104
num__NumOfProducts: 0.226
num__HasCrCard: 0.012
num__IsActiveMember: 0.064
num__EstimatedSalary: 0.080
cat__Geography_France: 0.011
cat__Geography_Germany: 0.036
cat__Geography_Spain: 0.008
cat__Gender_Female: 0.011
cat__Gender_Male: 0.012


Do Ada Boosting or Stacking lead to a better accuracy? 

For Stacking you can use the same estimators as you did for voting, but apply for the best classifier the optimal parameter combination you found above. 

In [32]:
from sklearn.ensemble import AdaBoostClassifier

X_train_prep = col_transformer.fit_transform(X_train)
X_test_prep = col_transformer.transform(X_test)

ada = AdaBoostClassifier(rf, n_estimators=100, random_state=42)
ada.fit(X_train_prep, y_train)
y_pred = ada.predict(X_test_prep)
accuracy_score(y_test, y_pred)  # 0.8667
print(f'Accuracy: {accuracy_score(y_test, y_pred):.3f} +/- {scores.std():.3f}')



Accuracy: 0.851 +/- 0.004


In [33]:
from sklearn.ensemble import StackingClassifier

stacking_clf = StackingClassifier(estimators=classifiers, 
                                  final_estimator=RandomForestClassifier(random_state=42))

stacking_clf.fit(X_train_prep, y_train)
y_pred = stacking_clf.predict(X_test_prep)
accuracy_score(y_test, y_pred)  
print(f'Accuracy: {accuracy_score(y_test, y_pred):.3f} +/- {scores.std():.3f}')

Accuracy: 0.848 +/- 0.004


Conclusion: which model delivers the best results? 