In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [2]:
# Seed shared by every estimator that accepts one, so that a fresh
# "Restart & Run All" reproduces the same fitted models and scores.
RANDOM_STATE = 42

# Baseline estimators keyed by display name. Apart from the seed, every
# estimator uses its library defaults.
models = {
    'LogisticRegression': LogisticRegression(random_state=RANDOM_STATE),
    'GaussianNB': GaussianNB(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'XGBClassifier': XGBClassifier(random_state=RANDOM_STATE),
    'RandomForestClassifier': RandomForestClassifier(random_state=RANDOM_STATE),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'SVC': SVC()
}

# Hyper-parameter grids keyed by the same names as `models`.
# Presumably consumed by a GridSearchCV step that is not visible in this part
# of the notebook -- TODO confirm.
params = {
    # NOTE(review): not every penalty/solver pair below is supported by
    # scikit-learn (e.g. 'lbfgs' does not accept 'l1', and 'elasticnet' needs
    # solver='saga' plus an `l1_ratio`); unsupported pairs will fail to fit
    # during a grid search.
    'LogisticRegression': {'penalty':['l1', 'l2', 'elasticnet', None],'solver':['lbfgs','newton-cg','newton-cholesky','sag','saga']},
    'GaussianNB': {'var_smoothing': np.logspace(0,-9, num=100)},
    'KNeighborsClassifier': {'n_neighbors':[1,2,3,4,5,6,7],'weights':['uniform', 'distance'],'algorithm':['auto', 'ball_tree', 'kd_tree']},
    'SVC': {'C':[0.1, 1, 10, 100],'gamma': [0.01, 0.1, 1, 10],'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
    'DecisionTreeClassifier': {'criterion': ['gini', 'entropy'],'max_depth': [None, 5, 10, 15],'min_samples_split': [2, 5, 10]},
    'XGBClassifier': {'learning_rate': [0.01, 0.1, 0.2],'max_depth': [3, 5, 7],'n_estimators': [50, 100, 200]},
    'RandomForestClassifier': {'n_estimators':[10,25,50,100],'criterion':['gini', 'entropy', 'log_loss'],'max_depth':[1,3,5,7]},
    'GradientBoostingClassifier': {'loss':['log_loss','exponential'],'learning_rate':[0.01,0.1,0.2],
                                   'criterion':['friedman_mse', 'squared_error'],'max_depth':[1,3,5,7]}
}

In [3]:
# Read the dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/X.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Churn
0,0,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [4]:
# Feature frame: every column except the target ('Churn') and 'Unnamed: 0'
# (which appears to be a saved row index -- TODO confirm).
X = df.drop(columns=['Churn', 'Unnamed: 0'])
X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97
1,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5
2,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88
4,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83


In [5]:
# Target vector: the 'Churn' column of the raw frame.
y = df.loc[:, 'Churn']
y

0         0
1         0
2         0
3         0
4         0
         ..
165029    0
165030    0
165031    0
165032    0
165033    1
Name: Churn, Length: 165034, dtype: int64

In [6]:
# Build a ColumnTransformer with two branches:
#   - object-dtype columns are one-hot encoded,
#   - all remaining columns are standardised.
# The one-hot branch is listed first, so its output columns come first in the
# transformed matrix.
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('OneHotEncoder', oh_transformer, cat_features),
        ('StandardScaler', numeric_transformer, num_features),
    ],
)

In [7]:
# Fit the preprocessor and encode/scale the features.
# The input is rebuilt from `df` instead of being read from `X`, so this cell
# can be re-run: `X` is overwritten with the transformed matrix here, and
# feeding that matrix back in would fail because the transformer selects its
# columns by name.
# NOTE(review): the preprocessor is fitted on every row before the train/test
# split, so its fitted statistics also include the rows that later form the
# test set. Consider splitting first and fitting on the training rows only.
X = preprocessor.fit_transform(df.drop(columns=['Churn', 'Unnamed: 0']))

In [8]:
# Sanity check: (rows, columns) of the transformed feature matrix.
X.shape

(165034, 13)

In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across runs. `train_test_split` is imported in the
# notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((132027, 13), (33007, 13))

In [10]:
def evaluate_model(true,predicted):
    """Score a set of predictions against the reference labels.

    Parameters
    ----------
    true
        Reference labels, passed as the first argument to each metric.
    predicted
        Predicted labels, passed as the second argument to each metric.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix, classification_report)``, in that
        order, as returned by the scikit-learn functions of the same names.
    """
    return (
        accuracy_score(true, predicted),
        confusion_matrix(true, predicted),
        classification_report(true, predicted),
    )

In [11]:
# Fit every baseline model, print its train/test metrics, and collect the
# test accuracy per model for the comparison table.
model_list = []  # model names, in the order they were fitted
acc_list = []    # test-set accuracy, aligned with model_list

# Iterate over the dict directly instead of rebuilding key/value lists and
# indexing them by position on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predictions on both splits
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Metrics for the train and test splits
    model_train_acc, model_train_conf, model_train_class_repo = evaluate_model(y_train, y_train_pred)
    model_test_acc, model_test_conf, model_test_class_repo = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_acc))
    print("- Confusion Matrix: {}\n".format(model_train_conf))
    print("- classification Score: \n", model_train_class_repo)

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(model_test_acc))
    print("- Confusion Matrix: {}\n".format(model_test_conf))
    print("- classification Score: \n", model_test_class_repo)
    acc_list.append(model_test_acc)

    print('='*35)
    print('\n')

LogisticRegression
Model performance for Training set
- Accuracy: 0.8338
- Confusion Matrix: 
- classification Score: 
               precision    recall  f1-score   support

           0       0.85      0.96      0.90    104061
           1       0.70      0.38      0.49     27966

    accuracy                           0.83    132027
   macro avg       0.77      0.67      0.70    132027
weighted avg       0.82      0.83      0.81    132027

----------------------------------
Model performance for Test set
- Accuracy: 0.8354
- Confusion Matrix: 
- classification Score: 
               precision    recall  f1-score   support

           0       0.85      0.95      0.90     26052
           1       0.70      0.39      0.50      6955

    accuracy                           0.84     33007
   macro avg       0.78      0.67      0.70     33007
weighted avg       0.82      0.84      0.82     33007



GaussianNB
Model performance for Training set
- Accuracy: 0.8009
- Confusion Matrix: 
- clas

In [12]:
# Comparison table: one row per model, best test accuracy first.
(
    pd.DataFrame(
        list(zip(model_list, acc_list)),
        columns=['Model Name', 'Accuracy_Score'],
    )
    .sort_values(by=["Accuracy_Score"], ascending=False)
)

Unnamed: 0,Model Name,Accuracy_Score
6,GradientBoostingClassifier,0.866362
4,XGBClassifier,0.86621
7,SVC,0.863423
5,RandomForestClassifier,0.858515
2,KNeighborsClassifier,0.846669
0,LogisticRegression,0.835429
1,GaussianNB,0.801042
3,DecisionTreeClassifier,0.79674
