In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn import preprocessing

In [3]:
# Load the raw churn dataset from the CSV file in the working directory.
df = pd.read_csv("data_D.csv")
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0.1,Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,churn
0,0,106473,15639576,Sarratt,652.0,France,Female,65.0,3,0.0,2,1,1,136592.24,0
1,1,62345,15769582,Hanson,464.0,France,Male,35.0,4,0.0,1,0,0,99505.75,1
2,2,126615,15675888,Austin,620.0,Germany,Female,39.0,6,129401.87,2,1,1,102681.32,1
3,3,35909,15786617,Tuan,598.0,France,Female,30.0,7,0.0,2,1,0,141210.18,0
4,4,45175,15757310,Li Fonti,682.0,Germany,Female,46.0,4,107720.57,1,0,0,93832.33,1


# Data Cleaning

In [4]:
# Summarise column dtypes and non-null counts to spot missing data.
df.info()
# "CreditScore" is the only column with missing values (41251 of 41259 rows are non-null).

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41259 entries, 0 to 41258
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       41259 non-null  int64  
 1   id               41259 non-null  int64  
 2   CustomerId       41259 non-null  int64  
 3   Surname          41259 non-null  object 
 4   CreditScore      41251 non-null  float64
 5   Geography        41259 non-null  object 
 6   Gender           41259 non-null  object 
 7   Age              41259 non-null  float64
 8   Tenure           41259 non-null  int64  
 9   Balance          41259 non-null  float64
 10  NumOfProducts    41259 non-null  int64  
 11  HasCrCard        41259 non-null  int64  
 12  IsActiveMember   41259 non-null  int64  
 13  EstimatedSalary  41259 non-null  float64
 14  churn            41259 non-null  int64  
dtypes: float64(4), int64(8), object(3)
memory usage: 4.7+ MB


### fill in empty values

In [5]:
# Impute the few missing credit scores with the column mean, then re-run
# info() to confirm that every column is now fully populated.
credit_score = df["CreditScore"]
df["CreditScore"] = credit_score.fillna(credit_score.mean())
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41259 entries, 0 to 41258
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       41259 non-null  int64  
 1   id               41259 non-null  int64  
 2   CustomerId       41259 non-null  int64  
 3   Surname          41259 non-null  object 
 4   CreditScore      41259 non-null  float64
 5   Geography        41259 non-null  object 
 6   Gender           41259 non-null  object 
 7   Age              41259 non-null  float64
 8   Tenure           41259 non-null  int64  
 9   Balance          41259 non-null  float64
 10  NumOfProducts    41259 non-null  int64  
 11  HasCrCard        41259 non-null  int64  
 12  IsActiveMember   41259 non-null  int64  
 13  EstimatedSalary  41259 non-null  float64
 14  churn            41259 non-null  int64  
dtypes: float64(4), int64(8), object(3)
memory usage: 4.7+ MB


### checking duplicated values

In [6]:
# Count rows that duplicate an earlier row across all columns.
df.duplicated().sum()
# The count is 0, so there are no duplicated rows.

0

### removing some unnecessary columns

In [7]:
# Drop the leftover index column and the identifier columns that the analysis
# does not use as features. The result is re-assigned rather than mutated with
# inplace=True, and errors="ignore" keeps the cell re-runnable: executing it a
# second time no longer raises KeyError because the columns are already gone.
df = df.drop(columns=["Unnamed: 0", "id", "CustomerId", "Surname"], errors="ignore")
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,churn
0,652.0,France,Female,65.0,3,0.0,2,1,1,136592.24,0
1,464.0,France,Male,35.0,4,0.0,1,0,0,99505.75,1
2,620.0,Germany,Female,39.0,6,129401.87,2,1,1,102681.32,1
3,598.0,France,Female,30.0,7,0.0,2,1,0,141210.18,0
4,682.0,Germany,Female,46.0,4,107720.57,1,0,0,93832.33,1


### cardinality checking

In [82]:
# Columns holding a small set of distinct values: the integer-coded ones are
# listed by hand, the text (object dtype) ones are discovered from the frame.
discrete_columns = [
    "Tenure", "NumOfProducts", "HasCrCard", "IsActiveMember", "churn",
    *df.select_dtypes("object").columns,
]

# One frequency table per column, in the same order as discrete_columns.
unique = []
for name in discrete_columns:
    unique.append(df[name].value_counts())
unique

[2     4588
 4     4442
 7     4431
 5     4291
 8     4232
 3     4231
 1     4177
 9     4173
 6     3970
 10    1474
 0     1250
 Name: Tenure, dtype: int64,
 2    21132
 1    19316
 3      694
 4      117
 Name: NumOfProducts, dtype: int64,
 1    31126
 0    10133
 Name: HasCrCard, dtype: int64,
 0    20767
 1    20492
 Name: IsActiveMember, dtype: int64,
 0    32540
 1     8719
 Name: churn, dtype: int64,
 France     23520
 Spain       9081
 Germany     8658
 Name: Geography, dtype: int64,
 Male      23166
 Female    18093
 Name: Gender, dtype: int64]

- There are no typos or inconsistent labels in the categorical values.
- The value counts show that the variables "HasCrCard" and "churn" are imbalanced.

### checking for erroneous values by range
e.g. invalid values such as a negative number in column "Age"

In [83]:
# Print the observed minimum and maximum of every numeric column so that
# impossible values (e.g. a negative Age) can be spotted by eye.
numeric_part = df.select_dtypes(include="number")
for col in numeric_part.columns:
    lowest, highest = numeric_part[col].min(), numeric_part[col].max()
    print(f"the range of col-{col} uis between {lowest} to {highest}")

# All printed ranges are plausible, so no erroneous values were found.

the range of col-CreditScore uis between 350.0 to 850.0
the range of col-Age uis between 18.0 to 92.0
the range of col-Tenure uis between 0 to 10
the range of col-Balance uis between 0.0 to 250898.09
the range of col-NumOfProducts uis between 1 to 4
the range of col-HasCrCard uis between 0 to 1
the range of col-IsActiveMember uis between 0 to 1
the range of col-EstimatedSalary uis between 11.58 to 199992.48
the range of col-churn uis between 0 to 1


### Checking Outliers
The two imbalanced variables ("HasCrCard" and "churn") are excluded, because the IQR rule would flag all of their minority values as outliers.

In [85]:
# Flag outliers with the 1.5 * IQR rule on each numeric column, skipping the
# two imbalanced binary columns whose minority class would be flagged wholesale.
excluded = ("HasCrCard", "churn")
is_outlier = pd.Series(False, index=df.index)

for col in df.select_dtypes("number").columns:
    if col in excluded:
        continue
    lower_quartile = df[col].quantile(0.25)
    upper_quartile = df[col].quantile(0.75)
    spread = upper_quartile - lower_quartile
    below = df[col] < (lower_quartile - 1.5*spread)
    above = df[col] > (upper_quartile + 1.5*spread)
    col_flags = below | above
    # Accumulate the union of flagged rows across all inspected columns.
    is_outlier |= col_flags
    print(f"{col}: {int(col_flags.sum())}")

# df_clean keeps only the rows that were never flagged; .copy() makes it an
# independent frame so later column assignments do not touch df.
df_clean = df.loc[~is_outlier].copy()

print(f"total number of the outliers is {len(df)-len(df_clean)}")

CreditScore: 76
Age: 1560
Tenure: 0
Balance: 0
NumOfProducts: 117
IsActiveMember: 0
EstimatedSalary: 0
total number of the outliers is 1735


In [86]:
# Number of rows remaining after the outlier rows were removed.
len(df_clean)

39524

# Feature Encoding

In [87]:
# Integer-encode every text column of df in place and record, per column,
# which original category maps to which integer code.
label_encoder = preprocessing.LabelEncoder()
label = {}
for col in df.select_dtypes("object").columns:
    df[col] = label_encoder.fit_transform(df[col])
    categories = label_encoder.classes_
    codes = label_encoder.transform(categories)
    label[col] = dict(zip(categories, codes))
    print(f"{col}= {label[col]}")

Geography= {'France': 0, 'Germany': 1, 'Spain': 2}
Gender= {'Female': 0, 'Male': 1}


In [88]:
# Confirm that Geography and Gender now hold integer codes instead of text.
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,churn
0,652.0,0,0,65.0,3,0.0,2,1,1,136592.24,0
1,464.0,0,1,35.0,4,0.0,1,0,0,99505.75,1
2,620.0,1,0,39.0,6,129401.87,2,1,1,102681.32,1
3,598.0,0,0,30.0,7,0.0,2,1,0,141210.18,0
4,682.0,1,0,46.0,4,107720.57,1,0,0,93832.33,1


In [89]:
# Apply the same integer encoding to df_clean: each text column is replaced by
# its codes and the category -> code mapping is printed.
# Note: `label` is rebuilt here, so afterwards it describes df_clean's mapping.
label_encoder = preprocessing.LabelEncoder()
label = {}
text_columns = df_clean.select_dtypes("object").columns
for col in text_columns:
    encoded = label_encoder.fit_transform(df_clean[col])
    df_clean[col] = encoded
    label[col] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
    print(f"{col}= {label[col]}")

Geography= {'France': 0, 'Germany': 1, 'Spain': 2}
Gender= {'Female': 0, 'Male': 1}


In [71]:
# Same check for df_clean; its index has gaps where outlier rows were removed.
df_clean.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,churn
1,464.0,0,1,35.0,4,0.0,1,0,0,99505.75,1
2,620.0,1,0,39.0,6,129401.87,2,1,1,102681.32,1
3,598.0,0,0,30.0,7,0.0,2,1,0,141210.18,0
4,682.0,1,0,46.0,4,107720.57,1,0,0,93832.33,1
5,746.0,0,0,24.0,1,0.0,2,0,0,130142.42,0


# Making models

In [91]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

### original df

In [90]:
# Separate the predictors from the churn target using df (outliers kept) and
# hold out 20% of the rows for testing; random_state pins the shuffle.
target = "churn"
input_df, output_df = df.drop(columns=target), df[target]

x_train, x_test, y_train, y_test = train_test_split(
    input_df,
    output_df,
    test_size=0.2,
    random_state=0,
)

In [93]:
# Baseline random forest (default hyperparameters) on the current split.
# random_state is fixed so that the bootstrap sampling and feature
# sub-sampling, and therefore the report below, are reproducible across
# kernel restarts.
model1 = RandomForestClassifier(random_state=0)
model1 = model1.fit(x_train, y_train)
y_predict = model1.predict(x_test)

# Per-class precision / recall / f1 on the held-out test rows.
print("Classfication Report\n", classification_report(y_test, y_predict))

Classfication Report
               precision    recall  f1-score   support

           0       0.88      0.94      0.91      6499
           1       0.70      0.52      0.60      1753

    accuracy                           0.85      8252
   macro avg       0.79      0.73      0.75      8252
weighted avg       0.84      0.85      0.84      8252



In [95]:
# Baseline XGBoost classifier with library-default hyperparameters, trained
# on the current split; fit() returns the fitted estimator itself.
model2 = XGBClassifier().fit(x_train, y_train)
y_predict = model2.predict(x_test)

# Per-class precision / recall / f1 on the held-out test rows.
report = classification_report(y_test, y_predict)
print("Classfication Report\n", report)

Classfication Report
               precision    recall  f1-score   support

           0       0.89      0.93      0.91      6499
           1       0.69      0.55      0.61      1753

    accuracy                           0.85      8252
   macro avg       0.79      0.74      0.76      8252
weighted avg       0.84      0.85      0.85      8252



Both models yield the same accuracy, but the XGBClassifier model yields a higher f1-score.

### df_clean

In [97]:
# Build the feature matrix and target from df_clean (outlier rows removed),
# then make the same 80/20 split with the same seed as for the original df.
feature_columns = df_clean.columns.drop("churn")
input_df = df_clean[feature_columns]
output_df = df_clean["churn"]
x_train, x_test, y_train, y_test = train_test_split(
    input_df, output_df, test_size=0.2, random_state=0
)

In [98]:
# Baseline random forest (default hyperparameters) on the current split.
# random_state is fixed so that repeated runs give the same forest and the
# same classification report.
model3 = RandomForestClassifier(random_state=0)
model3 = model3.fit(x_train, y_train)
y_predict = model3.predict(x_test)

# Per-class precision / recall / f1 on the held-out test rows.
print("Classfication Report\n", classification_report(y_test, y_predict))

Classfication Report
               precision    recall  f1-score   support

           0       0.88      0.95      0.91      6295
           1       0.72      0.49      0.58      1610

    accuracy                           0.86      7905
   macro avg       0.80      0.72      0.75      7905
weighted avg       0.85      0.86      0.85      7905



In [99]:
# Baseline XGBoost classifier with default hyperparameters on the current split.
model4 = XGBClassifier()
model4.fit(x_train, y_train)  # fit() trains in place and returns the same object
y_predict = model4.predict(x_test)

# Per-class precision / recall / f1 on the held-out test rows.
report_text = classification_report(y_test, y_predict)
print("Classfication Report\n", report_text)

Classfication Report
               precision    recall  f1-score   support

           0       0.88      0.95      0.91      6295
           1       0.71      0.50      0.59      1610

    accuracy                           0.86      7905
   macro avg       0.80      0.73      0.75      7905
weighted avg       0.85      0.86      0.85      7905



- Both models yield the same accuracy, but the XGB model yields a higher f1-score.
- Both models trained on df_clean yield higher accuracy than both models trained on the original df, but their f1-scores are lower.
- Because this is an imbalanced classification problem, the f1-score is the appropriate metric. So far, model2 (XGBClassifier on the original df) is the best model.

# Tuning Parameters

In [129]:
# Hyperparameter grid for the random forest: 2 criteria x 4 depths x 4
# max_features settings = 32 candidates, each scored with 5-fold CV.
parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8],
    'max_features': ['sqrt', 0.3, 0.5, 0.8],
}
# Seed the forest so that differences between candidates come from the
# hyperparameters rather than from run-to-run randomness, and so that the
# selected best_params_ are reproducible.
model = RandomForestClassifier(random_state=0)
# scoring='f1' ranks candidates by the f1-score of the positive (churn) class.
model_clf = GridSearchCV(model,
                         param_grid=parameters,
                         scoring='f1',
                         cv=5)
# NOTE(review): x_train / y_train are whichever split was created most
# recently; when the notebook is run top to bottom that is the df_clean
# split. Confirm this is the intended tuning set.
model_clf.fit(x_train, y_train)
print("Tuned Hyperparameters :", model_clf.best_params_)
print("f1-score :",model_clf.best_score_)

Tuned Hyperparameters : {'criterion': 'entropy', 'max_depth': 8, 'max_features': 0.8}
f1-score : 0.615775342368974


In [121]:
# Hyperparameter grid for XGBoost: tree depth, number of boosting rounds and
# the minimum loss reduction (gamma) required to make a further split.
parameters = dict(
    max_depth=[1, 3, 5, 7],
    n_estimators=[50, 100, 200, 500],
    gamma=[0, 0.1, 0.2],
)

# Exhaustive 5-fold cross-validated search, ranked by f1-score.
model = XGBClassifier()
model_clf = GridSearchCV(estimator=model, param_grid=parameters, scoring='f1', cv=5)
model_clf.fit(x_train, y_train)

best_params, best_f1 = model_clf.best_params_, model_clf.best_score_
print("Tuned Hyperparameters :", best_params)
print("f1-score :", best_f1)

Tuned Hyperparameters : {'gamma': 0, 'max_depth': 3, 'n_estimators': 100}
f1-score : 0.6115869427300563


### original df

In [128]:
# Recreate the 80/20 split of the original df (outliers kept) so the tuned
# models below are evaluated on the same rows as the baseline models.
output_df = df["churn"]
input_df = df.drop(columns=["churn"])

splits = train_test_split(input_df, output_df, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = splits

In [117]:
# Random forest with the hyperparameters selected by the grid search
# (criterion='entropy', max_depth=8, max_features=0.8). random_state is fixed
# so the fitted forest and the report below are reproducible.
model5 = RandomForestClassifier(
    criterion='entropy',
    max_depth=8,
    max_features=0.8,
    random_state=0,
)
model5 = model5.fit(x_train, y_train)
y_predict = model5.predict(x_test)

# Per-class precision / recall / f1 on the held-out test rows.
print("Classfication Report\n", classification_report(y_test, y_predict))

Classfication Report
               precision    recall  f1-score   support

           0       0.88      0.95      0.91      6499
           1       0.75      0.52      0.61      1753

    accuracy                           0.86      8252
   macro avg       0.81      0.74      0.76      8252
weighted avg       0.85      0.86      0.85      8252



In [123]:
# XGBoost with the hyperparameters selected by the grid search
# (gamma=0, max_depth=3, n_estimators=100).
model6 = XGBClassifier(gamma=0, max_depth=3, n_estimators=100)
# Bind the fitted estimator back to model6, as every other model cell does;
# it was previously assigned to the misleading name `rf_model` even though
# this is an XGBoost model, not a random forest.
model6 = model6.fit(x_train, y_train)
y_predict = model6.predict(x_test)

# Per-class precision / recall / f1 on the held-out test rows.
print("Classfication Report\n", classification_report(y_test, y_predict))

Classfication Report
               precision    recall  f1-score   support

           0       0.88      0.94      0.91      6499
           1       0.72      0.54      0.62      1753

    accuracy                           0.86      8252
   macro avg       0.80      0.74      0.77      8252
weighted avg       0.85      0.86      0.85      8252



### df_clean

In [124]:
# Recreate the 80/20 split of df_clean (outlier rows removed) for the tuned models.
target_name = "churn"
output_df = df_clean[target_name]
input_df = df_clean.drop(columns=[target_name])
(x_train, x_test,
 y_train, y_test) = train_test_split(input_df, output_df, test_size=0.2, random_state=0)

In [120]:
# Random forest with the grid-search hyperparameters on the current split.
# random_state is fixed so that repeated runs produce the same forest and
# therefore the same classification report.
model7 = RandomForestClassifier(
    criterion='entropy',
    max_depth=8,
    max_features=0.8,
    random_state=0,
)
model7 = model7.fit(x_train, y_train)
y_predict = model7.predict(x_test)

# Per-class precision / recall / f1 on the held-out test rows.
print("Classfication Report\n", classification_report(y_test, y_predict))

Classfication Report
               precision    recall  f1-score   support

           0       0.88      0.96      0.92      6295
           1       0.75      0.50      0.60      1610

    accuracy                           0.86      7905
   macro avg       0.81      0.73      0.76      7905
weighted avg       0.85      0.86      0.85      7905



In [125]:
# XGBoost with the grid-search hyperparameters on the current split.
tuned_xgb_params = {"gamma": 0, "max_depth": 3, "n_estimators": 100}
model8 = XGBClassifier(**tuned_xgb_params).fit(x_train, y_train)
y_predict = model8.predict(x_test)

# Per-class precision / recall / f1 on the held-out test rows.
report = classification_report(y_test, y_predict)
print("Classfication Report\n", report)

Classfication Report
               precision    recall  f1-score   support

           0       0.88      0.96      0.92      6295
           1       0.74      0.50      0.60      1610

    accuracy                           0.86      7905
   macro avg       0.81      0.73      0.76      7905
weighted avg       0.85      0.86      0.85      7905



## CONCLUSION:

Based on the classification reports of the eight models, model6 (XGBClassifier with tuned hyperparameters, trained on the original dataframe) is the best-performing model: it has the highest f1-score for the minority churn class (0.62) and the highest macro-average f1-score (0.77).