## 1. Notebook Preparation

### Import dependencies

In [52]:
import os
import zipfile
import numpy as np
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi

### Initialize constants and paths

In [21]:
# Kaggle identifiers: the playground competition and the original dataset
COMPETITION_NAME = 'playground-series-s4e1'
ORIGINAL_DATASET = 'shubhammeshram579/bank-customer-churn-prediction'

# Local data folder: a "data" directory next to the current working directory.
# abspath() resolves the relative path against the current working directory.
MAIN_PATH = os.path.abspath(os.path.join('..', 'data'))

### Download the competition files

In [24]:
# Create the data folder; exist_ok=True makes a separate existence check unnecessary.
os.makedirs(MAIN_PATH, exist_ok=True)

api = KaggleApi()
api.authenticate()

zip_file_path = os.path.join(MAIN_PATH, f'{COMPETITION_NAME}.zip')

# CSV files the loading cell reads from MAIN_PATH.
required_files = ['Churn_Modelling.csv', 'train.csv', 'test.csv', 'sample_submission.csv']
missing_files = [
    name for name in required_files
    if not os.path.exists(os.path.join(MAIN_PATH, name))
]

# Only download when something is missing, so re-running the notebook
# does not fetch and extract the same data again.
if missing_files:
    # Original dataset: downloaded and unzipped by the API call itself
    api.dataset_download_files(ORIGINAL_DATASET, path=MAIN_PATH, unzip=True)

    # Competition files: downloaded as an archive and extracted here
    api.competition_download_files(COMPETITION_NAME, path=MAIN_PATH)

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(MAIN_PATH)

    # The archive is no longer needed once its contents are extracted
    os.remove(zip_file_path)



## 2. Load Data

In [25]:
# Build every path with os.path.join, as the download cell does,
# instead of concatenating with a hard-coded "/" separator.

# Original (non-synthetic) churn dataset
original_data = pd.read_csv(os.path.join(MAIN_PATH, 'Churn_Modelling.csv'))

# Competition train / test splits
df_train = pd.read_csv(os.path.join(MAIN_PATH, 'train.csv'))
df_test = pd.read_csv(os.path.join(MAIN_PATH, 'test.csv'))

# Template showing the expected submission format
submission = pd.read_csv(os.path.join(MAIN_PATH, 'sample_submission.csv'))

## 3. Data Exploration & Upsizing Train Set

### Exploration: Original Data

In [29]:
# (rows, columns) of the original churn dataset
original_data.shape

(10002, 14)

In [30]:
# Preview the first five rows of the original dataset
original_data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15619304,Onio,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,4,15701354,Boni,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


In [41]:
# Count rows that are exact copies of an earlier row
original_data.duplicated().sum()

2

In [42]:
# Count missing values per column
original_data.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          1
Gender             0
Age                1
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          1
IsActiveMember     1
EstimatedSalary    0
Exited             0
dtype: int64

### Exploration: Train Set

In [27]:
# (rows, columns) of the competition train set
df_train.shape

(165034, 14)

In [28]:
# Preview the first five rows of the train set
df_train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [36]:
# Count rows that are exact copies of an earlier row
df_train.duplicated().sum()

0

In [37]:
# Count missing values per column
df_train.isna().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

### Exploration: Test Set

In [31]:
# (rows, columns) of the competition test set
df_test.shape

(110023, 13)

In [32]:
# Preview the first five rows of the test set
df_test.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [40]:
# Count rows that are exact copies of an earlier row
df_test.duplicated().sum()

0

In [39]:
# Count missing values per column
df_test.isna().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64

### Exploration: Sample Submission


In [35]:
# Preview the expected submission format
submission.head()

Unnamed: 0,id,Exited
0,165034,0.5
1,165035,0.5
2,165036,0.5
3,165037,0.5
4,165038,0.5


### Upsizing Train Set
#### The train set provided by the competition is a synthetically generated dataset based on the Churn Modelling dataset, which is also available on Kaggle. To improve model performance, I am adding the original data to the competition data, as a larger training set helps the model extract information more reliably.

In [43]:
# Drop the row-identifier column from each frame ("id" in the competition data,
# "RowNumber" in the original data)
df_train = df_train.drop(columns=["id"])
original_data = original_data.drop(columns=["RowNumber"])

In [44]:
# Add original data to the train set.
# ignore_index=True renumbers the rows 0..n-1; without it the appended rows keep
# their own 0-based labels, so the combined frame would carry duplicate index
# labels that break label-based selection and alignment later on.
df_train = pd.concat([df_train, original_data], axis=0, ignore_index=True)

In [46]:
# (rows, columns) after appending the original data
df_train.shape


(175036, 13)

In [47]:
# Count exact duplicate rows in the combined train set
df_train.duplicated().sum()


2

In [48]:
# Count missing values per column in the combined train set
df_train.isna().sum()


CustomerId         0
Surname            0
CreditScore        0
Geography          1
Gender             0
Age                1
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          1
IsActiveMember     1
EstimatedSalary    0
Exited             0
dtype: int64

#### There are some duplicated rows in the now upsized train set, as well as some rows with missing values:

- The duplicated rows will be dropped to keep each row unique.
- For the missing values, there are two options:
  - Impute the missing data
  - Drop the rows with missing data

####  Since there is a negligible number of rows that have missing values, I chose to drop these as well.

In [49]:
# Keep the first occurrence of each duplicated row, then discard every row
# that still contains at least one missing value.
df_train = (
    df_train
    .drop_duplicates(keep='first')
    .dropna(axis=0, how='any')
)


In [50]:
# Confirm that no duplicated rows remain
df_train.duplicated().sum()


0

In [51]:
# Confirm that no missing values remain
df_train.isna().sum()

CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

### Exploration: Categorical Features

In [56]:
# Keep only the non-numeric columns of the train set
categorical_data = df_train.select_dtypes(exclude="number")

In [57]:
# Preview the non-numeric columns
categorical_data.head()

Unnamed: 0,Surname,Geography,Gender
0,Okwudilichukwu,France,Male
1,Okwudiliolisa,France,Male
2,Hsueh,France,Male
3,Kao,France,Male
4,Chiemenam,Spain,Male


#### The surname feature is really not important to our model (and it's technically not categorical), so I'll drop it.

In [58]:
# Exclude Surname from the categorical view, then preview what is left
categorical_data = categorical_data.drop(columns=["Surname"])
categorical_data.head()

Unnamed: 0,Geography,Gender
0,France,Male
1,France,Male
2,France,Male
3,France,Male
4,Spain,Male


### Exploration: Numerical Features

In [59]:
# Keep only the numeric columns of the train set
numeric_data = df_train.select_dtypes(include="number")


In [60]:
# Preview the numeric columns
numeric_data.head()


Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15674932,668,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,15749177,627,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,15694510,678,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,15741417,581,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,15766172,716,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [61]:
# Pairwise Pearson correlation between all numeric columns
corr = numeric_data.corr(method='pearson')

# Display the matrix as a colour-graded table
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
CustomerId,1.0,0.007299,0.003073,-0.002053,-0.008836,0.005178,-0.005875,-0.002693,0.003875,-0.009723
CreditScore,0.007299,1.0,-0.008883,0.00095,0.005592,0.011605,-0.002607,0.015358,-0.000791,-0.027241
Age,0.003073,-0.008883,1.0,-0.010786,0.063271,-0.097297,-0.012604,0.00896,-0.006685,0.336606
Tenure,-0.002053,0.00095,-0.010786,1.0,-0.009642,0.007707,0.006412,-0.006868,0.001455,-0.019244
Balance,-0.008836,0.005592,0.063271,-0.009642,1.0,-0.357267,-0.020303,-0.014139,0.004402,0.128389
NumOfProducts,0.005178,0.011605,-0.097297,0.007707,-0.357267,1.0,0.005612,0.037827,-0.002422,-0.204509
HasCrCard,-0.005875,-0.002607,-0.012604,0.006412,-0.020303,0.005612,1.0,-0.020682,0.004936,-0.021122
IsActiveMember,-0.002693,0.015358,0.00896,-0.006868,-0.014139,0.037827,-0.020682,1.0,-0.008713,-0.207205
EstimatedSalary,0.003875,-0.000791,-0.006685,0.001455,0.004402,-0.002422,0.004936,-0.008713,1.0,0.018598
Exited,-0.009723,-0.027241,0.336606,-0.019244,0.128389,-0.204509,-0.021122,-0.207205,0.018598,1.0


#### The following 3 correlations are especially noticeable and seem to have a strong impact on the likelihood of a customer choosing to exit or remain with the service:

#### **Age - Exited** *↗ Positive Correlation*: As a customer's age rises, so does the likelihood of them leaving the service.
#### **NumOfProducts - Exited** *↘ Negative Correlation*: If a customer owns more products, it's less likely that they will leave.
#### **IsActiveMember - Exited** *↘ Negative Correlation*: If a customer is an active member, it's less likely that they will leave.

## 4. Data Preprocessing