## Competitions: Kaggle Playground Series, Season 4 Episode 1 - Binary Classification with a Bank Churn Dataset
Data source: https://www.kaggle.com/competitions/playground-series-s4e1/data

### Imports and Read Data

In [2]:
import pandas as pd
import numpy as np
import sklearn

In [11]:
# Load the competition files. The train and test CSVs are read with the same
# settings: the literal string 'NaN' is parsed as a missing value.
TRAIN, TEST = (
    pd.read_csv(csv_path, na_values='NaN')
    for csv_path in ("./data/train.csv", "./data/test.csv")
)
SAMPLE_SUBMISSION = pd.read_csv('./data/sample_submission.csv')

In [12]:
# Preview the first rows of the test set (same columns as TRAIN, minus `Exited`).
TEST.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [13]:

# Preview the first rows of the training set; `Exited` is the extra column
# that the test set does not have.
TRAIN.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


### Basic Data Exploration

NOTES:
- No missing data
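- 3 object (string) columns: Surname (2,797 distinct surnames), Gender (Male, Female), Geography (France, Germany, Spain)
- 2 ID columns: `CustomerId` and `id`. `id` is unique per row (165,034 values) while there are only 23,221 distinct `CustomerId` values, i.e. about 7 rows per customer ID on average. (Open question: are the repeated rows related to Tenure?)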

In [14]:
# Render floats with thousands separators and two decimals in displayed frames.
pd.options.display.float_format = '{:,.2f}'.format


def summary(df):
    """Build a one-row-per-column profile of ``df``.

    Prints the shape of ``df`` and returns a DataFrame indexed by the column
    names of ``df`` with these columns:

    - ``data type``: dtype of the column.
    - ``#missing`` / ``%missing``: count and percentage of null entries.
    - ``#unique``: number of distinct values as reported by ``nunique()``.
    - ``min``, ``max``, ``average``, ``standard_deviation``: taken from
      ``df.describe(include='all')``; NaN wherever pandas reports no such
      statistic (for example object columns).
    - ``first value``, ``second value``, ``third value``: the entries of the
      first three rows selected *by position*, so the index of ``df`` does not
      need to start at 0; NaN when ``df`` has fewer rows than that.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.

    Returns
    -------
    pandas.DataFrame
        The per-column profile described above.
    """
    print(f'data shape: {df.shape}')
    summ = df.dtypes.to_frame(name='data type')

    missing = df.isnull().sum().to_numpy()
    summ['#missing'] = missing
    # max(..., 1) avoids 0/0 on a frame without rows; other frames are unaffected.
    summ['%missing'] = missing / max(len(df), 1) * 100
    summ['#unique'] = df.nunique().to_numpy()

    # describe() only emits min/max/mean/std when at least one numeric column
    # exists; reindex guarantees the four columns are present (NaN-filled if not).
    desc = df.describe(include='all').transpose().reindex(
        columns=['min', 'max', 'mean', 'std']
    )
    summ['min'] = desc['min'].to_numpy()
    summ['max'] = desc['max'].to_numpy()
    summ['average'] = desc['mean'].to_numpy()
    summ['standard_deviation'] = desc['std'].to_numpy()

    # Positional access: label-based df.loc[0] breaks on sorted/filtered frames.
    for position, label in enumerate(('first value', 'second value', 'third value')):
        summ[label] = df.iloc[position].to_numpy() if position < len(df) else np.nan

    return summ

# Profile every column of the training set (dtype, missing values, basic stats).
summary(TRAIN)

data shape: (165034, 14)


Unnamed: 0,data type,#missing,%missing,#unique,min,max,average,standard_deviation,first value,second value,third value
id,int64,0,0.0,165034,0.0,165033.0,82516.5,47641.36,0,1,2
CustomerId,int64,0,0.0,23221,15565701.0,15815690.0,15692005.02,71397.82,15674932,15749177,15694510
Surname,object,0,0.0,2797,,,,,Okwudilichukwu,Okwudiliolisa,Hsueh
CreditScore,int64,0,0.0,457,350.0,850.0,656.45,80.1,668,627,678
Geography,object,0,0.0,3,,,,,France,France,France
Gender,object,0,0.0,2,,,,,Male,Male,Male
Age,float64,0,0.0,71,18.0,92.0,38.13,8.87,33.00,33.00,40.00
Tenure,int64,0,0.0,11,0.0,10.0,5.02,2.81,3,1,10
Balance,float64,0,0.0,30075,0.0,250898.09,55478.09,62817.66,0.00,0.00,0.00
NumOfProducts,int64,0,0.0,4,1.0,4.0,1.55,0.55,2,2,2


### Data Cleaning

In [33]:
# Work on an independent frame so the raw TRAIN data is never modified. `id` is
# unique for every row, so it is dropped.
train = TRAIN.drop(columns=['id']).copy()

# One-hot encode the two low-cardinality categorical columns. The empty prefix
# and separator make the new columns carry the bare category names.
dummies = pd.get_dummies(train[['Gender', 'Geography']], prefix='', prefix_sep='')
train_d = pd.concat([train, dummies], axis=1)

# NOTE: despite its name, `test` holds the *training* rows ordered by customer
# and tenure; it is not the Kaggle TEST frame. The name is kept because a later
# cell reads it.
test = train_d.sort_values(by=['CustomerId', 'Tenure'])

# Number of rows per CustomerId, shown from the least to the most frequent.
t = train_d.groupby(by=['CustomerId']).size().reset_index(name='count')
t.sort_values(by=['count', 'CustomerId'])

Unnamed: 0,CustomerId,count
3,15565759,1
7,15565807,1
10,15565891,1
12,15566028,1
18,15566187,1
...,...,...
21154,15793331,90
2953,15595588,91
1775,15585835,98
241,15570194,99


In [30]:
# Show every row that shares one CustomerId. `test` is the training data sorted
# by CustomerId and Tenure (not the Kaggle test set).
test[test["CustomerId"] == 15682355]

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,...,Zubarev,Zubareva,Zuev,Zuyev,Zuyeva,France,Germany,Spain,Female,Male
20482,15682355,663,35.00,0,0.00,1,1.00,0.00,138539.73,1,...,0,0,0,0,0,1,0,0,0,1
24598,15682355,813,33.00,0,117419.35,1,0.00,1.00,120736.04,0,...,0,0,0,0,0,0,1,0,0,1
35178,15682355,717,37.00,0,0.00,1,1.00,0.00,162697.54,1,...,0,0,0,0,0,1,0,0,1,0
5063,15682355,790,44.00,1,128100.75,1,1.00,0.00,164517.96,1,...,0,0,0,0,0,0,0,1,0,1
63843,15682355,603,27.00,1,131647.01,1,1.00,0.00,88890.05,0,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138764,15682355,805,37.00,9,92855.02,2,1.00,0.00,92985.78,1,...,0,0,0,0,0,0,1,0,1,0
147394,15682355,553,51.00,9,132310.39,1,1.00,1.00,92934.93,1,...,0,0,0,0,0,0,0,1,1,0
154632,15682355,593,40.00,9,0.00,2,0.00,1.00,119882.91,0,...,0,0,0,0,0,0,0,1,1,0
30924,15682355,710,42.00,10,138053.97,1,1.00,0.00,114957.85,0,...,0,0,0,0,0,1,0,0,1,0


### Data Model

Idea: an LSTM model? (That is, a sequence model with memory that treats the rows sharing a CustomerId, ordered by Tenure, as a sequence.)