### Overview
    
    Welcome to the 2024 Kaggle Playground Series! Happy New Year! This is the 1st episode of Season 4. We plan to continue in the spirit of previous playgrounds, providing interesting and approachable datasets for our community to practice their machine learning skills, and we anticipate a competition each month.

    Our Goal: For this Episode of the Series, our task is to predict whether a customer continues with their account or closes it (i.e., churns). Good luck!

In [1]:
# importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import csv

In [2]:
# Load the training data from the relative data directory and preview the first rows.
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [3]:
# Drop columns that are not expected to influence `Exited` (row id and surname).
columns_to_drop = ['id', 'Surname']
data = df.drop(columns=columns_to_drop)

data.head()

Unnamed: 0,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15674932,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,15749177,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,15694510,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,15741417,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,15766172,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [4]:
# Encode gender as a binary indicator column: 1 where Gender is 'Male', 0 otherwise.
data['Gen'] = data['Gender'].eq('Male').astype(int)
data.head()

Unnamed: 0,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Gen
0,15674932,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0,1
1,15749177,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0,1
2,15694510,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0,1
3,15741417,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1
4,15766172,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0,1


In [5]:
# The original text column is no longer needed once `Gen` holds the encoded value.
data = data.drop(columns=['Gender'])
data.head()

Unnamed: 0,CustomerId,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Gen
0,15674932,668,France,33.0,3,0.0,2,1.0,0.0,181449.97,0,1
1,15749177,627,France,33.0,1,0.0,2,1.0,1.0,49503.5,0,1
2,15694510,678,France,40.0,10,0.0,2,1.0,0.0,184866.69,0,1
3,15741417,581,France,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1
4,15766172,716,Spain,33.0,5,0.0,2,1.0,1.0,15068.83,0,1


In [6]:
# Count how many rows fall into each Geography category.
data.Geography.value_counts()

France     94215
Spain      36213
Germany    34606
Name: Geography, dtype: int64

#### One-hot encoding for the Geography column

In [7]:
# Build one indicator column per distinct Geography value.
dummies = pd.get_dummies(data['Geography'])
dummies.head()

Unnamed: 0,France,Germany,Spain
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,0,0,1


In [8]:
# Attach the indicator columns to the right of the existing features.
merged = pd.concat([data, dummies], axis=1)
merged.head()

Unnamed: 0,CustomerId,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Gen,France,Germany,Spain
0,15674932,668,France,33.0,3,0.0,2,1.0,0.0,181449.97,0,1,1,0,0
1,15749177,627,France,33.0,1,0.0,2,1.0,1.0,49503.5,0,1,1,0,0
2,15694510,678,France,40.0,10,0.0,2,1.0,0.0,184866.69,0,1,1,0,0
3,15741417,581,France,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,1,0,0
4,15766172,716,Spain,33.0,5,0.0,2,1.0,1.0,15068.83,0,1,0,0,1


In [9]:
# Drop the raw Geography text column and one indicator column ('Germany'),
# leaving 'France' and 'Spain' -- presumably so Germany acts as the baseline category.
data = merged.drop(columns=['Geography', 'Germany'])
data.head()

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Gen,France,Spain
0,15674932,668,33.0,3,0.0,2,1.0,0.0,181449.97,0,1,1,0
1,15749177,627,33.0,1,0.0,2,1.0,1.0,49503.5,0,1,1,0
2,15694510,678,40.0,10,0.0,2,1.0,0.0,184866.69,0,1,1,0
3,15741417,581,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,1,0
4,15766172,716,33.0,5,0.0,2,1.0,1.0,15068.83,0,1,0,1


In [10]:
# CustomerId is removed as well, so it is not fed to the model as a feature.
data = data.drop(columns=['CustomerId'])
data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Gen,France,Spain
0,668,33.0,3,0.0,2,1.0,0.0,181449.97,0,1,1,0
1,627,33.0,1,0.0,2,1.0,1.0,49503.5,0,1,1,0
2,678,40.0,10,0.0,2,1.0,0.0,184866.69,0,1,1,0
3,581,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,1,0
4,716,33.0,5,0.0,2,1.0,1.0,15068.83,0,1,0,1


In [11]:
# Summarise column dtypes and non-null counts of the prepared training frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165034 entries, 0 to 165033
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   CreditScore      165034 non-null  int64  
 1   Age              165034 non-null  float64
 2   Tenure           165034 non-null  int64  
 3   Balance          165034 non-null  float64
 4   NumOfProducts    165034 non-null  int64  
 5   HasCrCard        165034 non-null  float64
 6   IsActiveMember   165034 non-null  float64
 7   EstimatedSalary  165034 non-null  float64
 8   Exited           165034 non-null  int64  
 9   Gen              165034 non-null  int32  
 10  France           165034 non-null  uint8  
 11  Spain            165034 non-null  uint8  
dtypes: float64(5), int32(1), int64(4), uint8(2)
memory usage: 12.3 MB


In [12]:
# Class balance of the target column.
data.Exited.value_counts()

0    130113
1     34921
Name: Exited, dtype: int64

In [13]:
# Separate the target (`Exited`) from the feature matrix (every remaining column).
y = data['Exited']
x = data.drop(columns=['Exited'])

In [14]:
# Preview the feature matrix.
x.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Gen,France,Spain
0,668,33.0,3,0.0,2,1.0,0.0,181449.97,1,1,0
1,627,33.0,1,0.0,2,1.0,1.0,49503.5,1,1,0
2,678,40.0,10,0.0,2,1.0,0.0,184866.69,1,1,0
3,581,34.0,2,148882.54,1,1.0,1.0,84560.88,1,1,0
4,716,33.0,5,0.0,2,1.0,1.0,15068.83,1,0,1


In [15]:
# Preview the target series.
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Exited, dtype: int64

In [16]:
# Features and target must have the same number of rows.
print(f"{x.shape} {y.shape}")

(165034, 11) (165034,)


In [17]:
# Logistic regression classifier constructed with scikit-learn's default hyperparameters.
model = LogisticRegression()

In [18]:
# Fit on every training row; no rows are held out for validation.
model.fit(x,y)

In [19]:
# Predict on the same rows the model was fitted on.
# NOTE(review): despite its name, `x_pred` holds predicted labels, not features.
x_pred = model.predict(x)

In [20]:
# In-sample accuracy: the predictions being scored were made on the data used for fitting,
# so this does not estimate performance on unseen rows.
# NOTE(review): `train_test_split` is imported but not used in the visible cells;
# a held-out split would give a validation score.
accuracy = accuracy_score(y, x_pred)
print(f"Accuracy for train data {accuracy*100}%")

Accuracy for train data 78.56562890071137%


In [21]:
# Load the test data and preview it.
# Note: this rebinds `df`, so the training frame is no longer reachable under that name.
df = pd.read_csv('../data/test.csv')
df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [22]:
# Drop the identifier columns from the test frame (row id, customer id, surname).
# `df` itself is left intact, so its `id` column remains available later.
columns_to_drop = ['id', 'CustomerId', 'Surname']
data = df.drop(columns=columns_to_drop)
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [23]:
# Same gender encoding as applied to the training data: 1 for 'Male', 0 otherwise.
data['Gen'] = data['Gender'].eq('Male').astype(int)
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Gen
0,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75,0
1,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27,0
2,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09,0
3,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57,1
4,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0,1


In [24]:
# The text column is redundant now that `Gen` carries the encoded value.
data = data.drop(columns=['Gender'])
data.head()

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Gen
0,586,France,23.0,2,0.0,2,0.0,1.0,160976.75,0
1,683,France,46.0,2,0.0,1,1.0,0.0,72549.27,0
2,656,France,34.0,7,0.0,2,1.0,0.0,138882.09,0
3,681,France,36.0,8,0.0,1,1.0,0.0,113931.57,1
4,752,Germany,38.0,10,121263.62,1,1.0,0.0,139431.0,1


In [25]:
# One-hot encode Geography on the test frame, mirroring the training-data steps:
# build indicator columns, append them, then drop the raw text column and 'Germany'.
# (The intermediate `.head()` calls were removed: only a cell's last expression is displayed.)
dummies = pd.get_dummies(data['Geography'])
merged = pd.concat([data, dummies], axis='columns')
data = merged.drop(columns=['Geography', 'Germany'])

# The fitted model expects exactly the training feature columns, in the same order.
# Encoding train and test separately can diverge (e.g. a category absent from one file),
# so fail loudly here rather than predict on misaligned features.
assert list(data.columns) == list(x.columns), (
    f"test features {list(data.columns)} do not match training features {list(x.columns)}"
)

data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Gen,France,Spain
0,586,23.0,2,0.0,2,0.0,1.0,160976.75,0,1,0
1,683,46.0,2,0.0,1,1.0,0.0,72549.27,0,1,0
2,656,34.0,7,0.0,2,1.0,0.0,138882.09,0,1,0
3,681,36.0,8,0.0,1,1.0,0.0,113931.57,1,1,0
4,752,38.0,10,121263.62,1,1.0,0.0,139431.0,1,0,0


In [26]:
# `x_test` is a second name for the prepared test frame `data` (no copy is made).
x_test = data

In [27]:
# Preview the test feature matrix.
x_test.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Gen,France,Spain
0,586,23.0,2,0.0,2,0.0,1.0,160976.75,0,1,0
1,683,46.0,2,0.0,1,1.0,0.0,72549.27,0,1,0
2,656,34.0,7,0.0,2,1.0,0.0,138882.09,0,1,0
3,681,36.0,8,0.0,1,1.0,0.0,113931.57,1,1,0
4,752,38.0,10,121263.62,1,1.0,0.0,139431.0,1,0,0


In [29]:
# Row and column count of the test feature matrix.
print(x_test.shape)

(110023, 11)


In [31]:
# Predict class labels for the test rows with the fitted model,
# then display the type of the returned container.
prediction = model.predict(x_test)
type(prediction)

numpy.ndarray

In [35]:
# Build the submission file from the test-set predictions.
# `prediction` is row-aligned with `df` (the test frame): `x_test` was derived from `df`
# by dropping/adding columns only, without reordering or removing rows.

# Convert predictions to a Pandas DataFrame (flatten() guarantees a 1D column)
predictions_df = pd.DataFrame({'Exited': prediction.flatten()})

# Key the rows on `id`, not `CustomerId`: sample_submission.csv has the columns
# `id` and `Exited`, and its ids match the `id` column of test.csv.
submission_df = pd.concat([df['id'], predictions_df], axis=1)

# NOTE(review): the sample submission holds float values (0.5) in `Exited`. If the
# competition scores probabilities rather than hard labels, submit
# model.predict_proba(x_test)[:, 1] instead -- confirm against the competition rules.

# Save the submission file
submission_df.to_csv('submission_file.csv', index=False)

In [36]:
# Load the provided sample submission and preview it.
sample_data = pd.read_csv('../data/sample_submission.csv')
sample_data.head()

Unnamed: 0,id,Exited
0,165034,0.5
1,165035,0.5
2,165036,0.5
3,165037,0.5
4,165038,0.5


In [38]:
# NOTE(review): this is the `Exited` column of the sample submission, not ground-truth
# labels for the test set -- the name `y_test` is misleading. Do not score predictions
# against it.
y_test = sample_data['Exited']
y_test.head()

0    0.5
1    0.5
2    0.5
3    0.5
4    0.5
Name: Exited, dtype: float64

In [40]:
# Distribution of values in the sample submission's `Exited` column.
y_test.value_counts()

0.5    110023
Name: Exited, dtype: int64