### Overview
    
    Welcome to the 2024 Kaggle Playground Series! Happy New Year! This is the 1st episode of Season 4. We plan to continue in the spirit of previous playgrounds, providing interesting an approachable datasets for our community to practice their machine learning skills, and anticipate a competition each month.

Our Goal: For this Episode of the Series, our task is to predict whether a customer continues with their account or closes it (e.g., churns). Good luck!

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

#### Read data

In [2]:
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


#### Dropping Unnecessary Columns

In [3]:
columns_to_drop = ['id', 'Surname']
data = df.drop(columns=columns_to_drop, axis=1)

In [4]:
# Create a binary column for gender (1 for Male, 0 for Female)
data['Gen'] = (data['Gender'] == 'Male').astype(int)

In [5]:
# Drop the original 'Gender' column
data.drop(columns=['Gender'], inplace=True, axis=1)

#### One-hot encoding for 'Geography' column

In [6]:
dummies = pd.get_dummies(data.Geography)
merged = pd.concat([data, dummies], axis='columns')
data = merged.drop(['Geography', 'Germany'], axis=1)

In [7]:
# Drop 'CustomerId' column
data.drop(columns=['CustomerId'], inplace=True, axis=1)

In [8]:
data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Gen,France,Spain
0,668,33.0,3,0.0,2,1.0,0.0,181449.97,0,1,1,0
1,627,33.0,1,0.0,2,1.0,1.0,49503.5,0,1,1,0
2,678,40.0,10,0.0,2,1.0,0.0,184866.69,0,1,1,0
3,581,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,1,0
4,716,33.0,5,0.0,2,1.0,1.0,15068.83,0,1,0,1


#### Define features (X) and target variable (y)

In [9]:
X = data.drop(columns='Exited', axis=1)
y = data['Exited']

In [10]:
# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

#### Standardize the features

In [11]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

#### Build a neural network model

In [12]:
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

#### compile the model

In [13]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [14]:
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1fc0d08beb0>

In [15]:
# Evaluate the model on the training set
train_accuracy = model.evaluate(X_train, y_train, verbose=0)[1]
print(f"Accuracy for train data: {train_accuracy * 100}%")

Accuracy for train data: 86.57168745994568%


In [16]:
# Evaluate the model on the validation set
val_accuracy = model.evaluate(X_val, y_val, verbose=0)[1]
print(f"Accuracy for validation data: {val_accuracy * 100}%")

Accuracy for validation data: 86.38470768928528%


### Load the test data

In [17]:
df_test = pd.read_csv('../data/test.csv')
columns_to_drop_test = ['id', 'CustomerId', 'Surname']
data_test = df_test.drop(columns=columns_to_drop_test, axis=1)
data_test.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


#### Processing the test data similar to the training data

In [18]:
data_test['Gen'] = (data_test['Gender'] == 'Male').astype(int)
data_test.drop(columns=['Gender'], inplace=True, axis=1)

dummies_test = pd.get_dummies(data_test.Geography)
merged_test = pd.concat([data_test, dummies_test], axis='columns')

data_test = merged_test.drop(['Geography', 'Germany'], axis=1)

In [19]:
data_test.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Gen,France,Spain
0,586,23.0,2,0.0,2,0.0,1.0,160976.75,0,1,0
1,683,46.0,2,0.0,1,1.0,0.0,72549.27,0,1,0
2,656,34.0,7,0.0,2,1.0,0.0,138882.09,0,1,0
3,681,36.0,8,0.0,1,1.0,0.0,113931.57,1,1,0
4,752,38.0,10,121263.62,1,1.0,0.0,139431.0,1,0,0


In [20]:
# Standardize the test features
X_test = scaler.transform(data_test)

In [21]:
# Make predictions on the test set
predictions = model.predict(X_test)



In [23]:
# Add CustomerID from the test dataset
submission_df = pd.concat([df_test['CustomerId'], pd.DataFrame(predictions, columns=['Exited'])], axis=1)

In [24]:
# Save the submission file
submission_df.to_csv('submission_file.csv', index=False)

In [25]:
# Check the distribution of predictions
print(submission_df['Exited'].value_counts())

0.390048    3
0.102528    3
0.878370    2
0.197869    2
0.111878    2
           ..
0.013223    1
0.108499    1
0.194727    1
0.017475    1
0.176277    1
Name: Exited, Length: 109692, dtype: int64
