In [2]:
# Load dataset
import pandas as pd 
import numpy as np

Train_set = pd.read_csv('/kaggle/input/playground-series-s4e1/train.csv')
print(Train_set.head())

   id  CustomerId         Surname  CreditScore Geography Gender   Age  Tenure  \
0   0    15674932  Okwudilichukwu          668    France   Male  33.0       3   
1   1    15749177   Okwudiliolisa          627    France   Male  33.0       1   
2   2    15694510           Hsueh          678    France   Male  40.0      10   
3   3    15741417             Kao          581    France   Male  34.0       2   
4   4    15766172       Chiemenam          716     Spain   Male  33.0       5   

     Balance  NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  \
0       0.00              2        1.0             0.0        181449.97   
1       0.00              2        1.0             1.0         49503.50   
2       0.00              2        1.0             0.0        184866.69   
3  148882.54              1        1.0             1.0         84560.88   
4       0.00              2        1.0             1.0         15068.83   

   Exited  
0       0  
1       0  
2       0  
3       0  
4 

In [3]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 1. Remove rows with missing values in any column
Train_set = Train_set.dropna()

# Label encoding the categorical features
categorical_features = ["Surname", "Geography", "Gender"]
label_encoder_X = LabelEncoder()

for feature in categorical_features:
    Train_set[feature] = label_encoder_X.fit_transform(Train_set[feature])

# Scaling numerical features
numerical_features = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary"]
scaler = StandardScaler()

for feature in numerical_features:
    Train_set[feature] = scaler.fit_transform(Train_set[feature].values.reshape(-1, 1))

Train_set.drop(["id", "CustomerId"], axis=1, inplace=True)

print(Train_set.head())

   Surname  CreditScore  Geography  Gender       Age    Tenure   Balance  \
0     1924     0.144135          0       1 -0.578074 -0.719973 -0.883163   
1     1925    -0.367706          0       1 -0.578074 -1.432694 -0.883163   
2     1178     0.268974          0       1  0.211354  1.774548 -0.883163   
3     1299    -0.941966          0       1 -0.465299 -1.076334  1.486918   
4      467     0.743362          2       1 -0.578074 -0.007253 -0.883163   

   NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  Exited  
0       0.814298        1.0             0.0         1.369486       0  
1       0.814298        1.0             1.0        -1.254085       0  
2       0.814298        1.0             0.0         1.437422       0  
3      -1.013348        1.0             1.0        -0.557018       0  
4       0.814298        1.0             1.0        -1.938770       0  


In [4]:
# 2. Split dataset into training and validation sets
from sklearn.model_selection import train_test_split

X = Train_set.drop(["Exited"], axis=1).values
y = Train_set["Exited"].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

print("X_train shape: ", X_train.shape)

X_train shape:  (132027, 11)


In [28]:
# create dataloader
import torch
from torch.utils.data import TensorDataset, DataLoader

batch_size = 32

train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_data = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size)

val_loader_iter = iter(val_loader)
data, label = next(val_loader_iter)
print(data[1])

tensor([ 1.2990e+03,  6.6846e-01,  0.0000e+00,  1.0000e+00, -4.6530e-01,
        -1.0763e+00, -8.8316e-01,  8.1430e-01,  1.0000e+00,  1.0000e+00,
         9.7180e-01], dtype=torch.float64)


In [6]:
# Build the model

import torch.nn as nn
import torch.nn.functional as F

class FeedForward(nn.Module):
    def __init__(self, num_hidden):
        super(FeedForward, self).__init__()
        self.num_hidden = num_hidden
        
        self.net = nn.Sequential(
            nn.Linear(self.num_hidden, self.num_hidden),
            nn.ReLU(),
        )
    
    def forward(self, x):
        return self.net(x)
    
class Net(nn.Module):
    def __init__(self, input_size, hidden_layers, hidden_size, output_size):
        super(Net, self).__init__()
        self.input_size = input_size
        self.hidden_layers = hidden_layers
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.fc1 = nn.Linear(self.input_size, self.hidden_size)
        self.blocks = nn.ModuleList([FeedForward(self.hidden_size) for _ in range(self.hidden_layers)])
        self.fc2 = nn.Linear(self.hidden_size, self.output_size)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        for block in self.blocks:
            x = block(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [7]:
# Defining hyperparameters
input_size = X_train.shape[1]
hidden_layers = 3
hidden_size = 64
output_size = 1
alpha = 1e-3
epochs = 200

# Initialize the model
model = Net(input_size, hidden_layers, hidden_size, output_size)

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=alpha)

In [8]:
# Train the model

for epoch in range(epochs):
    train_loss = 0.0

    model.train()
    
    for data, target in train_loader:
        optimizer.zero_grad()
        output = model.forward(data.float())
        loss = criterion(output.squeeze(), target.float())
        loss.backward()
        optimizer.step()

        train_loss += loss.item()*data.size(0)

    train_loss = train_loss/len(train_loader.dataset)

    if epoch % 10 == 0:
        print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))

Epoch: 1 	Training Loss: 0.525553
Epoch: 11 	Training Loss: 0.517186
Epoch: 21 	Training Loss: 0.516045
Epoch: 31 	Training Loss: 0.516529
Epoch: 41 	Training Loss: 0.515252
Epoch: 51 	Training Loss: 0.480728
Epoch: 61 	Training Loss: 0.355956
Epoch: 71 	Training Loss: 0.350045
Epoch: 81 	Training Loss: 0.345312
Epoch: 91 	Training Loss: 0.340246
Epoch: 101 	Training Loss: 0.337265
Epoch: 111 	Training Loss: 0.334171
Epoch: 121 	Training Loss: 0.332686
Epoch: 131 	Training Loss: 0.332113
Epoch: 141 	Training Loss: 0.330289
Epoch: 151 	Training Loss: 0.329825
Epoch: 161 	Training Loss: 0.328108
Epoch: 171 	Training Loss: 0.327675
Epoch: 181 	Training Loss: 0.327421
Epoch: 191 	Training Loss: 0.325939


In [11]:
torch.save(model.state_dict(), "churn_model_weights.pt")

In [35]:
model.eval()
correct = 0
total_samples = 0

with torch.no_grad():
    for data, label in val_loader:
        # Forward pass
        output = model(data.float())
        
        # Threshold the output and convert to the same type as target
        pred = (output > 0.5).float()

        # Update correct count
        correct += (pred == label.view_as(pred)).sum().item()
        
        # Update total samples count
        total_samples += data.size(0)

# Calculate accuracy
accuracy = correct / total_samples * 100.0
print(f'Validation Accuracy: {accuracy:.4f}%')

Validation Accuracy: 85.4910%


In [65]:
Test_set = pd.read_csv("/kaggle/input/playground-series-s4e1/test.csv")

# Test set preprocessing

Test_set = Test_set.dropna()
print(len(Test_set))
# Label encoding the categorical features
categorical_features = ["Surname", "Geography", "Gender"]
label_encoder_X = LabelEncoder()

for feature in categorical_features:
    Test_set[feature] = label_encoder_X.fit_transform(Test_set[feature])

# Scaling numerical features
numerical_features = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary"]
scaler = StandardScaler()

for feature in numerical_features:
    Test_set[feature] = scaler.fit_transform(Test_set[feature].values.reshape(-1, 1))
    
Test_set = Test_set.drop(["CustomerId"], axis=1)
Test_ids = Test_set["id"]
Test_set = Test_set.drop(["id"], axis=1)
print(Test_set.head())


# create dataloader
test_X = Test_set
test_X = test_X.values

test_X = TensorDataset(torch.from_numpy(test_X))
test_loader = DataLoader(test_X, shuffle=False, batch_size=batch_size)
print(len(test_loader))

110023
   Surname  CreditScore  Geography  Gender       Age    Tenure   Balance  \
0     1482    -0.878176          0       0 -1.706504 -1.067887 -0.881274   
1     1812     0.329567          0       0  0.888990 -1.067887 -0.881274   
2     1246    -0.006609          0       0 -0.465181  0.713922 -0.881274   
3     1832     0.304665          0       1 -0.239486  1.070284 -0.881274   
4     1079     1.188684          1       1 -0.013791  1.783008  1.050038   

   NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  
0       0.820030        0.0             1.0         0.967874  
1      -1.015806        1.0             0.0        -0.790939  
2       0.820030        1.0             0.0         0.528413  
3      -1.015806        1.0             0.0         0.032150  
4      -1.015806        1.0             0.0         0.539331  
3439
test_ids shape:  110023


In [66]:
predictions = []
model.eval()

with torch.no_grad():
    for data in test_loader:
        outputs = model.forward(data[0].float())
        predictions.extend(outputs.squeeze().tolist())

print(len(predictions))

110023
165035


In [68]:
import pandas as pd


df = pd.DataFrame({"id": Test_ids, "Exited": predictions})

# Save to CSV
df.to_csv("predictions.csv", index=False)