#### Import

In [115]:
%pip install category_encoders



In [116]:
import os, time

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from category_encoders import CountEncoder
from sklearn.metrics import roc_auc_score

from sklearn.neural_network import MLPClassifier

import torch
import torch.nn as nn
import torch.optim as optim

#### Preprocessing

In [117]:
def _read_sorted_by_purchase_date(csv_path):
    """Read a CSV, parse ``PurchDate`` as a datetime and return rows in date order.

    The index is rebuilt after sorting so that it runs 0..n-1 in chronological order.
    """
    frame = pd.read_csv(csv_path, parse_dates=['PurchDate'])
    return frame.sort_values(by=['PurchDate']).reset_index(drop=True)


train_df = _read_sorted_by_purchase_date('./drive/MyDrive/data/ML_8/train.csv')
test_df = _read_sorted_by_purchase_date('./drive/MyDrive/data/ML_8/test.csv')

In [118]:
# Time-based split: the earliest third of purchases is used for training,
# the middle third for validation and the most recent third as a hold-out test set.
dates = train_df['PurchDate']
date_split_1, date_split_2 = dates.quantile(1/3), dates.quantile(2/3)

is_train_period = dates <= date_split_1
is_valid_period = (dates > date_split_1) & (dates <= date_split_2)
is_test_period = dates > date_split_2

# .copy() gives each split its own data so later column assignments do not touch train_df.
train = train_df.loc[is_train_period].copy()
valid = train_df.loc[is_valid_period].copy()
test = train_df.loc[is_test_period].copy()

In [119]:
# Column groups are taken from the full frame's dtypes: object columns are treated
# as categorical, float64/int64 columns as numeric.
cat_cols = train_df.select_dtypes(include=['object']).columns.to_list()
num_cols = train_df.select_dtypes(include=['float64', 'int64']).columns.to_list()

cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='median')

# Each imputer learns its fill values on the training split only and then applies
# them unchanged to the validation and test splits.
for imputer, cols in ((cat_imputer, cat_cols), (num_imputer, num_cols)):
    train[cols] = imputer.fit_transform(train[cols])
    valid[cols] = imputer.transform(valid[cols])
    test[cols] = imputer.transform(test[cols])

In [120]:
# The purchase date was only needed to build the time-based split, so it is removed
# from every split here. errors='ignore' keeps the cell safe to re-run: a second
# execution finds the column already gone and leaves the frames untouched instead of
# raising KeyError, and no in-place mutation or stray loop variable is involved.
train, valid, test = (
    part.drop(columns=['PurchDate'], errors='ignore')
    for part in (train, valid, test)
)

In [121]:
# Replace categorical columns with values produced by CountEncoder.
# The encoder is fitted on the training split only; validation and test
# are transformed with that fitted encoder.
encoder = CountEncoder()
train.loc[:, cat_cols] = encoder.fit_transform(train[cat_cols])

for holdout in (valid, test):
    holdout.loc[:, cat_cols] = encoder.transform(holdout[cat_cols])

In [122]:
def _features_and_target(frame, target='IsBadBuy'):
    """Return ``(features, labels)``: the frame without ``target`` and the ``target`` column."""
    return frame.drop(columns=[target]), frame[target]


X_train, y_train = _features_and_target(train)
X_valid, y_valid = _features_and_target(valid)
X_test, y_test = _features_and_target(test)

In [123]:
# Standardise features using statistics estimated on the training split only.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Labels become integer column vectors of shape (n_samples, 1), the layout the custom
# MLP's gradient computation expects. np.asarray accepts both the original pandas
# Series and an already-converted ndarray, so re-running this cell no longer fails
# on the missing `.values` attribute.
y_train = np.asarray(y_train).astype(int).reshape(-1, 1)
y_valid = np.asarray(y_valid).astype(int).reshape(-1, 1)
y_test = np.asarray(y_test).astype(int).reshape(-1, 1)

#### MLP

In [124]:
class MLP:
    """Binary classifier with a single hidden layer, implemented in NumPy.

    The network computes ``sigmoid(act(X @ W1 + b1) @ W2 + b2)`` and is trained with
    mini-batch gradient descent on the binary cross-entropy loss.

    Parameters
    ----------
    n_hidden : int
        Number of hidden units.
    activation : {'tanh', 'sigmoid', 'relu', 'cos'}
        Hidden-layer activation function.
    learning_rate : float
        Step size used by the optimizer.
    n_epochs : int
        Number of passes over the training data.
    batch_size : int
        Number of samples per mini-batch.
    optimizer : {'sgd', 'adam'}
        Parameter update rule.
    verbose : bool
        If True, print the training loss (and the validation loss when validation
        data is supplied to ``fit``) after every epoch.
    random_state : int or None
        Seed for weight initialisation and batch shuffling. ``None`` (default) draws
        from NumPy's global random state, exactly as before this option existed.
    """

    _SUPPORTED_OPTIMIZERS = ('sgd', 'adam')

    def __init__(self, n_hidden=100, activation='tanh', learning_rate=0.01, n_epochs=10, batch_size=32, optimizer='sgd', verbose=False, random_state=None):
        self.n_hidden = n_hidden
        self.activation = activation
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.optimizer = optimizer
        self.verbose = verbose
        self.random_state = random_state

    def _initialize_weights(self, input_dim, rng=None):
        """Create small random weights, zero biases and (for Adam) zeroed moment buffers.

        ``rng`` is any object exposing ``randn``; it defaults to the ``np.random`` module.
        """
        rng = np.random if rng is None else rng
        self.W1 = 0.01 * rng.randn(input_dim, self.n_hidden)
        self.b1 = np.zeros((1, self.n_hidden))
        self.W2 = 0.01 * rng.randn(self.n_hidden, 1)
        self.b2 = np.zeros((1, 1))

        if self.optimizer == 'adam':
            # First (m*) and second (v*) moment estimates, one per parameter array.
            self.mW1 = np.zeros_like(self.W1)
            self.mb1 = np.zeros_like(self.b1)
            self.mW2 = np.zeros_like(self.W2)
            self.mb2 = np.zeros_like(self.b2)
            self.vW1 = np.zeros_like(self.W1)
            self.vb1 = np.zeros_like(self.b1)
            self.vW2 = np.zeros_like(self.W2)
            self.vb2 = np.zeros_like(self.b2)
            self.t = 0  # Time step for bias correction

    def _sigmoid(self, z):
        """Logistic function; the input is clipped so ``np.exp`` cannot overflow."""
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def _binary_cross_entropy(self, y_true, y_pred):
        """Mean binary cross-entropy; predictions are clipped away from 0 and 1 before the log."""
        y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
        return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    def _forward(self, X):
        """Run the network on ``X``, caching Z1/A1/Z2/A2 for the backward pass."""
        self.Z1 = X @ self.W1 + self.b1
        self.A1 = self._activation(self.Z1)
        self.Z2 = self.A1 @ self.W2 + self.b2
        self.A2 = self._sigmoid(self.Z2)
        return self.A2

    def _backward(self, X, y, output):
        """Return batch-averaged gradients ``(dW1, db1, dW2, db2)``.

        Relies on the activations cached by the most recent ``_forward`` call on the
        same batch. ``y`` and ``output`` must both have shape (batch, 1).
        """
        m = X.shape[0]
        # For a sigmoid output with cross-entropy loss the gradient w.r.t. Z2 is (output - y).
        dZ2 = output - y
        dW2 = self.A1.T @ dZ2 / m
        db2 = np.sum(dZ2, axis=0, keepdims=True) / m

        dA1 = dZ2 @ self.W2.T
        dZ1 = dA1 * self._activation_derivative(self.Z1)
        dW1 = X.T @ dZ1 / m
        db1 = np.sum(dZ1, axis=0, keepdims=True) / m
        return dW1, db1, dW2, db2

    def _activation(self, z):
        """Apply the configured hidden activation; raises ValueError for an unknown name."""
        if self.activation == 'tanh':
            return np.tanh(z)
        elif self.activation == 'sigmoid':
            return self._sigmoid(z)
        elif self.activation == 'relu':
            return np.maximum(0, z)
        elif self.activation == 'cos':
            return np.cos(z)
        else:
            raise ValueError("Unknown activation")

    def _activation_derivative(self, z):
        """Derivative of the configured hidden activation evaluated at ``z``."""
        if self.activation == 'tanh':
            return 1 - np.tanh(z) ** 2
        elif self.activation == 'sigmoid':
            # Reuse the clipped sigmoid so large |z| cannot overflow np.exp and so the
            # derivative matches the function used in the forward pass.
            sig = self._sigmoid(z)
            return sig * (1 - sig)
        elif self.activation == 'relu':
            return (z > 0).astype(float)
        elif self.activation == 'cos':
            return -np.sin(z)
        else:
            raise ValueError("Unknown activation")

    def _update_parameters(self, dW1, db1, dW2, db2):
        """Apply one optimizer step in place; raises ValueError for an unknown optimizer."""
        if self.optimizer == 'sgd':
            self.W1 -= self.learning_rate * dW1
            self.b1 -= self.learning_rate * db1
            self.W2 -= self.learning_rate * dW2
            self.b2 -= self.learning_rate * db2
        elif self.optimizer == 'adam':
            beta1, beta2, epsilon = 0.9, 0.999, 1e-8
            self.t += 1
            for param, dparam, m, v in zip([self.W1, self.b1, self.W2, self.b2],
                                           [dW1, db1, dW2, db2],
                                           [self.mW1, self.mb1, self.mW2, self.mb2],
                                           [self.vW1, self.vb1, self.vW2, self.vb2]):
                # Slice assignment / in-place subtraction mutate the arrays stored on
                # self; rebinding the loop variables would discard the update.
                m[:] = beta1 * m + (1 - beta1) * dparam
                v[:] = beta2 * v + (1 - beta2) * (dparam ** 2)
                m_hat = m / (1 - beta1 ** self.t)
                v_hat = v / (1 - beta2 ** self.t)
                param -= self.learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
        else:
            # Previously an unknown optimizer silently left the weights untouched.
            raise ValueError("Unknown optimizer")

    def fit(self, X, y, X_val=None, y_val=None):
        """Train the network.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Binary labels; reshaped internally to a column vector.
        X_val, y_val : array-like, optional
            Validation data, used only for the per-epoch report when ``verbose`` is True.

        Raises
        ------
        ValueError
            If the optimizer name is unknown or X and y disagree on the sample count.
        """
        if self.optimizer not in self._SUPPORTED_OPTIMIZERS:
            raise ValueError("Unknown optimizer")

        X = np.asarray(X, dtype=float)
        # A 1-D label vector would broadcast (batch, 1) - (batch,) into a
        # (batch, batch) matrix in _backward, so force a column vector.
        y = np.asarray(y).reshape(-1, 1)
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        has_validation = X_val is not None and y_val is not None
        if has_validation:
            X_val = np.asarray(X_val, dtype=float)
            y_val = np.asarray(y_val).reshape(-1, 1)

        # None keeps the historical behaviour of drawing from NumPy's global state.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)

        n_samples, n_features = X.shape
        self._initialize_weights(n_features, rng)

        for epoch in range(self.n_epochs):
            idx = rng.permutation(n_samples)
            X_shuffled, y_shuffled = X[idx], y[idx]
            for i in range(0, n_samples, self.batch_size):
                X_batch = X_shuffled[i:i + self.batch_size]
                y_batch = y_shuffled[i:i + self.batch_size]

                output = self._forward(X_batch)
                dW1, db1, dW2, db2 = self._backward(X_batch, y_batch, output)
                self._update_parameters(dW1, db1, dW2, db2)

            if self.verbose:
                # Report after the epoch's updates so the numbers describe the weights
                # this epoch produced (including the final epoch).
                train_loss = self._binary_cross_entropy(y, self._forward(X))
                message = f"Epoch {epoch+1} | Train Loss: {train_loss:.4f}"
                if has_validation:
                    val_loss = self._binary_cross_entropy(y_val, self._forward(X_val))
                    message += f" | Val Loss: {val_loss:.4f}"
                print(message)

    def predict_proba(self, X):
        """Return an (n_samples, 2) array: column 0 is P(class 0), column 1 is P(class 1)."""
        proba_1 = self._forward(X).flatten()
        return np.vstack([1 - proba_1, proba_1]).T

    def predict(self, X):
        """Return 0/1 predictions using a 0.5 probability threshold."""
        return (self._forward(X) >= 0.5).astype(int).flatten()

In [125]:
def gini_score(y_true, y_prob):
    """Return the Gini coefficient, computed as ``2 * ROC AUC - 1``.

    ``y_true`` and ``y_prob`` are passed unchanged to ``roc_auc_score``.
    """
    return 2 * roc_auc_score(y_true, y_prob) - 1

In [126]:
# Custom MLP: tanh hidden layer, Adam optimizer.
# Validation data is handed to fit, but verbose is left at its default (False),
# so no per-epoch losses are printed.
model = MLP(n_hidden=128, activation='tanh', learning_rate=0.01,
            optimizer='adam', n_epochs=8, batch_size=64)
model.fit(X_train_scaled, y_train, X_valid_scaled, y_valid)

# Column 1 of predict_proba is the predicted probability of the positive class.
y_valid_pred_custom = model.predict_proba(X_valid_scaled)[:, 1]

print(f"Custom MLP Gini (Adam optimizer): {gini_score(y_valid, y_valid_pred_custom):.4f}")

Custom MLP Gini (Adam optimizer): 0.3186


In [127]:
# Custom MLP: sigmoid hidden layer, Adam optimizer.
model = MLP(n_hidden=128, activation='sigmoid', learning_rate=0.01,
            optimizer='adam', n_epochs=8, batch_size=64)
model.fit(X_train_scaled, y_train)

# Column 1 of predict_proba is the predicted probability of the positive class.
y_valid_pred_custom = model.predict_proba(X_valid_scaled)[:, 1]

print(f"Custom MLP Gini (Adam optimizer): {gini_score(y_valid, y_valid_pred_custom):.4f}")

Custom MLP Gini (Adam optimizer): 0.3506


In [128]:
# Custom MLP: tanh hidden layer, plain mini-batch SGD.
model = MLP(n_hidden=128, activation='tanh', learning_rate=0.01,
            optimizer='sgd', n_epochs=8, batch_size=64)
model.fit(X_train_scaled, y_train)

# Column 1 of predict_proba is the predicted probability of the positive class.
y_valid_pred_custom = model.predict_proba(X_valid_scaled)[:, 1]

print(f"Custom MLP Gini (SGD optimizer): {gini_score(y_valid, y_valid_pred_custom):.4f}")

Custom MLP Gini (SGD optimizer): 0.3185


In [129]:
# Custom MLP: sigmoid hidden layer, plain mini-batch SGD.
model = MLP(n_hidden=128, activation='sigmoid', learning_rate=0.01,
            optimizer='sgd', n_epochs=8, batch_size=64)
model.fit(X_train_scaled, y_train)

# Column 1 of predict_proba is the predicted probability of the positive class.
y_valid_pred_custom = model.predict_proba(X_valid_scaled)[:, 1]

print(f"Custom MLP Gini (SGD optimizer): {gini_score(y_valid, y_valid_pred_custom):.4f}")

Custom MLP Gini (SGD optimizer): 0.2823


In [130]:
# scikit-learn baseline with the same hidden width and learning rate as the custom runs.
sk_model = MLPClassifier(hidden_layer_sizes=(128,), activation='logistic', solver='adam',
                         learning_rate_init=0.01, max_iter=20, random_state=42)

# The labels are stored as a column vector; ravel() flattens them to 1-D for scikit-learn.
sk_model.fit(X_train_scaled, y_train.ravel())

y_valid_pred_sklearn = sk_model.predict_proba(X_valid_scaled)[:, 1]

print(f"Sklearn MLP Gini:, {gini_score(y_valid, y_valid_pred_sklearn):.4f}")

Sklearn MLP Gini:, 0.3625




In [131]:
# Compare hidden-layer activations with every other setting held fixed
# (batch_size is not passed, so the class default applies).
activations = ['tanh', 'sigmoid', 'relu', 'cos']

for act in activations:
    model = MLP(
        n_hidden=128,
        activation=act,
        learning_rate=0.01,
        n_epochs=12,
        optimizer='adam',
    )
    model.fit(X_train_scaled, y_train)

    y_valid_pred = model.predict_proba(X_valid_scaled)[:, 1]
    gini = gini_score(y_valid, y_valid_pred)

    print(f"Activation: {act} → Gini: {gini:.4f}")

Activation: tanh → Gini: 0.3094
Activation: sigmoid → Gini: 0.3455
Activation: relu → Gini: 0.3624
Activation: cos → Gini: 0.2438


In [132]:
class TorchMLP(nn.Module):
    """One-hidden-layer binary classifier: Linear -> ReLU -> Linear -> Sigmoid.

    The final ``nn.Sigmoid`` means ``forward`` already returns probabilities in [0, 1],
    which is what ``nn.BCELoss`` expects.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_dim : int
        Number of hidden units.
    """

    def __init__(self, input_dim, hidden_dim=100):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return predicted probabilities of shape (batch, 1)."""
        return self.model(x)

    def fit(self, X_train, y_train, X_valid, y_valid, epochs=10, batch_size=64, lr=0.001, verbose=False):
        """Train with Adam on binary cross-entropy and return validation probabilities.

        Parameters
        ----------
        X_train, X_valid : array-like of shape (n_samples, input_dim)
        y_train, y_valid : array-like of binary labels; reshaped to (n_samples, 1)
        epochs : int
            Number of passes over the training data.
        batch_size : int
            Mini-batch size.
        lr : float
            Adam learning rate.
        verbose : bool
            If True, print the train and validation loss after every epoch.

        Returns
        -------
        numpy.ndarray of shape (n_valid,)
            Predicted probabilities for ``X_valid`` from the trained model.

        Notes
        -----
        The model is moved to CUDA when it is available (otherwise it stays on CPU)
        and remains on that device after training; the device is stored in
        ``self.device``.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # The parameters must live on the same device as the data; without this the
        # first forward pass fails whenever CUDA is available.
        self.to(self.device)

        criterion = nn.BCELoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)

        X_train = torch.tensor(X_train, dtype=torch.float32).to(self.device)
        y_train = torch.tensor(y_train, dtype=torch.float32).view(-1, 1).to(self.device)
        X_valid = torch.tensor(X_valid, dtype=torch.float32).to(self.device)
        y_valid = torch.tensor(y_valid, dtype=torch.float32).view(-1, 1).to(self.device)

        n_train = X_train.size(0)

        for epoch in range(epochs):
            self.train()
            permutation = torch.randperm(n_train, device=self.device)

            epoch_loss = 0

            for i in range(0, n_train, batch_size):
                indices = permutation[i:i + batch_size]
                X_batch, y_batch = X_train[indices], y_train[indices]

                optimizer.zero_grad()
                outputs = self(X_batch)
                loss = criterion(outputs, y_batch)
                loss.backward()
                optimizer.step()

                # Weight by batch size so the epoch average is per-sample even when
                # the last batch is smaller.
                epoch_loss += loss.item() * X_batch.size(0)

            if verbose:
                train_loss = epoch_loss / n_train
                # The validation loss is only needed for the report, so it is only
                # computed when the report is printed.
                self.eval()
                with torch.no_grad():
                    val_loss = criterion(self(X_valid), y_valid)
                print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss.item():.4f}")

        self.eval()
        with torch.no_grad():
            # forward() already ends in nn.Sigmoid; applying torch.sigmoid again would
            # squash the probabilities into roughly (0.5, 0.73).
            y_valid_pred = self(X_valid).cpu().numpy().flatten()

        return y_valid_pred

In [133]:
# PyTorch model with 64 hidden units; fit returns its scores for the validation set.
n_features = X_train.shape[1]
model = TorchMLP(input_dim=n_features, hidden_dim=64)

y_pred = model.fit(
    X_train_scaled, y_train,
    X_valid_scaled, y_valid,
    epochs=8,
    lr=0.001,
)

print(f"Gini score (TorchMLP): {gini_score(y_valid, y_pred):.4f}")

Gini score (TorchMLP): 0.3628


In [134]:
def evaluate_ginis(model, X_train, y_train, X_val, y_val, X_test, y_test):
    """Print the Gini score of a trained torch model on the train, validation and test sets.

    Parameters
    ----------
    model : torch.nn.Module
        Model whose forward pass returns one score per row.
    X_train, X_val, X_test : array-like
        Feature matrices; converted to float32 tensors.
    y_train, y_val, y_test : array-like
        True labels, passed to ``gini_score``.

    The inputs are moved to whichever device the model's parameters are on, and
    predictions are copied back to the CPU before conversion to NumPy, so the function
    works for models on CPU as well as on CUDA.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    def _predict(features):
        # One flat NumPy vector of scores per input matrix.
        inputs = torch.tensor(features, dtype=torch.float32).to(device)
        return model(inputs).cpu().numpy().flatten()

    model.eval()  # inference mode for any layers that behave differently in training
    with torch.no_grad():
        y_train_pred = _predict(X_train)
        y_val_pred = _predict(X_val)
        y_test_pred = _predict(X_test)

    train_gini = gini_score(y_train, y_train_pred)
    valid_gini = gini_score(y_val, y_val_pred)
    test_gini = gini_score(y_test, y_test_pred)

    print(f"Training Gini:     {train_gini:.4f}")
    print(f"Validation Gini:  {valid_gini:.4f}")
    print(f"Test Gini:        {test_gini:.4f}")

In [136]:
# Score the trained PyTorch model on all three chronological splits.
evaluate_ginis(
    model,
    X_train=X_train_scaled, y_train=y_train,
    X_val=X_valid_scaled, y_val=y_valid,
    X_test=X_test_scaled, y_test=y_test,
)

Training Gini:     0.4405
Validation Gini:  0.3628
Test Gini:        0.4268


Interpretation:
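- The model fits the training data moderately well (Train Gini = 0.4405).
- The drop on the validation set (Valid Gini = 0.3628, about 0.08 below training) is expected and indicates mild rather than severe overfitting.
- Test Gini (0.4268) is surprisingly higher than validation and close to the training score — this suggests the model generalizes to unseen data and is not significantly overfitting. Note that the three sets are consecutive time periods (split by purchase date), so part of the validation/test difference may reflect differences between periods rather than model quality.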

Conclusion:
- Model shows healthy generalization.
- No clear signs of overfitting.
- Further gains may be possible by tuning model capacity or using advanced regularization.