Imports

In [51]:
! pip install category_encoders
! pip install xgboost lightgbm catboost

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from category_encoders import CountEncoder
from collections import Counter

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.ensemble import RandomForestClassifier

import time

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

Reading the data

In [4]:
train_df = pd.read_csv('./drive/MyDrive/data/training.csv', parse_dates=['PurchDate'])
train_df = train_df.sort_values(by=['PurchDate'])

In [5]:
test_df = pd.read_csv('./drive/MyDrive/data/test.csv', parse_dates=['PurchDate'])
test_df = test_df.sort_values(by=['PurchDate'])

Filling the missing values

In [7]:
# train_df.isna().sum()

In [8]:
train_df.drop(columns=['PurchDate'], inplace=True)

cat_cols = train_df.select_dtypes(include=['object']).columns.to_list()
num_cols = train_df.select_dtypes(include=['float64', 'int64']).columns.to_list()

cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='median')

train_df[cat_cols] = cat_imputer.fit_transform(train_df[cat_cols])
train_df[num_cols] = num_imputer.fit_transform(train_df[num_cols])

In [9]:
# train_df.isna().sum()

Splitting

In [10]:
n_samples = len(train_df)
split_index = n_samples // 3

train = train_df.iloc[:split_index]
valid = train_df.iloc[split_index:2*split_index]
test = train_df.iloc[2*split_index:]

print(f'train: {train.shape}')
print(f'valid: {valid.shape}')
print(f'test: {test.shape}')

train: (24327, 33)
valid: (24327, 33)
test: (24329, 33)


Encoding

In [11]:
cat_cols = train.select_dtypes(include=['object']).columns.to_list()

count_encoder = CountEncoder()
count_encoder.fit(train[cat_cols])

train.loc[:, cat_cols] = count_encoder.transform(train[cat_cols])
valid.loc[:, cat_cols] = count_encoder.transform(valid[cat_cols])
test.loc[:, cat_cols] = count_encoder.transform(test[cat_cols])

---

Subsets

In [12]:
X_train = train.drop(columns=['IsBadBuy'])
y_train = train['IsBadBuy']

X_valid = valid.drop(columns=['IsBadBuy'])
y_valid = valid['IsBadBuy']

X_test = test.drop(columns=['IsBadBuy'])
y_test = test['IsBadBuy']

---

Custom CART Implementation

In [13]:
class Node:
    def __init__(self, depth=0, max_depth=None):
      self.depth = depth
      self.max_depth = max_depth
      self.feature_idx = None
      self.threshold = None
      self.left = None
      self.right = None
      self.prediction = None
      self.probs = None
      self.rng = np.random.default_rng(42)

    def is_leaf(self):
      return self.left is None and self.right is None

    def gini_impurity(self, y):
        if len(y) == 0:
            return 0
        probs = np.bincount(y.astype(int)) / len(y)
        return 1.0 - np.sum(probs ** 2)

    def std(self, y):
      return np.std(y) if len(y) > 0 else 0

    def best_split(self, X, y, criterion='gini'):
      n_features = X.shape[1]
      best_idx, best_thr, best_score = None, None, float('inf')

      for feature_idx in range(n_features):
        feature_values = X[:, feature_idx]
        thresholds = np.unique(feature_values)

        for threshold in thresholds:
          left_mask = feature_values <= threshold
          right_mask = ~left_mask

          if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
            continue

          y_left, y_right = y[left_mask], y[right_mask]

          if criterion == 'gini':
            score = (len(y_left) * self.gini_impurity(y_left) +
                     len(y_right) * self.gini_impurity(y_right)) / len(y)
          elif criterion == 'std':
            score = (len(y_left) * self.std(y_left) +
                     len(y_right) * self.std(y_right)) / len(y)
          else:
            raise ValueError("Unknown criterion")

          if score < best_score:
            best_idx, best_thr, best_score = feature_idx, threshold, score

      return best_idx, best_thr

    def extra_random_split(self, X, y, criterion='gini', max_features=None):
      n_features = X.shape[1]
      best_idx, best_thr, best_score = None, None, float('inf')

      if max_features == 'sqrt':
        k = int(np.sqrt(n_features))
      elif max_features == 'log2':
        k = int(np.log2(n_features))
      elif isinstance(max_features, int):
        k = max_features
      else:
        k = n_features

      features = self.rng.choice(n_features, k, replace=False)

      for feature_idx in features:
        feature_values = X[:, feature_idx]
        all_thresholds = np.unique(feature_values)
        thresholds = self.rng.choice(all_thresholds, size=min(5, len(all_thresholds)), replace=False)

        for threshold in thresholds:
          left_mask = feature_values <= threshold
          right_mask = ~left_mask

          if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
            continue

          y_left, y_right = y[left_mask], y[right_mask]

          if criterion == 'gini':
            score = (len(y_left) * self.gini_impurity(y_left) +
                     len(y_right) * self.gini_impurity(y_right)) / len(y)
          elif criterion == 'std':
            score = (len(y_left) * self.std(y_left) +
                     len(y_right) * self.std(y_right)) / len(y)
          else:
            raise ValueError('Unknown criterion!')

          if score < best_score:
            best_idx, best_thr, best_score = feature_idx, threshold, score

      return best_idx, best_thr

---

In [14]:
def gini_score(y_true, y_probs):
    auc = roc_auc_score(y_true, y_probs)
    return 2 * auc - 1

In [15]:
class DecisionTreeClassifierCustom:
    def __init__(self, max_depth=5, criterion='gini', extra_random=False, max_features=None):
        self.root = None
        self.max_depth = max_depth
        self.criterion = criterion
        self.extra_random = extra_random
        self.max_features = max_features

    def fit(self, X, y):
        if hasattr(X, 'values'):
            X = X.values
        if hasattr(y, 'values'):
          y = y.values
        y = y.astype(int)
        self.n_classes = int(np.max(y)) + 1
        self.root = self._grow_tree(X, y, depth=0)

    def _grow_tree(self, X, y, depth):
        node = Node(depth, self.max_depth)

        if len(set(y)) == 1 or depth >= self.max_depth:
          counts = np.bincount(y, minlength=self.n_classes)
          node.prediction = counts.argmax()
          node.probs = counts / counts.sum()
          return node

        if self.extra_random:
            feature_idx, threshold = node.extra_random_split(X, y, self.criterion, self.max_features)
        else:
            feature_idx, threshold = node.best_split(X, y)

        if feature_idx is None:
          counts = np.bincount(y, minlength=self.n_classes)
          node.prediction = counts.argmax()
          node.probs = counts / counts.sum()
          return node

        node.feature_idx = feature_idx
        node.threshold = threshold

        feature_values = X[:, feature_idx]
        left_mask = feature_values <= threshold
        right_mask = ~left_mask

        node.left = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
        node.right = self._grow_tree(X[right_mask], y[right_mask], depth + 1)

        return node

    def predict(self, X):
        if hasattr(X, 'values'):
            X = X.values
        return np.array([self._predict_one(x) for x in X])

    def predict_proba(self, X):
      if hasattr(X, 'values'):
        X = X.values
      return np.array([self._predict_proba_one(x) for x in X])

    def _predict_one(self, x):
      node = self.root
      while not node.is_leaf():
        if x[node.feature_idx] <= node.threshold:
          node = node.left
        else:
          node = node.right
      return node.prediction

    def _predict_proba_one(self, x):
      node = self.root
      while not node.is_leaf():
        if x[node.feature_idx] <= node.threshold:
          node = node.left
        else:
          node = node.right
      return node.probs

In [None]:
tree_clf_custom = DecisionTreeClassifierCustom(max_depth=5)
tree_clf_custom.fit(X_train, y_train)
y_probs = tree_clf_custom.predict_proba(X_valid)[:, 1]
print(f"Gini Score DecisionTreeClassifier (Custom): {gini_score(y_valid, y_probs):.4f}")

Gini Score DecisionTreeClassifier (Custom): 0.2828


In [None]:
tree_clf_custom = DecisionTreeClassifierCustom(max_depth=7)
tree_clf_custom.fit(X_train, y_train)
y_probs = tree_clf_custom.predict_proba(X_valid)[:, 1]
print(f"Gini Score DecisionTreeClassifier (Custom): {gini_score(y_valid, y_probs):.4f}")

Gini Score DecisionTreeClassifier (Custom): 0.2977


In [27]:
tree_clf_sklearn = DecisionTreeClassifier(max_depth=6)
tree_clf_sklearn.fit(X_train, y_train)
y_probs = tree_clf_sklearn.predict_proba(X_valid)[:, 1]

print(f"DecisionTreeClassifier Gini Score (Sklearn): {gini_score(y_valid, y_probs):.4f}")

DecisionTreeClassifier Gini Score (Sklearn): 0.2928


Sklearn DecisionTreeClassifier shows the same performance on the validation dataset

Sklearn DecisionTreeClassifier advantages:
1. Optimized Implementation. Sklearn uses Cython (C + Python) under the hood, making it highly optimized and significantly faster.
2. Best Split Algorithms. Sklearn uses efficient, well-tested algorithms to find the best splits.
3. Advanced Hyperparameter Support. Sklearn supports several regularization hyperparameters such as `min_samples_split`, `min_samples_leaf`, `max_leaf_nodes` hyperparameters. These help reduce overfitting and improve generalization.

In [16]:
class DecisionTreeRegressorCustom:
    def __init__(self, max_depth=5, extra_random=False, max_features=None):
        self.root = None
        self.max_depth = max_depth
        self.max_features = max_features
        self.extra_random = extra_random

    def fit(self, X, y):
        if hasattr(X, 'values'):
          X = X.values
        if hasattr(y, 'values'):
          y = y.values
        self.root = self._grow_tree(X, y, depth=0)

    def _grow_tree(self, X, y, depth):
        node = Node(depth, self.max_depth)

        if node.depth >= self.max_depth or len(set(y))== 1:
            node.prediction = np.mean(y)
            return node

        if self.extra_random:
            feature_idx, threshold = node.extra_random_split(X, y, 'std', self.max_features)
        else:
            feature_idx, threshold = node.best_split(X, y, criterion='std')

        if feature_idx is None:
          node.prediction = np.mean(y)
          return node

        node.feature_index = feature_idx
        node.threshold = threshold

        feature_values = X[:, feature_idx]
        left_mask = feature_values <= threshold
        right_mask = feature_values > threshold

        node.left = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
        node.right = self._grow_tree(X[right_mask], y[right_mask], depth + 1)

        return node

    def predict(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        return np.array([self._predict_one(x) for x in X])

    def _predict_one(self, x):
      node = self.root
      while not node.is_leaf():
        if x[node.feature_index] <= node.threshold:
          node = node.left
        else:
          node = node.right
      return node.prediction

In [17]:
reg = DecisionTreeRegressorCustom(max_depth=7)
reg.fit(X_train, y_train)
preds = reg.predict(X_valid)

print(f'DecisionTreeRegressor Custom:')
print(f"Mean Absolute Error: {mean_absolute_error(y_valid, preds):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_valid, preds):.4f}")
print(f"R-squared: {r2_score(y_valid, preds):.4f}")

DecisionTreeRegressor Custom:
Mean Absolute Error: 0.1887
Mean Squared Error: 0.1140
R-squared: -0.0072


In [18]:
reg = DecisionTreeRegressor(max_depth=7)
reg.fit(X_train, y_train)
preds = reg.predict(X_valid)

print(f'DecisionTreeRegressor Sklearn:')
print(f"Mean Absolute Error: {mean_absolute_error(y_valid, preds):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_valid, preds):.4f}")
print(f"R-squared: {r2_score(y_valid, preds):.4f}")

DecisionTreeRegressor Sklearn:
Mean Absolute Error: 0.1996
Mean Squared Error: 0.1196
R-squared: -0.0570


---

In [23]:
class RandomForestClassifierCustom:
  def __init__(self, n_estimators=10, max_depth=5, max_features='sqrt', random_state=42):
    self.n_estimators = n_estimators
    self.max_depth = max_depth
    self.max_features = max_features
    self.trees = []
    self.feature_indices = []
    self.rng = np.random.default_rng(random_state) # local random number generator for reproducibility, avoids affecting global random state

  def _sample_features(self, n_features):
    if self.max_features == 'sqrt':
      k = int(np.sqrt(n_features))
    elif self.max_features == 'log2':
      k = int(np.log2(n_features))
    elif isinstance(self.max_features, int):
      k = self.max_features
    else:
      k = n_features

    return self.rng.choice(n_features, k, replace=False)

  def fit(self, X, y):
    if hasattr(X, 'values'):
      X = X.values
    if hasattr(y, 'values'):
      y = y.values

    n_samples, n_features = X.shape
    self.n_classes = len(np.unique(y))
    self.trees = []
    self.feature_indices = []

    for _ in range(self.n_estimators):
      # Bootstrap sampling
      indices = self.rng.choice(n_samples, size=n_samples, replace=True)
      X_sample, y_sample = X[indices], y[indices]

      # Feature subset
      feature_idx = self._sample_features(n_features)
      self.feature_indices.append(feature_idx)
      X_sample_sub = X_sample[:, feature_idx]

      tree = DecisionTreeClassifierCustom(max_depth=self.max_depth, criterion='gini', extra_random=True)
      # tree = DecisionTreeClassifierCustom(max_depth=self.max_depth, criterion='gini')
      tree.fit(X_sample_sub, y_sample)
      self.trees.append(tree)

  def predict_proba(self, X):
    if hasattr(X, 'values'):
      X = X.values

    preds = np.zeros((X.shape[0], self.n_classes))
    for tree, feat_idx in zip(self.trees, self.feature_indices):
      preds += tree.predict_proba(X[:, feat_idx])
    return preds / self.n_estimators

  def predict(self, X):
      return np.argmax(self.predict_proba(X), axis=1)

In [26]:
start = time.time()

rf = RandomForestClassifierCustom(n_estimators=50, max_depth=3, max_features='sqrt', random_state=42)
rf.fit(X_train, y_train)
preds = rf.predict_proba(X_valid)[:, 1]

print(f"RandomForestClassifier Custom Gini Score : {gini_score(y_valid, preds):.4f}   |   Time: {time.time() - start}:2.f")

RandomForestClassifier Custom Gini Score : 0.3153   |   Time: 29.438709020614624:2.f


In [27]:
start = time.time()

rf = RandomForestClassifierCustom(n_estimators=25, max_depth=5, max_features='sqrt', random_state=42)
rf.fit(X_train, y_train)
preds = rf.predict_proba(X_valid)[:, 1]

print(f"RandomForestClassifier Custom Gini Score : {gini_score(y_valid, preds):.4f}   |   Time: {time.time() - start}:2.f")

RandomForestClassifier Custom Gini Score : 0.3231   |   Time: 17.5566303730011:2.f


---

In [31]:
class GBDTClassifierCustom:
    def __init__(self, n_estimators=25, learning_rate=0.1, max_depth=5, max_features='sqrt', random_state=42):
      self.n_estimators = n_estimators
      self.learning_rate = learning_rate
      self.max_depth = max_depth
      self.max_features = max_features
      self.rng = np.random.default_rng(random_state)
      self.trees = []
      self.feature_indices = []

    def _sigmoid(self, x):
      return 1 / (1 + np.exp(-x))

    def _gradient(self, y_true, raw_pred):
      return self._sigmoid(raw_pred) - y_true

    def _sample_features(self, n_features):
      if self.max_features == 'sqrt':
        k = int(np.sqrt(n_features))
      elif self.max_features == 'log2':
        k = int(np.log2(n_features))
      elif isinstance(self.max_features, int):
        k = self.max_features
      else:
        k = n_features
      return self.rng.choice(n_features, k, replace=False)

    def fit(self, X, y):
      if hasattr(X, 'values'):
        X = X.values
      if hasattr(y, 'values'):
        y = y.values
      y = y.astype(int)

      self.trees = []
      self.feature_indices = []
      n_samples, n_features = X.shape
      raw_preds = np.zeros(n_samples)

      for _ in range(self.n_estimators):
        grad = self._gradient(y, raw_preds)

        feature_idx = self._sample_features(n_features)
        self.feature_indices.append(feature_idx)

        X_sub = X[:, feature_idx]
        tree = DecisionTreeRegressorCustom(max_depth=self.max_depth, extra_random=True)
        # tree = DecisionTreeRegressorCustom(max_depth=self.max_depth
        tree.fit(X_sub, -grad)

        raw_preds += self.learning_rate * tree.predict(X_sub)
        self.trees.append(tree)

    def predict_proba(self, X):
        if hasattr(X, 'values'):
            X = X.values

        raw_preds = np.zeros(X.shape[0])

        for tree, feat_idx in zip(self.trees, self.feature_indices):
            X_sub = X[:, feat_idx]
            raw_preds += self.learning_rate * tree.predict(X_sub)

        return self._sigmoid(raw_preds)

    def predict(self, X):
        return (self.predict_proba(X) >= 0.5).astype(int)


In [54]:
start = time.time()

gbdt = GBDTClassifierCustom(
    n_estimators=10,
    learning_rate=0.5,
    max_depth=7,
    random_state=42
)

gbdt.fit(X_train, y_train)
y_pred_proba = gbdt.predict_proba(X_valid)

print(f"GBDTClassifier Custom Gini Score: {gini_score(y_valid, y_pred_proba):.4f}     |   Time: {time.time()-start:.2f}s")

GBDTClassifier Custom Gini Score: 0.2862     |   Time: 14.38s


In [53]:
start = time.time()

gbdt = GBDTClassifierCustom(
    n_estimators=25,
    learning_rate=0.25,
    max_depth=5,
    random_state=42
)

gbdt.fit(X_train, y_train)
y_pred_proba = gbdt.predict_proba(X_valid)

print(f"GBDTClassifier Custom Gini Score: {gini_score(y_valid, y_pred_proba):.4f}     |   Time: {time.time()-start:.2f}s")

GBDTClassifier Custom Gini Score: 0.3313     |   Time: 27.44s


In [55]:
start = time.time()

gbdt = GBDTClassifierCustom(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=4,
    random_state=42
)

gbdt.fit(X_train, y_train)
y_pred_proba = gbdt.predict_proba(X_valid)

print(f"GBDTClassifier Custom Gini Score: {gini_score(y_valid, y_pred_proba):.4f}     |   Time: {time.time()-start:.2f}s")

GBDTClassifier Custom Gini Score: 0.3233     |   Time: 39.64s


---

In [35]:
dtrain = xgb.DMatrix(X_train.values, label=y_train.values)
dvalid = xgb.DMatrix(X_valid.values, label=y_valid.values)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'learning_rate': 0.05,
    'booster': 'dart',
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

start = time.time()

xgb_model = xgb.train(params, dtrain, num_boost_round=200)
y_pred_xgb = xgb_model.predict(dvalid)
print(f'XGBoost Gini Score: {gini_score(y_valid, y_pred_xgb)}   |   Time: {time.time()-start}')

XGBoost Gini Score: 0.36813512037354035   |   Time: 46.15733361244202


In [36]:
dtrain = xgb.DMatrix(X_train.values, label=y_train.values)
dvalid = xgb.DMatrix(X_valid.values, label=y_valid.values)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'learning_rate': 0.05,
    'booster': 'dart',
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

start = time.time()

xgb_model = xgb.train(params, dtrain, num_boost_round=500)
y_pred_xgb = xgb_model.predict(dvalid)
print(f'XGBoost Gini Score: {gini_score(y_valid, y_pred_xgb)}   |   Time: {time.time()-start}')

XGBoost Gini Score: 0.3589898328744803   |   Time: 254.14853525161743


In [None]:
dtrain = xgb.DMatrix(X_train.values, label=y_train.values)
dvalid = xgb.DMatrix(X_valid.values, label=y_valid.values)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'learning_rate': 0.05,
    'booster': 'dart',
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

start = time.time()

xgb_model = xgb.train(params, dtrain, num_boost_round=1000)
y_pred_xgb = xgb_model.predict(dvalid)
print(f'XGBoost Gini Score: {gini_score(y_valid, y_pred_xgb)}   |   Time: {time.time()-start}')

XGBoost Gini Score: 0.32995260494770196   |   Time: 903.3257081508636


DART (Dropouts meet Multiple Additive Regression Trees) randomly drops trees during training for regularization

In [95]:
train_data = lgb.Dataset(X_train.values, label=y_train.values)
valid_data = lgb.Dataset(X_valid.values, label=y_valid.values)

params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'max_depth': 6,
    'num_leaves': 12,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42,
    'verbose': -1
}

start = time.time()
model_lgb = lgb.train(params, train_data, valid_sets=[valid_data], num_boost_round=300)
y_pred_lgb = model_lgb.predict(X_valid.values)

print(f'LightGBM Gini Score: {gini_score(y_valid, y_pred_lgb)}   |   Time: {time.time()-start}:.2f')

LightGBM Gini Score: 0.36357754538205134   |   Time: 3.1782729625701904:.2f


In [40]:
cat_features = X_train.select_dtypes(include=['object']).columns.to_list()
cat_features_idx = [X_train.columns.get_loc(col) for col in cat_features]

X_train[cat_features] = X_train[cat_features].astype(str)
X_valid[cat_features] = X_valid[cat_features].astype(str)

model_cat = CatBoostClassifier(
    iterations=200,
    learning_rate=0.05,
    depth=6,
    loss_function='Logloss',
    early_stopping_rounds=50,
    eval_metric='AUC',
    cat_features=cat_features_idx,
    verbose=50,
    random_seed=42
)

start = time.time()

model_cat.fit(X_train, y_train, eval_set=(X_valid, y_valid))
y_pred_cat = model_cat.predict_proba(X_valid)[:, 1]

print(f'\nCatBoost Gini Score: {gini_score(y_valid, y_pred_cat)}   |   Time: {time.time()-start}')

0:	test: 0.6151700	best: 0.6151700 (0)	total: 289ms	remaining: 57.4s
50:	test: 0.6689684	best: 0.6698658 (46)	total: 7.51s	remaining: 21.9s
100:	test: 0.6660008	best: 0.6706475 (70)	total: 11.9s	remaining: 11.6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.670647502
bestIteration = 70

Shrink model to first 71 iterations.

CatBoost Gini Score: 0.3412950039883671   |   Time: 16.144532680511475


LightGBM performed better than other GBM models. LightGBM usually is very fast on large datasets with numerical values. It uses histogram-based learning (faster and more memory-efficient). It uses leaf-wise tree growth (with depth limits), which often gives better accuracy. It also supports native categorical feature handling (with `categorical_feature` argument).

LightGBM
- Uses histogram-based learning: buckets continuous features into discrete bins, reducing memory usage and speeding up computation.
- Grows trees leaf-wise instead of level-wise.
- Leads to deeper trees and better accuracy.
- Supports native categorical feature handling (with categorical_feature argument).

CatBoost:
- Best handling of categorical features (no preprocessing needed).
- Used a method called "ordered boosting" to encode categorical features without data leakage.
- Internally converts categorical values to numbers based on statistics (e.g., average target value) using **permutation-driven encoding**

XGBoost
- Uses either exactly greedy or approximate histogram-based methods for split finding.
- Has options like DART = Dropouts meet Multiple Additive Regression Trees.
- Drops random trees during training (like dropout in neural nets).
- Prevents overfitting and encourages ensemble diversity.



---

In [96]:
y_pred_train = model_lgb.predict(X_train.values)
y_pred_valid = model_lgb.predict(X_valid.values)
y_pred_test = model_lgb.predict(X_test.values)

print(f"Train Gini: {gini_score(y_train, y_pred_train):.4f}")
print(f"Valid Gini: {gini_score(y_valid, y_pred_valid):.4f}")
print(f"Test Gini:  {gini_score(y_test, y_pred_test):.4f}")

Train Gini: 0.6512
Valid Gini: 0.3636
Test Gini:  0.3702


Model (LightGBM) performs significantly better on the training set than on validation or test datasets.

This large gap between training and validation Gini scores indicates that the model is overfitting. It has likely memorized patterns in the training data that do not generalize well to unseen data.

However, validation and test Gini scores are close to each other, which is a positive sign. It suggests that:

- The validation set is representative of the test set.
- The model's performance is consistent on unseen data.
- The model is not overfitting to the validation set specifically.

The models shows moderate overfitting: it performs well on the training data but worse on validation and test data, though the validation/test consistency suggests it still generalized reasonably well.

---

In [97]:
start = time.time()

extra_tree_clf = DecisionTreeClassifierCustom(
    max_depth=5,
    criterion='gini',
    extra_random=True,
)

extra_tree_clf.fit(X_train, y_train)
y_pred = extra_tree_clf.predict_proba(X_valid)[:, 1]

print(f"Extra Tree Classifier Gini Score: {gini_score(y_valid, y_pred)}    |   Time: {time.time() - start}:.2f")

Extra Tree Classifier Gini Score: 0.29622284031841706    |   Time: 6.546526908874512:.2f
