In [1]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score


In [2]:
class BoostedTree:
    def __init__(self, data, data_indices, gradients, hessian, reg_param, max_depth, param):
        self.data = data
        self.data_indices = data_indices
        self.gradients= gradients.values if isinstance(gradients, pd.Series) else gradients
        self.hessian = hessian.values if isinstance(hessian, pd.Series) else hessian
        self.reg_param = reg_param
        self.param = param
        self.weight = -self.gradients[data_indices].sum() / (self.hessian[data_indices].sum() + self.reg_param)
        self.threshold = 0
        self.split_score = 0.0
        self.feature_col_index = -1
        self.max_depth = max_depth
        self._build_tree()

    def _build_tree(self):
        if self.max_depth <= 0:
            return #nothing to build here

        for fid in range(self.data.shape[1]):
            self._find_the_split(fid)

        feature_data = self.data[self.data_indices, self.feature_col_index]
        left_indices = np.nonzero(feature_data <= self.threshold)[0]
        right_indices = np.nonzero(feature_data > self.threshold)[0]
        

        self.left = BoostedTree(self.data, left_indices, self.gradients, self.hessian, self.reg_param, self.max_depth-1,self. param)
        self.right = BoostedTree(self.data, right_indices, self.gradients, self.hessian, self.reg_param, self.max_depth-1, self.param)


    def _find_the_split(self, feature_col_idx):
        feature_data = self.data[self.data_indices, feature_col_idx]
        grad = self.gradients[self.data_indices]
        hess = self.hessian[self.data_indices]

        sorted_data_index_based_on_feature_data = np.argsort(feature_data)
        sorted_feature_data = self.data[sorted_data_index_based_on_feature_data, feature_col_idx]
        sorted_grad = self.gradients[sorted_data_index_based_on_feature_data]
        sorted_hess = self.hessian[sorted_data_index_based_on_feature_data]

        right_grad = sorted_grad.sum()
        right_hess = sorted_hess.sum()
        left_grad = 0.0
        left_hess = 0.0
        total_grad = sorted_grad.sum()
        total_hess = sorted_hess.sum()
        
        for split in range(len(feature_data)-1):
            curr = sorted_feature_data[split]
            next_val = sorted_feature_data[split+1]

            left_grad += sorted_grad[split]
            left_hess += sorted_hess[split]
            right_grad -= sorted_grad[split]
            right_hess -= sorted_hess[split]

            left_sim = (left_grad ** 2)/(left_hess+self.reg_param)
            right_sim = (right_grad ** 2)/(right_hess+self.reg_param)
            root_sim = (total_grad ** 2)/(total_hess+self.reg_param)
            gain = left_sim+right_sim-root_sim

            if(gain > self.split_score):
                self.split_score = gain
                self.feature_col_index = feature_col_idx
                self.threshold = (curr+next_val)/2

    def predict(self, X):
        return np.array([self._predict_row(example) for example in X]) 

    def _predict_row(self, example):
        if self._is_leaf:
            return self.weight
        val = example[self.feature_col_index]
        if isinstance(example, str):
            print(f'example: {example}')
            print(f'val: {val}')
            print(f'self.threshold: {self.threshold}')
        path = self.left if val <= self.threshold else self.right
        return path._predict_row(example)

    @property
    def _is_leaf(self):
        return self.split_score == 0.0  # Leaf node if no gain found
        
                
                
                

        
        
        
        
        

In [3]:
class XGBoost:
    def __init__(self, params, objective, seed=42):
        self.trees = []  # Store all trained trees
        self.params = defaultdict(lambda: None, params)  # Default values for missing params
        self.objective = objective  # Loss function
        self.subsample = self.params['subsample'] if self.params['subsample'] else 1.0
        self.base_score = self.params['base_score'] if self.params['base_score'] else 0.5
        self.learning_rate = self.params['learning_rate'] if self.params['learning_rate'] else 1e-1
        self.max_depth = self.params['max_depth'] if self.params['max_depth'] else 5
        self.rng = np.random.default_rng(seed=seed)  # Random number generator
        
    def fit(self, X, y, num_rounds):
        predictions = self.base_score * np.ones(shape=y.shape)  # Initialize predictions
        for rnd in range(num_rounds):
            gradients = self.objective.gradients(y, predictions)  # Compute gradients
            hessians = self.objective.hessians(y, predictions)  # Compute hessians
            # Row sampling
            idxs = None if self.subsample == 1.0 else self.rng.choice(
                len(y),
                size=math.floor(self.subsample * len(y)),
                replace=False
            )
            tree = BoostedTree(
                data=X,
                data_indices=idxs,
                gradients=gradients,
                hessian=hessians,
                param=self.params,
                max_depth=self.max_depth,
                reg_param=1.5,
            
            )
            self.trees.append(tree)
            predictions += self.learning_rate * tree.predict(X)  # Update predictions
            
    def predict(self, X):
        # Add predictions from all trees
        return self.base_score + self.learning_rate * np.sum([tree.predict(X) for tree in self.trees], axis=0)

In [4]:
class XGBoostSigmoid:
    def __init__(self, params, threshold=0.5, seed=42):
        self.params = params
        self.threshold = threshold  # Threshold to classify sigmoid output as 0 or 1
        self.objective = BinaryCrossEntropyLoss()
        self.base = XGBoost(self.params, self.objective, seed)
        
    def train(self, X, y, num_rounds):
        self.base.fit(X, y, num_rounds)  # Train the underlying boosted trees
        
    def predict(self, X, with_labels=False, threshold=0.5):
        logits = self.base.predict(X)  # Get raw scores
        probs = self.objective.sigmoid(logits)  # Apply sigmoid to get probabilities
        if with_labels:
            return probs, (probs >= threshold).astype(int)
        return probs

In [5]:
class BinaryCrossEntropyLoss:
    @staticmethod
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))
    
    @staticmethod
    def loss(labels, predictions):
        probs = BinaryCrossEntropyLoss.sigmoid(predictions)
        
        # To avoid log(0)
        epsilon = 1e-15
        
        probs = np.clip(probs, epsilon, 1 - epsilon)
        
        # Binary log loss
        return -np.mean(labels * np.log(probs) + (1 - labels) * np.log(1 - probs))

    @staticmethod
    def gradients(labels, predictions):
        probs = BinaryCrossEntropyLoss.sigmoid(predictions)
        
        # Gradient of binary cross-entropy
        return probs - labels

    @staticmethod
    def hessians(labels, predictions):
        probs = BinaryCrossEntropyLoss.sigmoid(predictions)
        
        # Hessian for sigmoid cross-entropy
        return probs * (1 - probs)

In [6]:
train = pd.read_csv('/kaggle/input/titanic/train.csv')
print(f"Training set contains {train.shape[0]} samples and {train.shape[1]} features.")
display(train.info())
display(train)

Training set contains 891 samples and 12 features.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


None

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [7]:
def preprocess_features(X, categorical_mapping=None, binary_from_null=None, drop_columns=None, fill_with_median=None):
    # Map categorical columns to integers
    if categorical_mapping:
        for col, mapping in categorical_mapping.items():
            if col in X.columns:
                X[col] = X[col].map(mapping).astype(int)

    # Convert null presence to binary flags
    if binary_from_null:
        for col in binary_from_null:
            if col in X.columns:
                X[col] = X[col].notnull().astype(int)

    # Drop irrelevant columns
    if drop_columns:
        X.drop(columns=[col for col in drop_columns if col in X.columns], inplace=True)

    # Fill missing values with the median
    if fill_with_median:
        X[fill_with_median] = X[fill_with_median].fillna(X[fill_with_median].median())

    return X


def split_data(X, y, test_percent=0.2, random_state=2025):
    # Convert pandas DataFrames to NumPy arrays if necessary
    if isinstance(X, pd.DataFrame):
        X = X.values

    if isinstance(y, pd.DataFrame):
        y = y.values

    # Get number of data points (rows)
    num_examples = X.shape[0]

    # Shuffle the dataset using the given random seed
    rng = np.random.default_rng(random_state)
    shuffled_indices = rng.permutation(num_examples)
    shuffled_X = X[shuffled_indices]
    shuffled_y = y[shuffled_indices]

    # Calculate number of test examples based on given test size percent
    test_size = math.ceil(test_percent * num_examples)

    # Split features: everything after the test_size is training
    X_train = shuffled_X[test_size:]
    X_test = shuffled_X[:test_size]

    # Split labels: reshape to ensure they're 1D and cast to int
    y_train = shuffled_y[test_size:].astype(int)
    y_test = shuffled_y[:test_size].astype(int)

    # Return split datasets
    return X_train, X_test, y_train, y_test

In [8]:
# Preprocessing configuration for the Titanic dataset
config = {
    'categorical_mapping': {'Sex': {'male': 0, 'female': 1}},
    'binary_from_null': ['Cabin'],
    'drop_columns': ['Name', 'Ticket', 'Embarked'],
    'fill_with_median': ['Age']
}

# Hyperparameters for the XGBoost model
hyperparameters = {
    'learning_rate': 0.1,
    'max_depth': 5,
    'subsample': 0.8,
    'reg_lambda': 1.5,
    'gamma': 1.0,
    'min_child_weight': 25,
    'base_score': 0.5,
}


num_boost_round = 50

# Load Titanic data and split into features and labels
labels = train.pop('Survived')

# Apply feature transformations based on config
train = preprocess_features(train, **config)

# Split data into train and test sets
X_train, X_test, y_train, y_test = split_data(train, labels)


In [9]:
# Train custom XGBoost model from scratch
model = XGBoostSigmoid(hyperparameters, seed=42)
model.train(X_train, y_train, num_boost_round)

probabilities, predictions = model.predict(X_test, with_labels=True)

In [10]:
# Accuracy
accuracy = np.mean(predictions == y_test)
print(f"Accuracy: {accuracy:.4f}.")

# Precision
precision = precision_score(y_test, predictions)
print(f"Precision: {precision:.4f}.")

# Recall
recall = recall_score(y_test, predictions)
print(f"Recall: {recall:.4f}.")

# F1-Score
f1 = f1_score(y_test, predictions)
print(f"F1-Score: {f1:.4f}")

Accuracy: 0.7207.
Precision: 0.7000.
Recall: 0.5000.
F1-Score: 0.5833


In [11]:
config = {
    'categorical_mapping': {'Sex': {'male': 0, 'female': 1}},
    'binary_from_null': ['Cabin'],
    'drop_columns': ['Name', 'Ticket', 'Embarked'],
    'fill_with_median': ['Age']
}
test = pd.read_csv('/kaggle/input/titanic//test.csv')


passenger_ids = test['PassengerId'].values

test = preprocess_features(test, **config)
display(train.head())
display(test.head())

_, predictions = model.predict(test.values, with_labels=True)
submission = pd.DataFrame({'PassengerId': passenger_ids,'Survived': predictions})
submission.to_csv(f'/kaggle/working/submission.csv', index=False)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin
0,1,3,0,22.0,1,0,7.25,0
1,2,1,1,38.0,1,0,71.2833,1
2,3,3,1,26.0,0,0,7.925,0
3,4,1,1,35.0,1,0,53.1,1
4,5,3,0,35.0,0,0,8.05,0


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin
0,892,3,0,34.5,0,0,7.8292,0
1,893,3,1,47.0,1,0,7.0,0
2,894,2,0,62.0,0,0,9.6875,0
3,895,3,0,27.0,0,0,8.6625,0
4,896,3,1,22.0,1,1,12.2875,0
