In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

# Hard-coded XGBoost for regression

In [2]:
# Steps:
# 1. Make initial prediction
# 2. Compute Gradients and Hessians
# 3. Fit into a new tree
# 4. update the prediction
# 5. Repeat the process form step 2 to step 4

In [3]:
import numpy as np

def XGBoostRegressionWithCoverPruning(X, y, learning_rate, num_iters, lambda_, lambda_l1=0.1, lambda_l2=0.1, pruning_threshold=0.01, min_cover=10):
    '''
    XGBoost function for regression tasks with pruning based on residuals and cover,
    including L1 and L2 regularization
    
    X                        Features matrix
    y                        Target
    learning_rate            Model learning rate
    num_iters                Number of boosting iterations
    lambda_                  Regularization parameter
    lambda_l1                L1 regularization strength (Lasso)
    lambda_l2                L2 regularization strength (Ridge)
    pruning_threshold        Threshold to stop further boosting (based on residuals)
    min_cover                Minimum number of samples required in a leaf node
    
    F_x                      Cumulative prediction at each boosting step
    gradients                First derivative of the loss function 
    hessians                 Second derivative of the loss function (Number of residuals)
    h_x                      Prediction from each boosting step
    '''
    
    F_x = np.mean(y) * np.ones_like(y)                  # First prediction
    for i in range(num_iters):
        residuals = y - F_x
        gradients = -residuals                            # First derivative of the loss function
        hessians = np.ones_like(y)                        # Second derivative (Hessian)
        
        # Apply L1 regularization to the gradients
        gradients += lambda_l1 * np.sign(gradients)  # L1 regularization term
        
        # Apply L2 regularization to the hessians
        hessians += lambda_l2 * np.ones_like(hessians)  # L2 regularization term (adding to the Hessian)
        
        # Fitting into new tree
        h_x = -gradients / (hessians + lambda_)
        
        # Calculate the cover (number of samples per leaf)
        cover = np.sum(np.abs(gradients) >= pruning_threshold)  # Simplified cover calculation
        
        # Check if pruning condition is met based on residuals and cover
        if np.mean(np.abs(residuals)) < pruning_threshold or cover < min_cover:
            print(f"Pruning: Stopping boosting at iteration {i+1} due to low cover or small residuals")
            break
        
        F_x += learning_rate * h_x  # Updating cumulative prediction
    
    return F_x


### Demo 

In [4]:
np.random.seed(42)
X = np.random.rand(1000, 10)  # 1000 samples, 10 features
y = np.random.rand(1000)      # 1000 target values

learning_rate = 0.1
num_iters = 100
lambda_ = 1.0
pruning_threshold = 0.01
min_cover = 10  # Minimum samples per leaf

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_test_pred = XGBoostRegressionWithCoverPruning(X_test, y_test, learning_rate, num_iters, lambda_, pruning_threshold, min_cover)

print(f"First 5 rows predictions:")
for i in range(5):
    print(f"Actual value: {y_test[i]}        Prediction: {y_test_pred[i]}")

First 5 rows predictions:
Actual value: 0.6789030208597069        Prediction: 0.6140266633467152
Actual value: 0.48640909899108653        Prediction: 0.4935603309753998
Actual value: 0.16540090450135903        Prediction: 0.31157545577871276
Actual value: 0.4705050789923272        Prediction: 0.4845440779816075
Actual value: 0.785202867912825        Prediction: 0.6742898115511999


In [5]:
# Evaluate the test performance
mse_test = mean_squared_error(y_test, y_test_pred)
print(f"Test Mean Squared Error: {mse_test}")

Test Mean Squared Error: 0.015090762389554011


# XGBoost for regression (sklean)

In [6]:
from xgboost import XGBRegressor

In [7]:
xgb_model = XGBRegressor(
    n_estimators=100,           # Number of boosting rounds
    learning_rate=0.1,          # Learning rate
    max_depth=3,                # Maximum depth of the trees (controls tree complexity)
    min_child_weight=10,        # Minimum sum of weights for a child (prunes small leaves)
    gamma=0.1,                  # Minimum loss reduction (controls pruning based on gain)
    random_state=42             # Set random seed for reproducibility
)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the test data
y_test_pred = xgb_model.predict(X_test)

In [8]:
# Evaluate the model's performance on the test set
mse_test = mean_squared_error(y_test, y_test_pred)
print(f"Test Mean Squared Error: {mse_test}")

Test Mean Squared Error: 0.08967959861851739


In [9]:
# MSE values are diffrent because sklean XGBoost model uses L1 and L2 regularization

# Hard-coded XGBoost for classification

In [10]:
def XGBoostClassification(X, y, learning_rate, num_iters, lambda_):
    '''
    XGBoost function for binary classification tasks
    
    X                        Features matrix
    y                        Target (binary labels, 0 or 1)
    learning_rate            Model learning rate
    num_iters                Number of boosting iterations
    lambda_                  Regularization parameter
    
    F_x                      Cumulative prediction at each boosting step (log-odds)
    gradients                First derivative of the loss function 
    hessians                 Second derivative of the loss function
    h_x                      Prediction from each boosting step   
    '''
    
    # Initialize predictions with log-odds 
    F_x = np.log(np.mean(y) / (1 - np.mean(y))) * np.ones_like(y)
    log_odds_steps = [F_x.copy()]  # Track log-odds over iterations
    probability_steps = [1 / (1 + np.exp(-F_x))]  # Track probabilities
    
    for i in range(num_iters):
        # Convert log-odds to probability using the sigmoid function
        preds = 1 / (1 + np.exp(-F_x))
       
        gradients = preds - y                              
        hessians = preds * (1 - preds)                     
        h_x = -gradients / (hessians + lambda_)            # Fitting new tree
        F_x += learning_rate * h_x                         # Updating
        
        # Store intermediate log-odds and probabilities for visualization
        log_odds_steps.append(F_x.copy())
        probability_steps.append(1 / (1 + np.exp(-F_x)))
    
    # Convert final log-odds predictions to probabilities
    final_preds = 1 / (1 + np.exp(-F_x))
    return final_preds, log_odds_steps, probability_steps

### Demo

In [11]:
# Parameters for dataset generation
n_samples = 1000         # Number of samples
n_features = 10          # Total number of features
n_informative = 5        # Number of informative features
n_redundant = 2          # Number of redundant features
n_classes = 2            # Number of classes (binary classification)

# Generate synthetic data
X, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    n_redundant=n_redundant,
    n_classes=n_classes,
    random_state=42
)

# Create a DataFrame for easier analysis
data = pd.DataFrame(X, columns=[f'feature_{i+1}' for i in range(n_features)])
data['target'] = y

print(data.head())

   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0   1.125100   1.178124   0.493516   0.790880  -0.614278   1.347020   
1  -0.564641   3.638629  -1.522415  -1.541705   1.616697   4.781310   
2   0.516313   2.165426  -0.628486  -0.386923   0.492518   1.442381   
3   0.537282   0.966618  -0.115420   0.670755  -0.958516   0.871440   
4   0.278385   1.065828  -1.724917  -2.235667   0.715107   0.731249   

   feature_7  feature_8  feature_9  feature_10  target  
0   1.419515   1.357325   0.966041   -1.981139       1  
1   3.190292  -0.890254   1.438826   -3.828748       0  
2   1.332905  -1.958175  -0.348803   -1.804124       0  
3   0.508186  -1.034471  -1.654176   -1.910503       1  
4  -0.674119   0.598330  -0.524283    1.047610       0  


In [12]:
learning_rate = 0.1
num_iters = 100
lambda_ = 1.0

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_test_pred, log_odds_steps, probability_steps = XGBoostClassification(X_test, y_test, learning_rate, num_iters, lambda_)

# Assuming threshhold is 0.5
y_test_pred_class = np.where(y_test_pred >= 0.5, 1, 0)     

# Display first 5 actual values and predictions, and print accuracy
print("First 5 rows predictions:")
for i in range(5):
    print(f"Actual value: {y_test[i]}        Prediction: {y_test_pred_class[i]}")

First 5 rows predictions:
Actual value: 0        Prediction: 0
Actual value: 1        Prediction: 1
Actual value: 0        Prediction: 0
Actual value: 0        Prediction: 0
Actual value: 0        Prediction: 0


In [13]:
# Evaluate the test performance
accuracy = accuracy_score(y_test, y_test_pred_class)
mse_test = mean_squared_error(y_test, y_test_pred)
print(f"\nTest Accuracy: {accuracy:.2f}")
print(f"Test Mean Squared Error: {mse_test:.3f}")


Test Accuracy: 1.00
Test Mean Squared Error: 0.013


# XGBoost for classification (sklean)

In [14]:
from xgboost import XGBClassifier

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost classifier
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the test data
y_test_pred = xgb_model.predict(X_test)

In [16]:
# Evaluate the model's performance using accuracy
accuracy_test = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {accuracy_test:.2f}")
# Evaluate the test performance
mse_test = mean_squared_error(y_test, y_test_pred)
print(f"Test Mean Squared Error: {mse_test}")

Test Accuracy: 0.95
Test Mean Squared Error: 0.045
