# Hard coded Gradient Boosted Tree for Regression

Dataset: [Salary dataset – simple linear regression](https://www.kaggle.com/datasets/abhishek14398/salary-dataset-simple-linear-regression) (Kaggle).

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import csv
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the salary CSV and discard its first column (an id column, not a feature)
df = pd.read_csv('Salary_dataset.csv').iloc[:, 1:]

# All columns but the last form the feature matrix; the last column is the target
X, y = df.iloc[:, :-1], df.iloc[:, -1]

### GBT function

In [3]:
def GBTRegression(X, y, learning_rate, num_iters, weak_learner=None):
    '''
    Simplified gradient-boosting loop for squared-error regression.

    The prediction starts at the constant mean(y). Each iteration computes the
    residuals y - F_x, obtains an update h_x, and adds learning_rate * h_x to
    the running prediction.

    X:                Features. Only read when `weak_learner` is supplied.
    y:                Target values.
    learning_rate:    A scaling factor applied to each update, controlling how much
                      each boosting step contributes to the final prediction.
    num_iters:        Number of boosting iterations.
    weak_learner:     Optional callable `weak_learner(X, residuals) -> h_x` that fits
                      a model to the current residuals and returns its predictions
                      for the rows of X (something that supports
                      `learning_rate * h_x` and addition to F_x).
                      When None (the default) no model is fitted: h_x is the
                      residual vector itself and X is ignored. F_x then simply
                      moves towards y, the remaining gap shrinking by a factor
                      (1 - learning_rate) per iteration, so a near-zero training
                      error in that mode says nothing about generalisation.

    Returns F_x, the cumulative prediction for the training rows after
    `num_iters` iterations.
    '''
    # F_0: every sample starts at the mean of the targets
    F_x = np.mean(y) * np.ones_like(y)

    for _ in range(num_iters):
        residuals = y - F_x
        if weak_learner is None:
            # Hard-coded step: use the residuals directly as the update
            h_x = residuals
        else:
            h_x = weak_learner(X, residuals)
        F_x = F_x + learning_rate * h_x

    return F_x

In [4]:
# Boosting settings: step size and number of boosting iterations
learning_rate, num_iters = 0.1, 1000

# Run the hand-written booster on the whole dataset and print what it returns
y_pred = GBTRegression(X, y, learning_rate=learning_rate, num_iters=num_iters)
print(f"Predictions:\n{y_pred}")

Predictions:
0      39344.0
1      46206.0
2      37732.0
3      43526.0
4      39892.0
5      56643.0
6      60151.0
7      54446.0
8      64446.0
9      57190.0
10     63219.0
11     55795.0
12     56958.0
13     57082.0
14     61112.0
15     67939.0
16     66030.0
17     83089.0
18     81364.0
19     93941.0
20     91739.0
21     98274.0
22    101303.0
23    113813.0
24    109432.0
25    105583.0
26    116970.0
27    112636.0
28    122392.0
29    121873.0
Name: Salary, dtype: float64


In [5]:
# Evaluation: score y_pred against the targets y
mse = mean_squared_error(y_true=y, y_pred=y_pred)
r2 = r2_score(y_true=y, y_pred=y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

Mean Squared Error: 2.117582368135751e-21
R² Score: 1.0


# Gradient Boosted Tree for Regression (sklearn)

In [6]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Build the Gradient Boosting Regressor and fit it on the training split
# (fit() returns the fitted estimator, so the two steps can be chained)
model = GradientBoostingRegressor(
    learning_rate=0.1,
    n_estimators=10000,  # Number of boosting stages
    max_depth=10,        # Maximum depth of the individual regression estimators
    random_state=0,
).fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

In [8]:
# Score the held-out predictions against the held-out targets
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

Mean Squared Error: 69191443.33329761
R² Score: 0.8927767929694809


# Hard coded Gradient Boosted Tree for Classification

In [9]:
# Toy binary-classification problem
X = np.array([1, 2, 3])  # Input features (never read by the loop below)
y = np.array([0, 1, 1])  # Binary target values (0 or 1)
learning_rate = 0.1
num_iterations = 3


def sigmoid(log_odds):
    """Logistic function: convert log-odds into probabilities."""
    return 1 / (1 + np.exp(-log_odds))


# F_0: log-odds of the positive-class rate, repeated for every sample
positive_rate = np.mean(y)
initial_log_odds = np.log(positive_rate / (1 - positive_rate))
F_x = np.full(y.shape, initial_log_odds)
print(f"Initial Prediction Log-Odds (F_0): {F_x}")

for i in range(num_iterations):
    print(f"\nIteration {i+1}")

    # Current probabilities and pseudo-residuals (label minus probability)
    p = sigmoid(F_x)
    print(f"Probabilities: {p}")
    residuals = y - p
    print(f"Pseudo-Residuals: {residuals}")

    # No tree is fitted in this hard-coded version: the update is the residuals themselves
    h_x = residuals
    F_x = F_x + learning_rate * h_x
    print(f"Updated Log-Odds Prediction (F_{i+1}): {F_x}")

# Map the final log-odds back to probabilities, then threshold at 0.5
final_probabilities = sigmoid(F_x)
is_positive = final_probabilities >= 0.5
final_predictions = is_positive.astype(int)  # Convert to binary predictions
print("\nFinal Predicted Probabilities:", final_probabilities)
print("Final Binary Predictions:", final_predictions)


Initial Prediction Log-Odds (F_0): [0.69314718 0.69314718 0.69314718]

Iteration 1
Probabilities: [0.66666667 0.66666667 0.66666667]
Pseudo-Residuals: [-0.66666667  0.33333333  0.33333333]
Updated Log-Odds Prediction (F_1): [0.62648051 0.72648051 0.72648051]

Iteration 2
Probabilities: [0.651691   0.67403247 0.67403247]
Pseudo-Residuals: [-0.651691    0.32596753  0.32596753]
Updated Log-Odds Prediction (F_2): [0.56131141 0.75907727 0.75907727]

Iteration 3
Probabilities: [0.63675592 0.68115336 0.68115336]
Pseudo-Residuals: [-0.63675592  0.31884664  0.31884664]
Updated Log-Odds Prediction (F_3): [0.49763582 0.79096193 0.79096193]

Final Predicted Probabilities: [0.62190358 0.68803784 0.68803784]
Final Binary Predictions: [1 1 1]


# Gradient Boosted Tree for Classification (sklearn)

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [11]:
# sklearn estimators expect a 2D feature array: one row per sample, one column per feature
X = np.arange(1, 4).reshape(-1, 1)

In [12]:
# Three boosting stages of depth-1 trees, with a fixed seed
model = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=3,
    max_depth=1,
    random_state=42,
)

# Fit on the toy data, then predict labels for the same samples
# (fit() returns the fitted estimator, so the calls can be chained)
y_pred = model.fit(X, y).predict(X)

In [13]:
# Display the predicted class labels as the cell output
y_pred

array([0, 1, 1])

In [14]:
# Evaluate the model
accuracy = accuracy_score(y, y_pred)
print("\nFinal Binary Predictions:", y_pred)
print(f"Accuracy: {accuracy}")


Final Binary Predictions: [0 1 1]
Accuracy: 1.0
