# Implement XGBoost regression

In [5]:
import numpy as np

# Toy training set: one feature value and one regression target per sample.
X = np.asarray([23, 24, 26, 27])
Y = np.asarray([50, 70, 80, 85])

# Hyperparameters read by the functions below.
lambda_param = 0  # added to the denominator of similarity scores and leaf outputs
depth = 1  # handed to build_tree by the driver code
lr = 0.3  # learning rate: scales the leaf output before it is added to f0

def calculate_similarity_score(residuals):
    """Return the similarity score of a node for regression.

    The score is the squared sum of ``residuals`` divided by the number of
    residuals plus the module-level ``lambda_param``.
    """
    residual_total = np.sum(residuals)
    denominator = len(residuals) + lambda_param
    return (residual_total ** 2) / denominator

def build_tree(X, Y, depth, x_query=25):
    """Fit a one-split XGBoost-style regression stump and predict at ``x_query``.

    The initial prediction is the mean of ``Y``. Each candidate threshold is
    scored by its gain (left score + right score - root score), the best one
    is kept, and the matching leaf output, scaled by the module-level ``lr``,
    is added to the initial prediction. Intermediate values are printed.

    Args:
        X: NumPy array of feature values, one per sample.
        Y: NumPy array of regression targets aligned with ``X``.
        depth: Not read by this function; exactly one split is always made.
        x_query: Feature value to predict for. Defaults to 25, the value the
            original implementation had hard-coded.

    Returns:
        The prediction ``f0 + lr * leaf_output`` for the leaf that
        ``x_query`` falls into.
    """
    # Step 1: initial value f0 of the model and the residuals against it
    f0 = np.mean(Y)
    print(f"Initial value: {f0}")
    residuals = Y - f0

    # Step 2: similarity score of the root
    root_score = calculate_similarity_score(residuals)
    print(f"Similarity score: {root_score}")

    # Step 3: score every candidate threshold. The candidates are hard-coded;
    # they are the midpoints between the feature values 23, 24, 26 and 27.
    split_points = [23.5, 25, 26.5]
    best_split = None
    best_gain = -np.inf

    for split in split_points:
        left_mask = X < split
        right_mask = X >= split

        left_score = calculate_similarity_score(residuals[left_mask])
        right_score = calculate_similarity_score(residuals[right_mask])

        gain = left_score + right_score - root_score

        if split == 23.5:
            print(f"left score of 23.5: {left_score}")

        # Strict '>' keeps the earliest candidate when gains tie.
        if gain > best_gain:
            best_gain = gain
            best_split = split

    # Step 4: report the winning split and its gain
    print(f"Best split: {best_split}")
    print(f"Best gain: {best_gain}")

    # Step 5: compute the output value of each leaf
    left_mask = X < best_split
    right_mask = X >= best_split

    left_output = np.sum(residuals[left_mask]) / (np.sum(left_mask) + lambda_param)
    right_output = np.sum(residuals[right_mask]) / (np.sum(right_mask) + lambda_param)

    # Step 6: predict for x_query using the leaf it falls into
    if x_query < best_split:
        prediction = f0 + lr * left_output
    else:
        prediction = f0 + lr * right_output

    return prediction

# Fit the single-split regression tree on the toy data and print the value
# build_tree returns (its prediction for x = 25).
result = build_tree(X, Y, depth)
print(f"Predicted value for x = 25: {result}")

Initial value: 71.25
Similarity score: 0.0
left score of 23.5: 451.5625
Best split: 23.5
Best gain: 602.0833333333334
Predicted value for x = 25: 73.375


# Implement XGBoost for classification

In [11]:
import numpy as np

# Toy training set: one feature value and one 0/1 class label per sample.
X = np.asarray([23, 24, 26, 27])
Y = np.asarray([0, 0, 1, 1])

# Hyperparameters read by the functions below.
lambda_param = 0  # added to the denominator of the similarity score
depth = 1  # handed to build_tree by the driver code
lr = 0.3  # learning rate: scales the leaf output before it is added to the log-odds

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)) of ``x``.

    ``x`` may be a scalar or a NumPy array; arrays are handled element-wise.
    """
    exp_negative = np.exp(-x)
    return 1 / (1 + exp_negative)

def calculate_similarity_score(residuals, previous_probability):
    """Return the similarity score of a node for classification.

    The score is the squared sum of ``residuals`` divided by the sum of
    ``p * (1 - p)`` over ``previous_probability`` plus the module-level
    ``lambda_param``.
    """
    residual_total = np.sum(residuals)
    probability_weight = np.sum(previous_probability * (1 - previous_probability))
    return (residual_total ** 2) / (probability_weight + lambda_param)

def build_tree(X, Y, depth, x_query=25):
    """Fit a one-split XGBoost-style classification stump and predict at ``x_query``.

    Every sample starts with probability 0.5. Each candidate threshold is
    scored by its gain (left score + right score - root score), the best one
    is kept, and the matching leaf output, scaled by the module-level ``lr``,
    is added to the initial log-odds. The result is mapped back to a
    probability with ``sigmoid``. Intermediate values are printed.

    Args:
        X: NumPy array of feature values, one per sample.
        Y: NumPy array of 0/1 class labels aligned with ``X``.
        depth: Not read by this function; exactly one split is always made.
        x_query: Feature value to predict for. Defaults to 25, the value the
            original implementation had hard-coded.

    Returns:
        The predicted probability for ``x_query``.
    """
    # Step 1: initial value f0 of the model
    f0 = 0.5  # Initial probability
    previous_probability = np.full_like(Y, f0, dtype=float)

    # Convert the labels to integers before taking residuals
    Y_numeric = Y.astype(int)

    residuals = Y_numeric - previous_probability
    print(f"residuals: {residuals}")

    # Step 2: similarity score of the root
    root_score = calculate_similarity_score(residuals, previous_probability)
    print(f"root score: {root_score}")

    # Step 3: score every candidate threshold. The candidates are hard-coded;
    # they are the midpoints between the feature values 23, 24, 26 and 27.
    split_points = [23.5, 25, 26.5]
    best_split = None
    best_gain = -np.inf

    for split in split_points:
        left_mask = X < split
        right_mask = X >= split

        left_score = calculate_similarity_score(residuals[left_mask], previous_probability[left_mask])
        right_score = calculate_similarity_score(residuals[right_mask], previous_probability[right_mask])

        if split == 26.5:
            print(f"left score: {left_score}")

        gain = left_score + right_score - root_score

        if split == 25:
            print(f"gain with 25: {gain}")

        # Strict '>' keeps the earliest candidate when gains tie.
        if gain > best_gain:
            best_gain = gain
            best_split = split

    # Step 4: report the winning split and its gain
    print(f"Best split: {best_split}")
    print(f"Best gain: {best_gain}")

    # Step 5: compute the output value of each leaf.
    # Both masks must be built from best_split: after the loop, `split` still
    # holds the last candidate tried, which is not necessarily the winner.
    left_mask = X < best_split
    right_mask = X >= best_split

    left_weight = previous_probability[left_mask] * (1 - previous_probability[left_mask])
    right_weight = previous_probability[right_mask] * (1 - previous_probability[right_mask])

    # lambda_param is included in the denominator so the leaf output uses the
    # same regularised denominator as calculate_similarity_score.
    left_output = np.sum(residuals[left_mask]) / (np.sum(left_weight) + lambda_param)
    right_output = np.sum(residuals[right_mask]) / (np.sum(right_weight) + lambda_param)

    # Step 6: predict for x_query, starting from the log-odds of f0
    initial_log_odds = np.log(f0 / (1 - f0))
    if x_query < best_split:
        log_prediction = initial_log_odds + lr * left_output
    else:
        log_prediction = initial_log_odds + lr * right_output

    probability = sigmoid(log_prediction)

    return probability


# Fit the single-split classification tree on the toy data and print the
# value build_tree returns (its predicted probability for x = 25).
result = build_tree(X, Y, depth)
print(f"Predicted probability for x = 25: {result}")

residuals: [-0.5 -0.5  0.5  0.5]
root score: 0.0
gain with 25: 4.0
left score: 0.3333333333333333
Best split: 25
Best gain: 4.0
Predicted probability for x = 25: 0.6456563062257954
