# In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Fix the global NumPy RNG state so every run draws the same features and noise.
# The order of the random draws below matters: x is sampled first, then noise.
np.random.seed(42)

# Number of (x, y) pairs in the synthetic training set
num_samples = 100

# Features: np.random.rand samples uniformly from [0, 1), so scaling by 2
# spreads x over [0, 2). The result is a single column of shape (num_samples, 1).
x_train = 2 * np.random.rand(num_samples, 1)

# Targets follow the line y = true_intercept + true_slope * x, perturbed by
# standard-normal noise of the same (num_samples, 1) shape.
true_slope = 3
true_intercept = 4
noise = np.random.randn(num_samples, 1)
y_train = true_intercept + true_slope * x_train + noise

# Scatter-plot the synthetic data to eyeball the linear trend and noise level
plt.figure(figsize=(8, 5))
plt.scatter(x_train, y_train, color="blue", label="Training data", alpha=0.7)
plt.title("Dummy Linear Regression Data")
plt.xlabel("x_train")
plt.ylabel("y_train")
plt.legend()
plt.grid(True)
plt.show()

# Sanity check: both arrays are built with shape (num_samples, 1)
print("Features shape is:", x_train.shape)
print("Target shape is:", y_train.shape)

# Draw a random candidate threshold between two values
def thresh(x1, x2):
    """Return one sample from ``np.random.uniform(low=x1, high=x2)``.

    The draw consumes the global NumPy random state, so results are
    reproducible only after ``np.random.seed`` has been called.
    """
    return np.random.uniform(low=x1, high=x2)

# Impurity measure: variance of the values in a node
def vari(x):
    """Return ``np.var(x)``, i.e. the variance with NumPy's default settings."""
    impurity = np.var(x)
    return impurity

# Find the split threshold that maximizes variance reduction (information gain)
def find_best_split(x, y=None):
    """Search random candidate thresholds on ``x`` for the largest variance reduction.

    One candidate threshold is drawn with ``thresh`` between every pair of
    consecutive entries of ``x``. A candidate ``t`` sends samples with
    ``x < t`` to the left and ``x >= t`` to the right (the same rule that
    ``build_tree`` and ``predict_tree`` apply) and is scored by

        IG = Var(parent) - (n_left / n) * Var(left) - (n_right / n) * Var(right)

    Because the candidates are random, the result depends on the global NumPy
    random state.

    Args:
        x: Feature values, one sample per entry along the first axis.
        y: Optional target values with the same shape as ``x``. When given,
            the variances are computed on ``y``, which is what a regression
            tree needs. When omitted, they are computed on ``x`` itself,
            matching the original single-argument behaviour.

    Returns:
        A ``(best_threshold, best_IG)`` tuple. ``best_threshold`` is ``None``
        and ``best_IG`` is ``0`` when no candidate yields a positive gain.
    """
    targets = x if y is None else y
    best_IG = 0
    best_threshold = None
    n = len(x)

    if n < 2:
        # No consecutive pair exists, so there is nothing to split on.
        return best_threshold, best_IG

    # The parent impurity is the same for every candidate; compute it once.
    H = vari(targets)

    for first, second in zip(x[:-1], x[1:]):
        # x is not sorted, so order the pair explicitly: NumPy documents
        # uniform sampling with high < low as undefined behaviour.
        t = thresh(np.minimum(first, second), np.maximum(first, second))

        # Partition exactly as the tree will, so the scored split is the
        # split that actually gets applied.
        left = targets[x < t]
        right = targets[x >= t]

        if len(left) == 0 or len(right) == 0:
            continue

        IG = H - (len(left) / n) * vari(left) - (len(right) / n) * vari(right)

        if IG > best_IG:
            best_IG = IG
            best_threshold = t

    return best_threshold, best_IG

# Recursively build the regression tree
def build_tree(x, y, depth=0, max_depth=3, min_size=5):
    """Recursively grow a regression tree represented as nested dicts.

    Args:
        x: Feature values, one sample per entry along the first axis.
        y: Target values with the same shape as ``x``.
        depth: Depth of the node being built; incremented by one per level.
        max_depth: A node at this depth (or deeper) becomes a leaf.
        min_size: A node holding fewer than this many samples becomes a leaf.

    Returns:
        A leaf ``{"leaf": True, "value": mean_of_y}`` or an internal node
        ``{"leaf": False, "threshold", "info_gain", "left", "right"}`` where
        ``left`` holds the samples with ``x < threshold`` and ``right`` the
        samples with ``x >= threshold``.
    """
    if len(x) < min_size or depth >= max_depth:
        return {"leaf": True, "value": np.mean(y)}

    # Score candidate splits on the targets, not on the feature values.
    threshold, ig = find_best_split(x, y)
    if threshold is None or ig <= 0:
        return {"leaf": True, "value": np.mean(y)}

    left_indices = x < threshold
    right_indices = x >= threshold

    # Boolean-mask indexing flattens (n, 1) column arrays, so the children
    # receive 1-D arrays even when the root was given 2-D input.
    left_x, right_x = x[left_indices], x[right_indices]
    left_y, right_y = y[left_indices], y[right_indices]

    return {
        "leaf": False,
        "threshold": threshold,
        "info_gain": ig,
        # Both children must inherit the caller's limits; otherwise the
        # right subtree would silently fall back to the default max_depth.
        "left": build_tree(left_x, left_y, depth + 1, max_depth, min_size),
        "right": build_tree(right_x, right_y, depth + 1, max_depth, min_size),
    }

# Grow the regression tree from the training data.
# NOTE(review): build_tree stops once depth >= max_depth and adds 1 per level,
# so starting at depth=3 with max_depth=5 allows at most two levels of splits.
# Confirm this is intended rather than starting from depth=0.
tree = build_tree(x=x_train, y=y_train, depth=3, max_depth=5)

# Predict a single sample by walking the tree from the root to a leaf
def predict_tree(tree, x):
    """Return the leaf value reached by routing ``x`` through ``tree``.

    At each internal node the sample goes left when ``x < threshold`` and
    right otherwise. A threshold stored as a NumPy array is converted to a
    plain scalar with ``.item()`` before the comparison.
    """
    node = tree
    while not node["leaf"]:
        cut = node["threshold"]
        if isinstance(cut, np.ndarray):
            cut = cut.item()
        node = node["left"] if x < cut else node["right"]
    return node["value"]

# Predict every sample in a collection
def predict_all(tree, X):
    """Return a NumPy array with ``predict_tree(tree, x)`` for each ``x`` in ``X``."""
    outputs = []
    for sample in X:
        outputs.append(predict_tree(tree, sample))
    return np.array(outputs)

# Plot the training data together with the tree's prediction curve
def plot_regression_predictions(x_train, y_train, tree):
    """Show the training scatter and the tree's predictions over the x range.

    Predictions are evaluated with ``predict_all`` on 300 evenly spaced points
    between the smallest and largest value of ``x_train``.
    """
    lowest, highest = np.min(x_train), np.max(x_train)
    grid = np.linspace(lowest, highest, 300)
    fitted = predict_all(tree, grid)

    plt.scatter(x_train, y_train, color="blue", label="Training Data")
    plt.plot(grid, fitted, color="red", linewidth=2, label="Tree Prediction")

    plt.title("Regression Tree Prediction")
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.grid(True)
    plt.legend()
    plt.show()

# Draw the fitted tree's predictions on top of the training data
plot_regression_predictions(x_train=x_train, y_train=y_train, tree=tree)
