# 📈 Regression Template

In this notebook, we explore regression using:

- **Synthetic data** with Gaussian noise
  
Evaluation metrics (MAE, MSE, R²) and visualizations are included to assess and compare the implementations.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from colorama import Fore, Style
from numpy.typing import NDArray
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

## 🛠️ Utils

In [2]:
def evaluate_regression(
    y_true: NDArray[np.float64], y_pred: NDArray[np.float64]
) -> dict[str, float]:
    """Score predictions against ground truth and print a colored summary.

    Args:
        y_true (NDArray[np.float64]): Ground-truth target values.
        y_pred (NDArray[np.float64]): Model predictions for the same samples.

    Returns:
        dict[str, float]: The computed scores under the keys ``"mae"``
        (mean absolute error), ``"mse"`` (mean squared error) and ``"r2"``
        (R² score).
    """
    scores = {
        "mae": mean_absolute_error(y_true, y_pred),
        "mse": mean_squared_error(y_true, y_pred),
        "r2": r2_score(y_true, y_pred),
    }

    # Human-readable label for each metric, in the order they are printed.
    labels = {
        "mae": "Mean Absolute Error (MAE)",
        "mse": "Mean Squared Error (MSE)",
        "r2": "R² Score",
    }
    for key, label in labels.items():
        # Only the label is colored; the value is printed with 4 decimals.
        print(f"{Fore.CYAN}{label}:{Style.RESET_ALL} {scores[key]:.4f}")

    return scores

In [3]:
def plot_regression_results(
    x: NDArray[np.float64],
    y: NDArray[np.float64],
    X_test: NDArray[np.float64],
    y_pred: NDArray[np.float64],
    w: float,
    b: float,
    title: str = "Regression Plot",
) -> None:
    """Plot noisy data points, model predictions, and the original regression line.

    Three layers are drawn on the current matplotlib axes and then shown:
    a scatter of the noisy samples, a red line through the model's
    predictions, and a green reference line ``b + w * x`` evaluated on 100
    evenly spaced points spanning the range of ``x``.

    Args:
        x (NDArray[np.float64]): Full input features used to generate noisy data.
        y (NDArray[np.float64]): Noisy target values.
        X_test (NDArray[np.float64]): Test feature values used for predictions.
        y_pred (NDArray[np.float64]): Predicted values from the regression model.
        w (float): True slope of the underlying model (for reference line).
        b (float): True intercept of the underlying model.
        title (str): Title of the plot. Default is "Regression Plot".
    """
    # np.min/np.max reduce over the whole array and always yield a scalar.
    # The builtin min()/max() iterate element by element in Python and return
    # a 1-element array (not a scalar) when x is a column vector.
    x_range = np.linspace(np.min(x), np.max(x), 100)
    y_line = b + x_range * w

    # np.ravel flattens any input shape (and accepts array-likes) so that
    # seaborn always receives 1-D coordinates.
    sns.scatterplot(x=np.ravel(x), y=np.ravel(y), alpha=0.3, label="Noisy data points")
    sns.lineplot(
        x=np.ravel(X_test),
        y=np.ravel(y_pred),
        color="red",
        label="Regression line (prediction)",
    )
    sns.lineplot(
        x=x_range,
        y=y_line,
        color="green",
        label="Original regression line (no noise)",
    )
    plt.title(title)
    # Label the axes so the figure can be read on its own.
    plt.xlabel("x")
    plt.ylabel("y")
    plt.legend()
    plt.show()

## 📊 Generate Dataset

In [4]:
def generate_data(
    n: int, w: float, b: float, noise_level: float = 1.0, seed: int = 42
) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
    """Sample points from the noisy line ``y = w * x + b + noise``.

    Args:
        n (int): How many samples to draw.
        w (float): Slope of the underlying line.
        b (float): Intercept of the underlying line.
        noise_level (float): Standard deviation of the zero-mean Gaussian
            noise added to each target.
        seed (int): Seed for the random number generator, so repeated calls
            with the same arguments return the same data.

    Returns:
        tuple[NDArray[np.float64], NDArray[np.float64]]: ``(x, y)`` arrays of
        length ``n``: the x values drawn uniformly between 0 and 25 and the
        corresponding noisy y values.
    """
    generator = np.random.default_rng(seed)
    # Draw order matters for reproducibility: features first, then the noise.
    features = generator.uniform(low=0, high=25, size=n)
    noise = generator.normal(loc=0, scale=noise_level, size=n)
    targets = w * features + b + noise
    return features, targets


# Ground-truth parameters of the underlying line y = w * x + b
w, b = 2.5, 1.5  # slope coefficient, intercept (bias term)

# Sample 1000 noisy observations around that line
x, y = generate_data(1000, w, b, noise_level=4.5)

In [5]:
# Hold out 20% of the samples for testing (fixed random_state for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Copy the 1-D feature arrays into single-column 2-D arrays, the shape used for sklearn
X_train_np = np.reshape(np.array(X_train), (-1, 1))
X_test_np = np.reshape(np.array(X_test), (-1, 1))