# Linear Regression from Scratch

This notebook implements linear regression from scratch using NumPy, demonstrating the mathematical foundations and implementation details.

In [None]:
# Import required libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple, Optional
import sys
import os

# Add project root to path.
# Notebooks do not define __file__, so the root is located relative to the current
# working directory (two levels up). The path is made absolute so that a later
# os.chdir() cannot change what the entry refers to, and the membership check keeps
# re-running this cell from appending duplicate entries.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Set style for plots: apply the 'seaborn-v0_8' style sheet through matplotlib,
# then select the "husl" color palette through seaborn.
# NOTE(review): the palette is set after the style sheet, presumably so the style
# sheet cannot override it -- keep this order unless confirmed otherwise.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## Mathematical Foundation of Linear Regression

Linear regression models the relationship between a dependent variable $y$ and one or more independent variables $X$ using a linear approach.

### Hypothesis Function
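$$h_\theta(x) = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + \cdots + \theta_n x_n = \theta^T x$$

where $x_0 = 1$ by convention, so that the intercept $\theta_0$ is absorbed into the vector product $\theta^T x$.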

### Cost Function (Mean Squared Error)
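$$J(\theta) = \frac{1}{2m} \sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)})^2$$

Here $m$ is the number of training examples. The factor $\frac{1}{2}$ is a convenience that cancels when differentiating, so $J(\theta)$ is half the mean squared error.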

### Normal Equation (Closed-form solution)
$$\theta = (X^T X)^{-1} X^T y$$

### Gradient Descent
$$\theta_j := \theta_j - \alpha \frac{\partial}{\partial \theta_j} J(\theta)$$

where $\alpha$ is the learning rate, all $\theta_j$ are updated simultaneously, and $\frac{\partial}{\partial \theta_j} J(\theta) = \frac{1}{m} \sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)}) x_j^{(i)}$.

In [None]:
# Implement Linear Regression from scratch

class LinearRegressionFromScratch:
    """
    Linear Regression implementation from scratch using NumPy.

    Supports both the closed-form solution (Normal Equation) and batch
    gradient descent, with optional L1 or L2 regularization. The bias term
    is never regularized.

    Objective being minimised (m = number of training samples):

        J = 1/(2m) * sum((y_pred - y)^2) + penalty
        penalty = lambda_reg/(2m) * sum(weights^2)   for regularization='l2'
        penalty = lambda_reg/m    * sum(|weights|)   for regularization='l1'

    The closed-form ridge solution and the gradient-descent updates are both
    derived from this objective, so the same ``lambda_reg`` yields the same
    optimum with either solver.

    After ``fit``:
        weights      -- coefficient vector of shape (n_features,)
        bias         -- intercept
        cost_history -- one cost value per gradient-descent iteration
                        (stays empty when the normal equation is used)
    """

    def __init__(self, method: str = 'normal', learning_rate: float = 0.01,
                 n_iterations: int = 1000, regularization: Optional[str] = None,
                 lambda_reg: float = 0.01):
        """
        Initialize Linear Regression model.

        Args:
            method: 'normal' for closed-form solution, 'gd' for gradient descent
            learning_rate: Learning rate for gradient descent
            n_iterations: Number of iterations for gradient descent
            regularization: 'l1', 'l2', or None
            lambda_reg: Regularization strength
        """
        self.method = method
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.regularization = regularization
        self.lambda_reg = lambda_reg
        self.weights = None
        self.bias = None
        self.cost_history = []

    def _add_bias(self, X: np.ndarray) -> np.ndarray:
        """Prepend a column of ones to the feature matrix (the bias term)."""
        return np.c_[np.ones(X.shape[0]), X]

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'LinearRegressionFromScratch':
        """
        Fit the linear regression model.

        L1 regularization has no closed-form solution, so it is always
        solved by gradient descent regardless of ``method``. The configured
        ``self.method`` is left untouched.

        Args:
            X: Training features of shape (n_samples, n_features)
            y: Training targets of shape (n_samples,)

        Returns:
            Self (for method chaining)

        Raises:
            ValueError: if X is not 2-D, X and y disagree on the number of
                samples, the method is unknown, or gradient descent is
                requested with a target that is not 1-D.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D with shape (n_samples, n_features); got shape {X.shape}"
            )
        if y.shape[:1] != X.shape[:1]:
            raise ValueError(
                f"X and y have inconsistent sample counts: {X.shape[0]} vs {y.shape[:1]}"
            )

        # Pick the solver locally instead of overwriting self.method, so the
        # instance keeps the configuration it was constructed with.
        solver = 'gd' if self.regularization == 'l1' else self.method

        # Every fit starts from a clean history; otherwise repeated fits
        # would concatenate their cost curves.
        self.cost_history = []

        if solver == 'normal':
            self._fit_normal_equation(X, y)
        elif solver == 'gd':
            self._fit_gradient_descent(X, y)
        else:
            raise ValueError(
                f"Unknown method {self.method!r}; expected 'normal' or 'gd'"
            )

        return self

    def _fit_normal_equation(self, X: np.ndarray, y: np.ndarray) -> None:
        """Solve (X^T X + reg) theta = X^T y for the bias and weights."""
        X_with_bias = self._add_bias(X)
        n_params = X_with_bias.shape[1]

        # Ridge regression: add lambda * I to X^T X, skipping the bias entry.
        reg_term = np.zeros((n_params, n_params))
        if self.regularization == 'l2':
            reg_term[1:, 1:] = self.lambda_reg * np.eye(n_params - 1)

        gram = X_with_bias.T @ X_with_bias + reg_term
        rhs = X_with_bias.T @ y

        try:
            # Solving the linear system avoids forming an explicit inverse.
            theta = np.linalg.solve(gram, rhs)
        except np.linalg.LinAlgError:
            # Singular system (e.g. duplicated features): fall back to the
            # minimum-norm solution from the pseudo-inverse.
            theta = np.linalg.pinv(gram) @ rhs

        self.bias = theta[0]
        self.weights = theta[1:]

    def _fit_gradient_descent(self, X: np.ndarray, y: np.ndarray) -> None:
        """Run ``n_iterations`` steps of batch gradient descent."""
        if y.ndim != 1:
            raise ValueError(
                f"Gradient descent expects y of shape (n_samples,); got shape {y.shape}"
            )

        n_features = X.shape[1]
        # Small random initial weights drawn from NumPy's global RNG; seed it
        # with np.random.seed(...) beforehand for reproducible runs.
        self.weights = np.random.normal(0, 0.01, n_features)
        self.bias = 0.0

        for _ in range(self.n_iterations):
            # Forward pass
            y_pred = self.predict(X)

            # Record the cost before stepping so the prediction error and the
            # penalty term are evaluated at the same parameters.
            self.cost_history.append(self._compute_cost(y, y_pred))

            # Calculate gradients and update parameters
            dw, db = self._compute_gradients(X, y, y_pred)
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def _compute_gradients(self, X: np.ndarray, y: np.ndarray,
                          y_pred: np.ndarray) -> Tuple[np.ndarray, float]:
        """
        Compute the gradient of the cost with respect to weights and bias.

        The penalty gradients carry the same 1/m factor as the penalty terms
        in ``_compute_cost``: (lambda/m) * w for L2 and (lambda/m) * sign(w)
        (a subgradient) for L1.

        Returns:
            Tuple (dw, db) with dw of shape (n_features,) and scalar db.
        """
        m = X.shape[0]
        error = y_pred - y

        # Gradient of the data term for the weights
        dw = (1/m) * X.T @ error

        # Add the gradient of the regularization term
        if self.regularization == 'l2':
            dw += (self.lambda_reg / m) * self.weights
        elif self.regularization == 'l1':
            # Subgradient for L1: sign of weights
            dw += (self.lambda_reg / m) * np.sign(self.weights)

        # Gradient for bias (not regularized)
        db = (1/m) * np.sum(error)

        return dw, db

    def _compute_cost(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Compute the half mean squared error plus the regularization penalty."""
        m = y_true.shape[0]
        cost = (1/(2*m)) * np.sum((y_true - y_pred) ** 2)

        # Add regularization term
        if self.regularization == 'l2':
            cost += (self.lambda_reg / (2*m)) * np.sum(self.weights ** 2)
        elif self.regularization == 'l1':
            cost += (self.lambda_reg / m) * np.sum(np.abs(self.weights))

        return cost

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Make predictions on new data.

        Args:
            X: Features of shape (n_samples, n_features)

        Returns:
            Predictions of shape (n_samples,)

        Raises:
            RuntimeError: if the model has not been fitted yet.
        """
        if self.weights is None or self.bias is None:
            raise RuntimeError("Model is not fitted yet; call fit() before predict().")

        X = np.asarray(X)
        return X @ self.weights + self.bias

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """
        Calculate R-squared score.

        Args:
            X: Features of shape (n_samples, n_features)
            y: True targets of shape (n_samples,)

        Returns:
            R-squared score. For a constant target (zero total variance) the
            ratio is undefined; 1.0 is returned when the predictions are
            exact and 0.0 otherwise.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)

        if ss_tot == 0:
            # Avoid dividing by zero on a constant target.
            return 1.0 if ss_res == 0 else 0.0

        return 1 - (ss_res / ss_tot)

In [None]:
# Test the Linear Regression implementation
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression as SklearnLinearRegression

# Generate sample data
X, y = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (the scaler is fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Seed NumPy's global RNG: the gradient-descent solver draws its initial
# weights from it, so without a seed the printed metrics vary between runs.
np.random.seed(42)

# Train our model
lr_model = LinearRegressionFromScratch(method='gd', learning_rate=0.01, n_iterations=1000)
lr_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = lr_model.predict(X_test_scaled)

# Calculate metrics
score = lr_model.score(X_test_scaled, y_test)
mse = np.mean((y_test - y_pred) ** 2)

print(f"Our Linear Regression - R-squared: {score:.4f}, MSE: {mse:.4f}")

# Compare with sklearn
sklearn_lr = SklearnLinearRegression()
sklearn_lr.fit(X_train_scaled, y_train)
sklearn_pred = sklearn_lr.predict(X_test_scaled)
sklearn_score = sklearn_lr.score(X_test_scaled, y_test)
sklearn_mse = np.mean((y_test - sklearn_pred) ** 2)

print(f"Sklearn Linear Regression - R-squared: {sklearn_score:.4f}, MSE: {sklearn_mse:.4f}")

# Plot results: fitted lines on the left, training cost curve on the right
fig, (ax_fit, ax_cost) = plt.subplots(1, 2, figsize=(12, 5))

ax_fit.scatter(X_test_scaled, y_test, alpha=0.6, label='True values')
ax_fit.plot(X_test_scaled, y_pred, color='red', linewidth=2, label='Our Predictions')
ax_fit.plot(X_test_scaled, sklearn_pred, color='green', linewidth=2,
            label='Sklearn Predictions', linestyle='--')
ax_fit.set_xlabel('Feature')
ax_fit.set_ylabel('Target')
ax_fit.set_title('Linear Regression Comparison')
ax_fit.legend()
ax_fit.grid(True, alpha=0.3)

ax_cost.plot(lr_model.cost_history)
ax_cost.set_title('Cost Function Over Iterations')
ax_cost.set_xlabel('Iteration')
ax_cost.set_ylabel('Cost (MSE)')
ax_cost.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Implement Ridge and Lasso Regression

class RidgeRegression(LinearRegressionFromScratch):
    """
    Ridge Regression (L2 regularization) implementation.

    Thin wrapper around LinearRegressionFromScratch with
    ``regularization='l2'`` fixed.
    """

    def __init__(self, learning_rate: float = 0.01, n_iterations: int = 1000,
                 lambda_reg: float = 0.01, method: str = 'gd'):
        """
        Args:
            learning_rate: Learning rate for gradient descent
            n_iterations: Number of iterations for gradient descent
            lambda_reg: Regularization strength
            method: Solver forwarded to the parent class. 'gd' (the default,
                matching the previous hard-coded behavior) or 'normal' for
                the parent's closed-form solution, in which case
                learning_rate and n_iterations are not used.
        """
        super().__init__(
            method=method,
            learning_rate=learning_rate,
            n_iterations=n_iterations,
            regularization='l2',
            lambda_reg=lambda_reg
        )

class LassoRegression(LinearRegressionFromScratch):
    """
    Lasso Regression: linear regression with an L1 penalty.

    Pre-configures the parent class with ``regularization='l1'`` and the
    gradient-descent solver.
    """

    def __init__(self, learning_rate: float = 0.01, n_iterations: int = 1000, lambda_reg: float = 0.01):
        """
        Args:
            learning_rate: Learning rate for gradient descent
            n_iterations: Number of iterations for gradient descent
            lambda_reg: Regularization strength
        """
        # Fixed choices for Lasso first, then the caller-tunable settings.
        parent_settings = {
            'method': 'gd',
            'regularization': 'l1',
            'learning_rate': learning_rate,
            'n_iterations': n_iterations,
            'lambda_reg': lambda_reg,
        }
        super().__init__(**parent_settings)

# Test regularization
from sklearn.datasets import make_regression

# Generate data with more features to see regularization effects
X, y = make_regression(n_samples=100, n_features=10, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (the scaler is fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train models with different regularization
models = {
    "Linear": LinearRegressionFromScratch(method='gd', learning_rate=0.01, n_iterations=1000),
    "Ridge": RidgeRegression(learning_rate=0.01, n_iterations=1000, lambda_reg=1.0),
    "Lasso": LassoRegression(learning_rate=0.01, n_iterations=1000, lambda_reg=1.0)
}

# Seed NumPy's global RNG: each gradient-descent fit draws its initial weights
# from it, so without a seed the reported scores vary between runs.
np.random.seed(42)

results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    score = model.score(X_test_scaled, y_test)
    results[name] = {"model": model, "score": score, "predictions": y_pred}
    print(f"{name} Regression - R-squared: {score:.4f}")

# Plot regularization comparison: predicted vs. true values, one panel per model
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
diagonal = [y_test.min(), y_test.max()]  # endpoints of the perfect-prediction line

for ax, (name, result) in zip(axes, results.items()):
    ax.scatter(y_test, result["predictions"], alpha=0.6)
    ax.plot(diagonal, diagonal, 'r--', lw=2)
    ax.set_xlabel('True Values')
    ax.set_ylabel('Predictions')
    # Mathtext keeps the superscript independent of file encoding and font.
    ax.set_title(f'{name} Regression\n$R^2$ = {result["score"]:.4f}')
    ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## Key Takeaways

1. **Mathematical Foundation**: Understanding the mathematical foundation of linear regression is crucial for proper implementation
2. **Implementation Details**: Implementing from scratch reveals important details about numerical stability and convergence
3. **Regularization**: Regularization techniques (L1/L2) help prevent overfitting
4. **Gradient Descent**: Understanding optimization algorithms is essential for more complex models
5. **Verification**: Always verify your implementation against established libraries