# Import

In [1]:
import numpy as np
import time
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score

np.random.seed(1022025)

In [2]:
def MSE(y_pred: np.ndarray, y: np.ndarray) -> float:
    """
    Compute the mean squared error between predictions and targets.

    Parameters:
        y_pred (array-like): Predicted values.
        y (array-like): Ground-truth values; must broadcast against y_pred.

    Returns:
        float: Mean of the element-wise squared differences.
    """
    # Coerce up front so plain lists/tuples work, not only ndarrays
    # (list - list is a TypeError in Python).
    residual = np.asarray(y_pred) - np.asarray(y)
    return np.mean(residual ** 2)

In [3]:
class LinearRegressionImplement:
    """
    Linear regression trained with mini-batch SGD.

    Supports optional L1/L2 penalties (never applied to the bias weight) and
    early stopping based on the full-data training MSE.
    """

    def __init__(self, learning_rate=0.01, epochs=100, batch_size=32, l1_reg=0.0, l2_reg=0.0, tol=1e-3, patience=10):
        """
        Initialize the Linear Regression model using Mini-batch SGD.

        Parameters:
            learning_rate (float): Learning rate for gradient descent.
            epochs (int): Maximum number of passes over the training data.
            batch_size (int): Number of samples per mini-batch.
            l1_reg (float): L1 regularization coefficient (λ1).
            l2_reg (float): L2 regularization coefficient (λ2).
            tol (float): Tolerance for early stopping (min loss improvement).
            patience (int): Number of epochs to wait before early stopping.
        """
        self.lr = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.l1_reg = l1_reg
        self.l2_reg = l2_reg
        self.tol = tol
        self.patience = patience
        self.weights = None  # Set by fit(); index 0 is the bias weight.
        self.loss_history = []  # Per-epoch training MSE of the latest fit().

    @staticmethod
    def _add_bias(X):
        """Return X with a leading column of ones (the bias feature)."""
        return np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)

    def fit(self, X, y, verbose=False):
        """
        Train the model using Mini-batch Stochastic Gradient Descent (Mini-batch SGD).

        Weights are drawn from NumPy's global random state, so seed it with
        np.random.seed for reproducible runs. The early-stopping message is
        printed regardless of `verbose`.

        Parameters:
            X (array-like): Feature matrix of shape (n_samples, n_features).
            y (array-like): Target vector with n_samples values; a column
                vector of shape (n_samples, 1) is flattened.
            verbose (bool): If True, print the training loss every 5 epochs.

        Returns:
            self: The fitted model.

        Raises:
            ValueError: If X is not 2-D, X and y disagree on the number of
                samples, there are no samples, or batch_size is not positive.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y: a (n, 1) target would otherwise broadcast against the
        # (n,) predictions into an (n, n) error matrix.
        y = np.asarray(y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent sample counts: {X.shape[0]} != {y.shape[0]}"
            )
        if X.shape[0] == 0:
            raise ValueError("Cannot fit on an empty dataset")
        if self.batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")

        # Add a bias term (column of ones)
        X_b = self._add_bias(X)

        # Initialize weights randomly
        n_samples, n_features = X_b.shape
        self.weights = np.random.randn(n_features)

        # Start a fresh history so repeated fit() calls do not accumulate.
        self.loss_history = []

        # Regularization mask (avoid bias regularization); loop-invariant,
        # so build it once instead of once per mini-batch.
        reg_mask = np.ones(n_features)
        reg_mask[0] = 0.0

        # Early stopping variables
        best_loss = float("inf")
        no_improve_count = 0

        # Training loop
        for epoch in range(self.epochs):
            # Shuffle data
            shuffled_indices = np.random.permutation(n_samples)
            X_shuffled, y_shuffled = X_b[shuffled_indices], y[shuffled_indices]

            # Process mini-batches (the last one may be smaller)
            for i in range(0, n_samples, self.batch_size):
                X_batch = X_shuffled[i:i + self.batch_size]
                y_batch = y_shuffled[i:i + self.batch_size]

                # Compute predictions
                y_pred = np.dot(X_batch, self.weights)
                error = y_pred - y_batch

                # Gradient of the data term, averaged over the batch
                grad = np.dot(X_batch.T, error) / len(y_batch)

                # Regularization terms
                grad_l1 = self.l1_reg * np.sign(self.weights) * reg_mask
                grad_l2 = self.l2_reg * self.weights * reg_mask

                # Total gradient update
                self.weights -= self.lr * (grad + grad_l1 + grad_l2)

            # Compute loss after full epoch (plain MSE, penalties excluded)
            y_pred_all = np.dot(X_b, self.weights)
            mse = np.mean((y - y_pred_all) ** 2)
            self.loss_history.append(mse)

            # Print loss every 5 epochs
            if epoch % 5 == 0 and verbose:
                print(f"Epoch {epoch}: Loss = {mse:.6f}")

            # Early stopping check: an epoch counts as an improvement only
            # if it beats the best loss by more than tol.
            if mse < best_loss - self.tol:
                best_loss = mse
                no_improve_count = 0
            else:
                no_improve_count += 1
                if no_improve_count >= self.patience:
                    print(f"Early stopping at epoch {epoch}. Best loss: {best_loss:.6f}")
                    break

        return self

    def predict(self, X):
        """
        Make predictions using the trained linear regression model.

        Parameters:
            X (array-like): Feature matrix of shape (n_samples, n_features).

        Returns:
            np.array: Predicted values, one per row of X.

        Raises:
            RuntimeError: If called before fit().
        """
        if self.weights is None:
            raise RuntimeError("Model is not fitted yet; call fit() before predict().")
        X_b = self._add_bias(np.asarray(X, dtype=float))
        return np.dot(X_b, self.weights)


# Evaluation

In [4]:
# Load dataset
data = load_breast_cancer()
X, y = data.data, data.target  # Features and labels

# Split dataset (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features; the scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# === Train Custom Model ===
# Time with perf_counter: it is monotonic and high-resolution, whereas
# time.time() is a wall clock that can be adjusted and may be too coarse
# for millisecond-scale intervals.
start_time = time.perf_counter()
custom_model = LinearRegressionImplement(learning_rate=0.1, epochs=100, batch_size=64, l1_reg=0.001, l2_reg=0.005)
custom_model.fit(X_train, y_train)
end_time = time.perf_counter()
custom_time = end_time - start_time

# Predictions
y_pred_custom = custom_model.predict(X_test)

# === Train Sklearn Model ===
start_time = time.perf_counter()
sklearn_model = LinearRegression()
sklearn_model.fit(X_train, y_train)
end_time = time.perf_counter()
sklearn_time = end_time - start_time

# Predictions
y_pred_sklearn = sklearn_model.predict(X_test)

# === Evaluation Metrics ===
def evaluate(y_true, y_pred):
    """
    Score continuous predictions as a regression and as a binary classifier.

    Parameters:
        y_true (array-like): Ground-truth labels (0 or 1).
        y_pred (array-like): Continuous model outputs.

    Returns:
        tuple: (mse, mae, r2, accuracy), where accuracy is computed after
        thresholding y_pred at 0.5.
    """
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    # Threshold at 0.5 rather than rounding: a linear model's output is
    # unbounded, so rounding can yield labels outside {0, 1} (1.6 -> 2,
    # -0.7 -> -1) that are scored as errors even though they fall on the
    # correct side of the decision boundary.
    y_label = (np.asarray(y_pred) >= 0.5).astype(int)
    accuracy = accuracy_score(y_true, y_label)
    return mse, mae, r2, accuracy

# Score both models on the held-out test split.
custom_scores = evaluate(y_test, y_pred_custom)
sklearn_scores = evaluate(y_test, y_pred_sklearn)
mse_custom, mae_custom, r2_custom, acc_custom = custom_scores
mse_sklearn, mae_sklearn, r2_sklearn, acc_sklearn = sklearn_scores

# === Print Evaluation Table ===
separator = "=" * 65
print("\nEvaluation Metrics Comparison:")
print(separator)
print(f"{'Metric':<20}{'Custom Model':<20}{'Sklearn Model'}")
print(separator)

# One row per metric: (label, custom model value, sklearn model value).
table_rows = (
    ("MSE", mse_custom, mse_sklearn),
    ("MAE", mae_custom, mae_sklearn),
    ("R² Score", r2_custom, r2_sklearn),
    ("Accuracy", acc_custom, acc_sklearn),
    ("Training Time (s)", custom_time, sklearn_time),
)
for label, custom_value, sklearn_value in table_rows:
    print(f"{label:<20}{custom_value:<20.6f}{sklearn_value:.6f}")


Early stopping at epoch 85. Best loss: 0.061318

Evaluation Metrics Comparison:
Metric              Custom Model        Sklearn Model
MSE                 0.063270            0.064109
MAE                 0.202562            0.196904
R² Score            0.730673            0.727102
Accuracy            0.956140            0.947368
Training Time (s)   0.024916            0.007611
