In [None]:
import numpy as np
import pandas as pd

class LinearRegressionModel:
    """Ordinary least-squares linear regression with an intercept term.

    After ``fit`` the learned parameters live in ``self.weights``: element 0
    is the intercept, the remaining elements are the per-feature coefficients.
    """

    @staticmethod
    def _with_intercept(X):
        """Return ``X`` as an array with a leading column of ones."""
        X = np.asarray(X)
        return np.c_[np.ones(X.shape[0]), X]

    def fit(self, X, y):
        """Estimate the weights minimising the squared error on ``(X, y)``.

        Args:
            X: Feature matrix, one row per sample.
            y: Target values, one per sample.
        """
        design = self._with_intercept(X)
        # lstsq solves the least-squares problem directly instead of forming
        # and inverting X^T X, so collinear or under-determined designs yield
        # the minimum-norm solution rather than a LinAlgError or garbage.
        self.weights, *_ = np.linalg.lstsq(design, np.asarray(y), rcond=None)

    def predict(self, X):
        """Return predictions for ``X`` using the weights learned by ``fit``.

        Args:
            X: Feature matrix with the same number of columns used in ``fit``.

        Returns:
            Array with one predicted value per row of ``X``.
        """
        return self._with_intercept(X) @ self.weights


class Metrics:
    """Regression quality metrics computed with NumPy."""

    @staticmethod
    def mean_squared_error(y_true, y_pred):
        """Return the mean of the squared differences ``y_true - y_pred``.

        Args:
            y_true: Observed target values (array-like).
            y_pred: Predicted target values, same length as ``y_true``.
        """
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        return np.mean((y_true - y_pred) ** 2)

    @staticmethod
    def r_squared(y_true, y_pred):
        """Return the coefficient of determination ``1 - SS_res / SS_tot``.

        Args:
            y_true: Observed target values (array-like).
            y_pred: Predicted target values, same length as ``y_true``.

        Returns:
            The R-squared score. For empty input the result is NaN. When
            ``y_true`` has exactly zero variance the ratio is undefined, so
            1.0 is returned for a perfect prediction and 0.0 otherwise.
        """
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        if y_true.size == 0:
            return float("nan")

        ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
        ss_residual = np.sum((y_true - y_pred) ** 2)
        if ss_total == 0:
            # Constant target: avoid 0/0 and x/0, which would give NaN / -inf.
            return 1.0 if ss_residual == 0 else 0.0
        return 1 - (ss_residual / ss_total)

class CrossValidation:
    """k-fold cross-validation of ``LinearRegressionModel``.

    Args:
        k: Number of folds.
        random_state: Seed used to shuffle the samples before splitting.
    """

    def __init__(self, k, random_state=42):
        self.k = k
        self.random_state = random_state

    def _fold_indices(self, n_samples):
        """Shuffle ``range(n_samples)`` and split it into ``k`` validation folds."""
        indices = np.arange(n_samples)
        # NOTE: this seeds NumPy's *global* RNG, so any later code that draws
        # from ``np.random`` inherits this state. Kept for that reason.
        np.random.seed(self.random_state)
        np.random.shuffle(indices)
        # array_split spreads any remainder over the first folds, so every
        # sample is validated exactly once even when n_samples % k != 0.
        return np.array_split(indices, self.k)

    def k_fold_cv(self, X, y):
        """Run k-fold cross-validation and average the fold scores.

        Args:
            X: Feature matrix, one row per sample.
            y: Target values, one per sample.

        Returns:
            Tuple ``(mean MSE, mean R-squared)`` over the ``k`` folds.

        Raises:
            ValueError: If ``k`` is smaller than 2 or larger than the number
                of samples (some fold would be empty).
        """
        n_samples = len(y)
        if not 2 <= self.k <= n_samples:
            raise ValueError(
                f"k must be between 2 and the number of samples ({n_samples}), got {self.k}"
            )

        X = np.asarray(X)
        y = np.asarray(y)
        all_indices = np.arange(n_samples)
        mse_scores = []
        r2_scores = []

        for val_indices in self._fold_indices(n_samples):
            train_indices = np.setdiff1d(all_indices, val_indices)

            X_train, X_val = X[train_indices], X[val_indices]
            y_train, y_val = y[train_indices], y[val_indices]

            model = LinearRegressionModel()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)

            mse_scores.append(Metrics.mean_squared_error(y_val, y_pred))
            r2_scores.append(Metrics.r_squared(y_val, y_pred))

        return np.mean(mse_scores), np.mean(r2_scores)

class Bootstrapping:
    """Out-of-bag bootstrap evaluation of ``LinearRegressionModel``.

    Args:
        n_iterations: Number of bootstrap resamples to draw.
        random_state: Optional seed for a private RNG. ``None`` (the default)
            draws from NumPy's global RNG, as the class always did.
    """

    def __init__(self, n_iterations, random_state=None):
        self.n_iterations = n_iterations
        self.random_state = random_state

    def bootstrap(self, X, y):
        """Fit on bootstrap resamples and score on the out-of-bag samples.

        Each iteration draws ``len(y)`` indices with replacement for training
        and validates on the indices that were never drawn. Iterations whose
        out-of-bag set is empty are skipped.

        Args:
            X: Feature matrix, one row per sample.
            y: Target values, one per sample.

        Returns:
            Tuple ``(mean MSE, mean R-squared)`` over the scored iterations,
            or ``(nan, nan)`` when no iteration could be scored.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(y)
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        all_indices = np.arange(n_samples)
        mse_scores = []
        r2_scores = []

        for _ in range(self.n_iterations):
            bootstrap_indices = rng.choice(n_samples, size=n_samples, replace=True)
            oob_indices = np.setdiff1d(all_indices, bootstrap_indices)

            if oob_indices.size == 0:
                continue

            X_train, X_val = X[bootstrap_indices], X[oob_indices]
            y_train, y_val = y[bootstrap_indices], y[oob_indices]

            model = LinearRegressionModel()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)

            mse_scores.append(Metrics.mean_squared_error(y_val, y_pred))
            r2_scores.append(Metrics.r_squared(y_val, y_pred))

        if not mse_scores:
            # Nothing was scored; say so explicitly instead of letting
            # np.mean([]) emit a RuntimeWarning.
            return float("nan"), float("nan")
        return np.mean(mse_scores), np.mean(r2_scores)


class DataGenerator:
    """Generate a synthetic linear-regression data set.

    Args:
        rows: Number of samples.
        cols: Number of features.
        noise: Standard deviation of the Gaussian noise added to the target.
        seed: Seed for the generator's private random stream.
    """

    def __init__(self, rows, cols, noise=0.4, seed=10):
        self.rows = rows
        self.cols = cols
        self.noise = noise
        self.seed = seed

    def gen_data(self):
        """Return ``(X, y)`` where ``y`` is a noisy linear function of ``X``.

        Returns:
            ``X`` of shape ``(rows, cols)`` with standard-normal entries and
            ``y`` of shape ``(rows,)``.
        """
        # A private RandomState avoids reseeding NumPy's global RNG as a
        # side effect of generating data.
        rng = np.random.RandomState(self.seed)
        X = rng.randn(self.rows, self.cols)

        true_coeff = rng.randn(self.cols)
        # Features 3-5 get a zero coefficient, i.e. they carry no signal.
        # (This makes them uninformative; it does not make them collinear.)
        true_coeff[2:5] = 0

        y = np.dot(X, true_coeff) + rng.randn(self.rows) * self.noise
        return X, y


if __name__ == "__main__":
    # Demo: evaluate the linear model on synthetic data with two resampling
    # schemes and print the averaged scores.

    # 100 samples x 10 features.
    generator = DataGenerator(rows=100, cols=10)
    X, y = generator.gen_data()

    # 5-fold cross-validation. k_fold_cv seeds NumPy's global RNG, so this
    # step must stay ahead of the bootstrap step below.
    cv = CrossValidation(k=5)
    mse_cv, r2_cv = cv.k_fold_cv(X, y)
    print(f"Average k-Fold CV MSE: {mse_cv:.4f}")
    print(f"Average k-Fold CV R-Squared: {r2_cv:.4f}")

    # 100 out-of-bag bootstrap rounds. The draws come from NumPy's global RNG,
    # so the figures are reproducible only because the CV step seeded it first.
    bs = Bootstrapping(n_iterations=100)
    mse_bs, r2_bs = bs.bootstrap(X, y)
    print(f"Average Bootstrap MSE: {mse_bs:.4f}")
    print(f"Average Bootstrap R-Squared: {r2_bs:.4f}")


Average k-Fold CV MSE: 0.2110
Average k-Fold CV R-Squared: 0.9466
Average Bootstrap MSE: 0.2197
Average Bootstrap R-Squared: 0.9442


In [None]:
import numpy as np
import pandas as pd

# Linear Regression Model Class
class LinearRegressionModel:
    """Ordinary least-squares linear regression with an intercept term.

    After ``fit`` the learned parameters live in ``self.weights``: element 0
    is the intercept, the remaining elements are the per-feature coefficients.
    """

    @staticmethod
    def _with_intercept(X):
        """Return ``X`` as an array with a leading column of ones."""
        X = np.asarray(X)
        return np.c_[np.ones(X.shape[0]), X]

    def fit(self, X, y):
        """Estimate the weights minimising the squared error on ``(X, y)``.

        Args:
            X: Feature matrix, one row per sample.
            y: Target values, one per sample.
        """
        design = self._with_intercept(X)
        # lstsq solves the least-squares problem directly instead of forming
        # and inverting X^T X, so collinear or under-determined designs yield
        # the minimum-norm solution rather than a LinAlgError or garbage.
        self.weights, *_ = np.linalg.lstsq(design, np.asarray(y), rcond=None)

    def predict(self, X):
        """Return predictions for ``X`` using the weights learned by ``fit``.

        Args:
            X: Feature matrix with the same number of columns used in ``fit``.

        Returns:
            Array with one predicted value per row of ``X``.
        """
        return self._with_intercept(X) @ self.weights


class Metrics:
    """Regression quality metrics computed with NumPy."""

    @staticmethod
    def mean_squared_error(y_true, y_pred):
        """Return the mean of the squared differences ``y_true - y_pred``.

        Args:
            y_true: Observed target values (array-like).
            y_pred: Predicted target values, same length as ``y_true``.
        """
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        return np.mean((y_true - y_pred) ** 2)

    @staticmethod
    def r_squared(y_true, y_pred):
        """Return the coefficient of determination ``1 - SS_res / SS_tot``.

        Args:
            y_true: Observed target values (array-like).
            y_pred: Predicted target values, same length as ``y_true``.

        Returns:
            The R-squared score. For empty input the result is NaN. When
            ``y_true`` has exactly zero variance the ratio is undefined, so
            1.0 is returned for a perfect prediction and 0.0 otherwise.
        """
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        if y_true.size == 0:
            return float("nan")

        ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
        ss_residual = np.sum((y_true - y_pred) ** 2)
        if ss_total == 0:
            # Constant target: avoid 0/0 and x/0, which would give NaN / -inf.
            return 1.0 if ss_residual == 0 else 0.0
        return 1 - (ss_residual / ss_total)

class CrossValidation:
    """k-fold cross-validation of ``LinearRegressionModel``.

    Args:
        k: Number of folds.
        random_state: Seed used to shuffle the samples before splitting.
    """

    def __init__(self, k, random_state=42):
        self.k = k
        self.random_state = random_state

    def _fold_indices(self, n_samples):
        """Shuffle ``range(n_samples)`` and split it into ``k`` validation folds."""
        indices = np.arange(n_samples)
        # NOTE: this seeds NumPy's *global* RNG, so any later code that draws
        # from ``np.random`` inherits this state. Kept for that reason.
        np.random.seed(self.random_state)
        np.random.shuffle(indices)
        # array_split spreads any remainder over the first folds, so every
        # sample is validated exactly once even when n_samples % k != 0.
        return np.array_split(indices, self.k)

    def k_fold_cv(self, X, y):
        """Run k-fold cross-validation and average the fold scores.

        Args:
            X: Feature matrix, one row per sample.
            y: Target values, one per sample.

        Returns:
            Tuple ``(mean MSE, mean R-squared)`` over the ``k`` folds.

        Raises:
            ValueError: If ``k`` is smaller than 2 or larger than the number
                of samples (some fold would be empty).
        """
        n_samples = len(y)
        if not 2 <= self.k <= n_samples:
            raise ValueError(
                f"k must be between 2 and the number of samples ({n_samples}), got {self.k}"
            )

        X = np.asarray(X)
        y = np.asarray(y)
        all_indices = np.arange(n_samples)
        mse_scores = []
        r2_scores = []

        for val_indices in self._fold_indices(n_samples):
            train_indices = np.setdiff1d(all_indices, val_indices)

            X_train, X_val = X[train_indices], X[val_indices]
            y_train, y_val = y[train_indices], y[val_indices]

            model = LinearRegressionModel()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)

            mse_scores.append(Metrics.mean_squared_error(y_val, y_pred))
            r2_scores.append(Metrics.r_squared(y_val, y_pred))

        return np.mean(mse_scores), np.mean(r2_scores)


class Bootstrapping:
    """Out-of-bag bootstrap evaluation of ``LinearRegressionModel``.

    Args:
        n_iterations: Number of bootstrap resamples to draw.
        random_state: Optional seed for a private RNG. ``None`` (the default)
            draws from NumPy's global RNG, as the class always did.
    """

    def __init__(self, n_iterations, random_state=None):
        self.n_iterations = n_iterations
        self.random_state = random_state

    def bootstrap(self, X, y):
        """Fit on bootstrap resamples and score on the out-of-bag samples.

        Each iteration draws ``len(y)`` indices with replacement for training
        and validates on the indices that were never drawn. Iterations whose
        out-of-bag set is empty are skipped.

        Args:
            X: Feature matrix, one row per sample.
            y: Target values, one per sample.

        Returns:
            Tuple ``(mean MSE, mean R-squared)`` over the scored iterations,
            or ``(nan, nan)`` when no iteration could be scored.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(y)
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        all_indices = np.arange(n_samples)
        mse_scores = []
        r2_scores = []

        for _ in range(self.n_iterations):
            bootstrap_indices = rng.choice(n_samples, size=n_samples, replace=True)
            oob_indices = np.setdiff1d(all_indices, bootstrap_indices)

            if oob_indices.size == 0:
                continue

            X_train, X_val = X[bootstrap_indices], X[oob_indices]
            y_train, y_val = y[bootstrap_indices], y[oob_indices]

            model = LinearRegressionModel()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)

            mse_scores.append(Metrics.mean_squared_error(y_val, y_pred))
            r2_scores.append(Metrics.r_squared(y_val, y_pred))

        if not mse_scores:
            # Nothing was scored; say so explicitly instead of letting
            # np.mean([]) emit a RuntimeWarning.
            return float("nan"), float("nan")
        return np.mean(mse_scores), np.mean(r2_scores)


class ProfessorData:
    """Generate noisy samples of a linear function ``y = X @ m + b``.

    Args:
        m: Sequence of slopes, one per feature; its length sets the number
            of feature columns.
        N: Number of samples to generate.
        b: Intercept added to every target value.
        rnge: Two-element ``(low, high)`` bounds for the uniformly
            distributed features.
        scale: Standard deviation of the Gaussian noise added to the target.
        random_seed: Seed for the NumPy ``Generator`` used for sampling.
    """

    # The default for ``rnge`` is a tuple rather than a list so that it cannot
    # be mutated and silently shared between instances.
    def __init__(self, m, N, b, rnge=(-10, 10), scale=1, random_seed=8675309):
        self.scale = scale
        self.rnge = rnge
        self.m = m
        self.N = N
        self.b = b
        self.random_seed = random_seed

    def linear_data_generator(self):
        """Return ``(X, y)`` drawn with a freshly seeded generator.

        Because the generator is re-created from ``random_seed`` on every
        call, repeated calls return identical data.

        Returns:
            ``X`` of shape ``(N, len(m))`` and ``y`` of shape ``(N,)``.
        """
        rng = np.random.default_rng(seed=self.random_seed)
        low, high = self.rnge[0], self.rnge[1]
        sample = rng.uniform(low=low, high=high, size=(self.N, len(self.m)))

        # Column vector of slopes, so the product has shape (N, 1).
        m_reshaped = np.array(self.m).reshape(-1, 1)
        ys = np.dot(sample, m_reshaped) + self.b
        noise = rng.normal(loc=0., scale=self.scale, size=ys.shape)
        return sample, (ys + noise).flatten()


if __name__ == "__main__":
    # Demo: evaluate the linear model on ProfessorData output with two
    # resampling schemes and print the averaged scores.

    # 100 samples, 10 features (only the first three slopes are non-zero),
    # intercept 5 and noise scale 0.5.
    generator = ProfessorData(m=[1, -2, 3, 0, 0, 0, 0, 0, 0, 0], N=100, b=5, scale=0.5)
    X, y = generator.linear_data_generator()

    # 5-fold cross-validation. k_fold_cv seeds NumPy's global RNG, so this
    # step must stay ahead of the bootstrap step below.
    cv = CrossValidation(k=5)
    mse_cv, r2_cv = cv.k_fold_cv(X, y)
    print(f"Average k-Fold CV MSE: {mse_cv:.4f}")
    print(f"Average k-Fold CV R-Squared: {r2_cv:.4f}")

    # 100 out-of-bag bootstrap rounds. The draws come from NumPy's global RNG,
    # so the figures are reproducible only because the CV step seeded it first.
    bs = Bootstrapping(n_iterations=100)
    mse_bs, r2_bs = bs.bootstrap(X, y)
    print(f"Average Bootstrap MSE: {mse_bs:.4f}")
    print(f"Average Bootstrap R-Squared: {r2_bs:.4f}")


Average k-Fold CV MSE: 0.2281
Average k-Fold CV R-Squared: 0.9995
Average Bootstrap MSE: 0.2477
Average Bootstrap R-Squared: 0.9995
