# Linear Regression Functions

In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold

In [3]:
#@title Figure settings
# Notebook-only cell: the two `%` lines are IPython magics, not plain Python.
# Draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for 'retina' (high-resolution) figure output.
%config InlineBackend.figure_format = 'retina'
# Apply the Neuromatch Academy style sheet, loaded from a GitHub URL.
plt.style.use("https://raw.githubusercontent.com/NeuromatchAcademy/course-content/master/nma.mplstyle")

## **Training and Fitting**

In [4]:
def train_linear_regression(X_train, y_train):
    """
    Fit a scikit-learn LinearRegression estimator on the training data.

    Parameters:
    X_train (numpy array): Feature matrix used for fitting.
    y_train (numpy array): Target vector used for fitting.

    Returns:
    model: The fitted LinearRegression estimator.
    """
    # fit() hands back the estimator itself, so build-and-fit can be chained.
    return LinearRegression().fit(X_train, y_train)

def predict(model, X_test):
    """
    Produce predictions for the given features from a fitted model.

    Parameters:
    model: Fitted model exposing a ``predict`` method.
    X_test (numpy array): Feature matrix to predict on.

    Returns:
    y_pred (numpy array): Whatever ``model.predict(X_test)`` returns.
    """
    return model.predict(X_test)

## **Evaluation Methods**

In [5]:
def split_data(X, y, test_size=0.2, random_state=42):
    """
    Partition features and targets into train and test subsets.

    Parameters:
    X (numpy array): Feature matrix.
    y (numpy array): Target vector.
    test_size (float): Proportion of the dataset placed in the test split.
    random_state (int): Seed forwarded to train_test_split.

    Returns:
    tuple: (X_train, X_test, y_train, y_test).
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    # train_test_split returns a list; callers get a 4-tuple.
    return tuple(train_test_split(X, y, **split_options))

def evaluate_model_mse(y_test, y_pred):
    """
    Score predictions against the true targets with MSE and R-squared.

    Parameters:
    y_test (numpy array): True target vector.
    y_pred (numpy array): Predicted target vector.

    Returns:
    tuple: (mse, r2) -- the Mean Squared Error followed by the R-squared
    value, in that order.
    """
    return mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

def kfold_cross_validation(X, y, k=10):
    """
    Performs k-fold cross-validation for linear regression.

    The data is shuffled with a fixed seed (42), split into ``k`` folds, and
    for each fold a model is fitted on the remaining folds and scored on the
    held-out fold with R^2.

    Parameters:
    X (array-like): Feature matrix.
    y (array-like): Target vector.
    k (int): Number of folds. Default is 10.

    Returns:
    float: Mean R^2 score across folds.
    """
    # Folds are selected by positional index arrays, which requires NumPy
    # arrays; converting here also lets callers pass lists or DataFrames.
    X = np.asarray(X)
    y = np.asarray(y)

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    r2_scores = []

    for train_index, test_index in kf.split(X):
        model = train_linear_regression(X[train_index], y[train_index])

        # train_linear_regression returns a fitted estimator, not a weight
        # vector, so predictions come from the estimator's own predict().
        y_pred = model.predict(X[test_index])

        r2_scores.append(r2_score(y[test_index], y_pred))

    return np.mean(r2_scores)