In [2]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

def k_fold_cross_validation(data, target, k=5):
    """
    Perform k-fold cross-validation on a dataset with linear regression.

    The rows are shuffled with a fixed seed (``random_state=42``) before being
    split into folds, so repeated calls on the same input use the same folds.
    For every fold a fresh ``LinearRegression`` is fitted on the training rows
    and scored by RMSE on the held-out rows. Per-fold scores and the summary
    are printed to stdout.

    :param data: Features of the dataset (numpy array or pandas DataFrame),
        one row per sample.
    :param target: Target values (numpy array or pandas Series), aligned
        row-by-row with ``data``.
    :param k: Number of folds for cross-validation (default is 5).
    :return: Mean and standard deviation of RMSE scores across folds, as a
        ``(mean_rmse, std_rmse)`` tuple. The standard deviation is the
        population one (``np.std`` with its default ``ddof=0``).
    :raises ValueError: If ``data`` and ``target`` have a different number of
        rows.
    """
    # KFold yields *positional* row indices. Indexing a DataFrame with
    # ``data[idx]`` selects columns by label and ``series[idx]`` looks up
    # index labels, so convert both inputs to plain arrays first; numpy
    # inputs pass through unchanged.
    features = np.asarray(data)
    targets = np.asarray(target)

    if len(features) != len(targets):
        raise ValueError(
            f"data and target must have the same number of rows, "
            f"got {len(features)} and {len(targets)}"
        )

    # Initialize KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # RMSE of each fold, in fold order
    rmse_scores = []

    for fold, (train_index, test_index) in enumerate(kf.split(features), start=1):
        # Split data into training and validation sets
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = targets[train_index], targets[test_index]

        # A new model per fold so no fitted state carries over between folds
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Make predictions on the validation set
        predictions = model.predict(X_test)

        # Calculate RMSE for this fold
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        rmse_scores.append(rmse)

        print(f"Fold {fold}: RMSE = {rmse:.4f}")

    # Calculate mean and standard deviation of RMSE
    mean_rmse = np.mean(rmse_scores)
    std_rmse = np.std(rmse_scores)

    print("\nCross-Validation Results:")
    print(f"Mean RMSE: {mean_rmse:.4f}")
    print(f"Standard Deviation of RMSE: {std_rmse:.4f}")

    return mean_rmse, std_rmse

# Example: cross-validating on the California Housing dataset
from sklearn.datasets import fetch_california_housing

# Fetch the dataset object; it exposes the features and target as attributes
housing = fetch_california_housing()

# Feature matrix and target vector
X, y = housing.data, housing.target
print(X.shape, y.shape)

# Run 5-fold cross-validation. Left as the final bare expression so the
# notebook displays the returned (mean, std) tuple.
k_fold_cross_validation(X, y, k=5)


(20640, 8) (20640,)
Fold 1: RMSE = 0.7456
Fold 2: RMSE = 0.7264
Fold 3: RMSE = 0.7136
Fold 4: RMSE = 0.7105
Fold 5: RMSE = 0.7451

Cross-Validation Results:
Mean RMSE: 0.7283
Standard Deviation of RMSE: 0.0149


(0.7282509142479741, 0.014935522116012871)