In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, cross_validate

# Load the dataset
df = pd.read_csv("synthetic_regression_dataset.csv")

# Define features and target: every column except "Target" is a feature
X = df.drop(columns=["Target"])
y = df["Target"]

# Define the model
model = LinearRegression()

# Set up K-Fold cross-validation.
# shuffle=True with a fixed random_state makes the fold assignment
# reproducible from run to run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate the model using cross-validation.
# Both metrics (R^2 and negative MSE) are computed in a single pass with
# cross_validate, so each fold's model is fitted once instead of once per
# metric as separate cross_val_score calls would do.
cv_results = cross_validate(
    model,
    X,
    y,
    cv=kf,
    scoring={"r2": "r2", "neg_mse": "neg_mean_squared_error"},
)
r2_scores = cv_results["test_r2"]
neg_mse_scores = cv_results["test_neg_mse"]

# Print evaluation results.
# scikit-learn scorers follow a "higher is better" convention, so MSE is
# reported negated; flip the sign of the mean to recover the actual MSE.
print("Cross-Validation R^2 Scores:", r2_scores)
print("Average R^2 Score:", r2_scores.mean())
print("Cross-Validation MSE Scores (negative):", neg_mse_scores)
print("Average MSE:", -neg_mse_scores.mean())


Cross-Validation R^2 Scores: [0.98724345 0.98758471 0.98822989 0.98752566 0.98697429]
Average R^2 Score: 0.9875116009660385
Cross-Validation MSE Scores (negative): [-214.01808548 -217.27187725 -213.34904857 -209.87090827 -230.64397433]
Average MSE: 217.03077878077974


## Questions
1. What is the purpose of using KFold with n_splits=5 and shuffle=True in the cross-validation setup?
2. Why are both R^2 and negative mean squared error used as scoring metrics in the cross_val_score function?
3. How does the code calculate the average mean squared error (MSE) from the negative MSE scores obtained during cross-validation?