In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

In [19]:
# Read the housing data from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='USA_Housing.csv')

# Preview the first five rows as the cell's displayed output.
df.head(n=5)

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5


In [20]:
# Target vector: the "Price" column as a NumPy array.
y = df["Price"].to_numpy()

# Feature matrix: every remaining column, also as a NumPy array.
x = df.drop(columns="Price").to_numpy()

In [21]:
# Learn the per-column scaling statistics from all rows of x, then apply them.
# NOTE(review): the scaler sees every row, i.e. it is fitted before any
# train/test split is made.
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)

In [22]:
# 5-fold splitter; rows are shuffled with a fixed seed so the folds are
# the same on every run.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [23]:
# Per-fold accumulators: one R2 score and one coefficient vector per fold.
r2_scores, beta_list = [], []


In [24]:
# Reset the accumulators in this cell so that re-running it does not append a
# second set of fold results to lists left over from a previous run (which
# would also change what np.argmax over r2_scores selects).
r2_scores = []
beta_list = []

# Fit an ordinary-least-squares model on each training fold and score it on
# the corresponding held-out fold.
for train_index, test_index in kf.split(x_scaled):
    x_train, x_test = x_scaled[train_index], x_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Prepend a column of ones so the first coefficient acts as the intercept.
    x_train_bias = np.c_[np.ones(x_train.shape[0]), x_train]
    x_test_bias = np.c_[np.ones(x_test.shape[0]), x_test]

    # Solve min ||X b - y||^2 directly. This gives the same solution as
    # inv(X^T X) X^T y when X^T X is invertible, but avoids forming an explicit
    # inverse, which is less accurate and raises LinAlgError when X^T X is
    # singular (e.g. perfectly collinear features).
    beta, *_ = np.linalg.lstsq(x_train_bias, y_train, rcond=None)
    beta_list.append(beta)

    # Predict on the held-out fold and record its R2 score.
    yPred = x_test_bias.dot(beta)
    r2 = r2_score(y_test, yPred)
    r2_scores.append(r2)
    

In [26]:
# Position of the fold with the highest R2 score, and that fold's coefficients.
# NOTE(review): "best" is chosen using the held-out fold's own score, so the
# reported best R2 is an optimistic figure.
best_index = np.asarray(r2_scores).argmax()
best_beta = beta_list[best_index]

In [27]:
# Print all fold scores, then the best score and the coefficients from the same fold.
for label, value in (
    ("R2 scores for each fold:", r2_scores),
    ("Best R2 score:", r2_scores[best_index]),
    ("Best β matrix:\n", best_beta),
):
    print(label, value)

R2 scores for each fold: [0.9179971706985147, 0.9145677884802818, 0.9116116385364478, 0.9193091764960816, 0.9243869413350316]
Best R2 score: 0.9243869413350316
Best β matrix:
 [1.23161736e+06 2.30225051e+05 1.63956839e+05 1.21115120e+05
 7.83467170e+02 1.50662447e+05]
