In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score


# Load the dataset
data = pd.read_csv("USA_Housing.csv")     # dataset from the given link

# Separate input and output variables.
# y is kept as a column vector so every fitted beta has shape (n_features + 1, 1).
X = data.drop("Price", axis=1).values
y = data["Price"].values.reshape(-1, 1)

# Scale input features for better numerical stability.
# NOTE(review): the scaler is fitted on all rows, including each fold's
# held-out rows. For ordinary least squares with an intercept the predictions
# do not depend on the centring/scaling constants, so the fold scores below are
# unaffected; for any other estimator, fit the scaler on the training fold only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define 5-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

r2_scores = []
beta_values = []

# Perform 5-Fold CV manually
for fold, (train_index, test_index) in enumerate(kf.split(X_scaled), start=1):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Add a column of ones to include intercept term
    X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]

    # Least-squares solution of X_train_b @ beta ≈ y_train.
    # lstsq returns the same beta as (XᵀX)⁻¹ Xᵀy when XᵀX is invertible, but
    # avoids forming and inverting XᵀX explicitly, which loses precision and
    # raises on a singular (collinear) design matrix.
    beta, *_ = np.linalg.lstsq(X_train_b, y_train, rcond=None)

    # Predict values for the test fold
    y_pred = X_test_b @ beta

    # Calculate R² score for this fold
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    beta_values.append(beta)

    print(f"Fold {fold}: R² Score = {r2:.4f}")

# Identify best model (the fold whose held-out R² is highest)
best_index = np.argmax(r2_scores)
best_beta = beta_values[best_index]

print("\nBest Fold:", best_index + 1)
print("Highest R² Score:", r2_scores[best_index])
print("Best Beta Matrix:\n", best_beta)

# Use best beta on full data.
# NOTE: the full dataset contains the rows this beta was trained on, so this
# score is not an out-of-sample estimate.
X_full_b = np.c_[np.ones((X_scaled.shape[0], 1)), X_scaled]
y_pred_full = X_full_b @ best_beta
final_r2 = r2_score(y, y_pred_full)

print("\nFinal R² Score on entire dataset:", final_r2)


Fold 1: R² Score = 0.9180
Fold 2: R² Score = 0.9146
Fold 3: R² Score = 0.9116
Fold 4: R² Score = 0.9193
Fold 5: R² Score = 0.9244

Best Fold: 5
Highest R² Score: 0.9243869413350317
Best Beta Matrix:
 [[1.23161736e+06]
 [2.30225051e+05]
 [1.63956839e+05]
 [1.21115120e+05]
 [7.83467170e+02]
 [1.50662447e+05]]

Final R² Score on entire dataset: 0.918005653868843


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Load and prepare data
data = pd.read_csv("USA_Housing.csv")

X = data.drop("Price", axis=1).values
y = data["Price"].values.reshape(-1, 1)

# Step 2: Split dataset into Train (56%), Validation (14%), Test (30%)
# The split is made on the raw features so that the scaler below can be fitted
# on the training rows only. Sizes and random_state are unchanged.
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(X, y, test_size=0.44, random_state=42)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(X_temp_raw, y_temp, test_size=(30/44), random_state=42)

# Scale the input features.
# Fit on the training split only: fitting on all rows would let the mean and
# standard deviation of the validation/test rows leak into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)

# Kept under their previous names for any later cell that refers to them;
# both use the training-split statistics.
X_temp = scaler.transform(X_temp_raw)
X_scaled = scaler.transform(X)

print("Train size:", X_train.shape[0])
print("Validation size:", X_val.shape[0])
print("Test size:", X_test.shape[0])

# --------------------------------------------------
# Step 3: Add bias term (intercept)
# --------------------------------------------------
X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_val_b = np.c_[np.ones((X_val.shape[0], 1)), X_val]
X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]

# --------------------------------------------------
# Step 4: Gradient Descent Function
# --------------------------------------------------
def gradient_descent(X, y, alpha, iterations):
    """Fit linear-regression coefficients by batch gradient descent on the MSE.

    Starting from all-zero coefficients, performs ``iterations`` updates
    ``beta <- beta - alpha * (2 / n_samples) * X.T @ (X @ beta - y)``.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Design matrix. No intercept column is added here; the caller must
        include one if an intercept is wanted.
    y : array of shape (n_samples, 1) or (n_samples,)
        Target values. A 1-D array is treated as a single column.
    alpha : float
        Learning rate (step size).
    iterations : int
        Number of update steps; 0 returns the all-zero starting point.

    Returns
    -------
    beta : ndarray
        Coefficients after the last update; shape (n_features, 1) for a
        single target column.
    """
    # Without this, a 1-D y of shape (n,) broadcasts against the (n, 1)
    # prediction into an (n, n) "error" matrix and the result is meaningless.
    if np.ndim(y) == 1:
        y = np.asarray(y).reshape(-1, 1)

    n_samples, n_features = X.shape
    beta = np.zeros((n_features, 1))

    for _ in range(iterations):
        y_pred = X @ beta
        error = y_pred - y
        # Gradient of mean((X @ beta - y) ** 2) with respect to beta.
        gradients = (2 / n_samples) * (X.T @ error)
        beta = beta - alpha * gradients
    return beta

# --------------------------------------------------
# Step 5: Try different learning rates
# --------------------------------------------------
learning_rates = [0.001, 0.01, 0.1, 1]
r2_scores = []
betas = []

for lr in learning_rates:
    # Too large a step makes the iterates grow without bound. Silence the
    # floating-point overflow warnings and handle that outcome explicitly.
    with np.errstate(over="ignore", invalid="ignore"):
        beta = gradient_descent(X_train_b, y_train, alpha=lr, iterations=1000)
        y_val_pred = X_val_b @ beta

        if np.all(np.isfinite(y_val_pred)):
            r2 = r2_score(y_val, y_val_pred)
        else:
            # r2_score rejects inf/NaN inputs with a ValueError, which would
            # abort the whole search; rank a diverged run last instead.
            r2 = -np.inf
            print(f"Learning rate = {lr}: gradient descent diverged (non-finite predictions)")

    # A NaN score would be picked by argmax; treat it as a failed run too.
    if np.isnan(r2):
        r2 = -np.inf

    r2_scores.append(r2)
    betas.append(beta)
    print(f"Learning rate = {lr}, R² on validation set = {r2:.4f}")

# Pick best learning rate (max R²)
best_index = np.argmax(r2_scores)
if not np.isfinite(r2_scores[best_index]):
    raise RuntimeError(
        "Gradient descent diverged for every learning rate; try smaller values."
    )
best_lr = learning_rates[best_index]
best_beta = betas[best_index]

print("\nBest Learning Rate:", best_lr)
print("Highest Validation R²:", r2_scores[best_index])
print("Best Beta Coefficients:\n", best_beta)

# Evaluate on test set using best beta
y_test_pred = X_test_b @ best_beta
final_r2 = r2_score(y_test, y_test_pred)

print("\nR² Score on Test Set:", final_r2)
