## Q1: K-Fold Cross Validation for Multiple Linear Regression

In [7]:

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.decomposition import PCA

df = pd.read_csv("USA_Housing.csv")
print(df.columns)
X = df.drop(columns=['Price'])
y = df['Price']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores = []
coefs = []

for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    coefs.append(model.coef_)

r2_scores, np.mean(r2_scores)



Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price'],
      dtype='object')


([0.9179971706985147,
  0.9145677884802819,
  0.9116116385364478,
  0.9193091764960816,
  0.9243869413350316],
 np.float64(0.9175745431092714))

## Q2: Validation Set for Multiple Linear Regression using Gradient Descent

In [8]:

X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.44, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=30/44, random_state=42)

learning_rates = [0.001, 0.01, 0.1, 1]
results = {}

for lr in learning_rates:
    sgd = SGDRegressor(max_iter=1000, eta0=lr, learning_rate='constant', penalty=None, random_state=42)
    sgd.fit(X_train, y_train)
    y_val_pred = sgd.predict(X_val)
    y_test_pred = sgd.predict(X_test)
    r2_val = r2_score(y_val, y_val_pred)
    r2_test = r2_score(y_test, y_test_pred)
    results[lr] = {
        "coefficients": sgd.coef_,
        "val_R2": r2_val,
        "test_R2": r2_test
    }

results


{0.001: {'coefficients': array([231091.21597604, 165206.13447671, 118549.75471073,   1909.04903299,
         152032.00685444]),
  'val_R2': 0.9200819101019124,
  'test_R2': 0.9131986101163132},
 0.01: {'coefficients': array([215186.76086019, 168639.91875574, 117859.63496216,  16619.91282014,
         159731.30385266]),
  'val_R2': 0.9143839741460056,
  'test_R2': 0.9100609895521061},
 0.1: {'coefficients': array([277744.85026558, 144237.01668131, 148253.7083827 ,   2560.08496823,
         156927.53459416]),
  'val_R2': 0.889345887368122,
  'test_R2': 0.8864945907011517},
 1: {'coefficients': array([-6.21098064e+11, -1.25286847e+12,  1.19821563e+12, -8.62686101e+11,
          2.06818583e+12]),
  'val_R2': -122806747379446.52,
  'test_R2': -108816578308372.25}}

## Q3: Preprocessing and Multiple Linear Regression on Car Dataset

In [10]:


cols = ["symboling","normalized_losses","make","fuel_type","aspiration","num_doors","body_style","drive_wheels","engine_location",
        "wheel_base","length","width","height","curb_weight","engine_type","num_cylinders","engine_size","fuel_system","bore","stroke",
        "compression_ratio","horsepower","peak_rpm","city_mpg","highway_mpg","price"]
df_car = pd.read_csv("imports-85.txt", names=cols, na_values=["?"])
df_car.columns = cols



In [11]:
df_car.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [12]:
df_car = df_car.dropna(subset=["price"])
df_car.fillna(df_car.median(numeric_only=True), inplace=True)

In [13]:
from sklearn.preprocessing import LabelEncoder

df_car["num_doors"] = df_car["num_doors"].replace({"two":2, "four":4})
df_car["num_cylinders"] = df_car["num_cylinders"].replace(
    {"two":2,"three":3,"four":4,"five":5,"six":6,"eight":8,"twelve":12}
)

df_car = pd.get_dummies(df_car, columns=["body_style","drive_wheels"])

le = LabelEncoder()
for col in ["make","aspiration","engine_location","fuel_type"]:
    df_car[col] = le.fit_transform(df_car[col].astype(str))

df_car["fuel_system"] = df_car["fuel_system"].astype(str).apply(lambda x: 1 if "pfi" in x else 0)
df_car["engine_type"] = df_car["engine_type"].astype(str).apply(lambda x: 1 if "ohc" in x else 0)


  df_car["num_doors"] = df_car["num_doors"].replace({"two":2, "four":4})
  df_car["num_cylinders"] = df_car["num_cylinders"].replace(


In [14]:
from sklearn.preprocessing import StandardScaler

X = df_car.drop(columns=["price"])
y = df_car["price"]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [15]:
print(X_scaled)
print(y.values)

[[ 1.72504964 -0.17830627 -1.97171697 ... -0.20359464 -1.19234506
   1.29614814]
 [ 1.72504964 -0.17830627 -1.97171697 ... -0.20359464 -1.19234506
   1.29614814]
 [ 0.1271926  -0.17830627 -1.97171697 ... -0.20359464 -1.19234506
   1.29614814]
 ...
 [-1.47066444 -0.80268884  1.39503761 ... -0.20359464 -1.19234506
   1.29614814]
 [-1.47066444 -0.80268884  1.39503761 ... -0.20359464 -1.19234506
   1.29614814]
 [-1.47066444 -0.80268884  1.39503761 ... -0.20359464 -1.19234506
   1.29614814]]
[13495. 16500. 16500. 13950. 17450. 15250. 17710. 18920. 23875. 16430.
 16925. 20970. 21105. 24565. 30760. 41315. 36880.  5151.  6295.  6575.
  5572.  6377.  7957.  6229.  6692.  7609.  8558.  8921. 12964.  6479.
  6855.  5399.  6529.  7129.  7295.  7295.  7895.  9095.  8845. 10295.
 12945. 10345.  6785. 11048. 32250. 35550. 36000.  5195.  6095.  6795.
  6695.  7395. 10945. 11845. 13645. 15645.  8845.  8495. 10595. 10245.
 10795. 11245. 18280. 18344. 25552. 28248. 28176. 31600. 34184. 35056.
 40960. 454

In [17]:
X = pd.DataFrame(X_scaled) 
y = pd.to_numeric(y, errors='coerce')
y = y.fillna(np.nan).dropna()

In [19]:
X = X.apply(pd.to_numeric, errors='coerce')  # Convert all to numbers, invalid → NaN
X = X.fillna(0)

In [20]:
y = pd.to_numeric(y, errors='coerce')

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print("R² Score:", r2_score(y_test, y_pred))


R² Score: 0.8731212629008539


In [22]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train)
y_pred_pca = lr_pca.predict(X_test_pca)

print("Original R² Score:", r2_score(y_test, y_pred))
print("PCA Reduced R² Score:", r2_score(y_test, y_pred_pca))

Original R² Score: 0.8731212629008539
PCA Reduced R² Score: 0.8491442176401741
