In [6]:
# Q1

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

# Part (a) Load dataset and separate input/output
df = pd.read_csv("C:/Users/kanav/Downloads/USA_Housing.csv")
X = df.drop(columns=["Price"])
y = df["Price"].values.reshape(-1, 1)
print(X)
print(y)

# Part (b) Scale the input features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)

# Part (c) Create folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)
print(kf)

# Part (d) Perform 5-fold cross validation
best_beta = None
best_r2 = -np.inf

fold = 1
for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    X_train_b = np.c_[np.ones(X_train.shape[0]), X_train]
    X_test_b = np.c_[np.ones(X_test.shape[0]), X_test]
    
    beta = np.linalg.inv(X_train_b.T @ X_train_b) @ (X_train_b.T @ y_train)
    
    y_pred = X_test_b @ beta
    r2 = r2_score(y_test, y_pred)
    
    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta
    
    fold += 1

# Part (e) Train final model using best beta
X_b = np.c_[np.ones(X_scaled.shape[0]), X_scaled]
final_predictions = X_b @ best_beta

print("Best R2 Score:", best_r2)
print("Best Beta Matrix:\n", best_beta)


      Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms  \
0          79545.45857             5.682861                   7.009188   
1          79248.64245             6.002900                   6.730821   
2          61287.06718             5.865890                   8.512727   
3          63345.24005             7.188236                   5.586729   
4          59982.19723             5.040555                   7.839388   
...                ...                  ...                        ...   
4995       60567.94414             7.830362                   6.137356   
4996       78491.27543             6.999135                   6.576763   
4997       63390.68689             7.250591                   4.805081   
4998       68001.33124             5.534388                   7.130144   
4999       65510.58180             5.992305                   6.792336   

      Avg. Area Number of Bedrooms  Area Population  
0                             4.09      23086.80050  
1  

In [22]:
#Q2
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split


df = pd.read_csv("C:/Users/kanav/Downloads/USA_Housing.csv")

X = df.drop(['Price'], axis=1).values   # <-- Correct column name
y = df['Price'].values.reshape(-1, 1)   # <-- Correct column name


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X_temp, X_test, y_temp, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.20, random_state=42)


X_train_b = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_val_b = np.hstack([np.ones((X_val.shape[0], 1)), X_val])
X_test_b = np.hstack([np.ones((X_test.shape[0], 1)), X_test])


def gradient_descent(X, y, lr, n_iter):
    m, n = X.shape
    beta = np.zeros((n, 1))
    for _ in range(n_iter):
        y_pred = X @ beta
        error = y_pred - y
        grad = (X.T @ error) / m
        beta -= lr * grad
    return beta


lrs = [0.001, 0.01, 0.1, 1]
best_r2_val = -np.inf
best_beta = None
best_lr = None

for lr in lrs:
    beta = gradient_descent(X_train_b, y_train, lr, 1000)

    y_pred_val = X_val_b @ beta
    y_pred_test = X_test_b @ beta

    r2_val = r2_score(y_val, y_pred_val)
    r2_test = r2_score(y_test, y_pred_test)

    print(f"Learning rate: {lr}")
    print(f"Validation R2: {r2_val:.4f}")
    print(f"Test R2: {r2_test:.4f}")
    print(f"Beta: {beta.ravel()}\n")

    if r2_val > best_r2_val:
        best_r2_val = r2_val
        best_beta = beta
        best_lr = lr

# Final best model
y_pred_best_test = X_test_b @ best_beta
best_test_r2 = r2_score(y_test, y_pred_best_test)

print("Best validation R2:", best_r2_val)
print("Best learning rate:", best_lr)
print("Best beta:", best_beta.ravel())
print("Best Test R2 with best beta:", best_test_r2)


Learning rate: 0.001
Validation R2: -0.8125
Test R2: -0.9914
Beta: [778829.54135369 145453.10988562 101705.72638779  81088.52016625
  32930.95853355  88928.37718239]

Learning rate: 0.01
Validation R2: 0.9098
Test R2: 0.9147
Beta: [1232562.51254919  230048.76664688  163686.93503606  121406.94107918
    3117.47363933  150655.97459714]

Learning rate: 0.1
Validation R2: 0.9098
Test R2: 0.9148
Beta: [1232618.32011841  230067.9889464   163710.33259401  121681.42752283
    2832.15066521  150657.52262836]

Learning rate: 1
Validation R2: 0.9098
Test R2: 0.9148
Beta: [1232618.32011841  230067.9889464   163710.33259401  121681.42752284
    2832.15066521  150657.52262836]

Best validation R2: 0.9098183094422969
Best learning rate: 0.01
Best beta: [1232562.51254919  230048.76664688  163686.93503606  121406.94107918
    3117.47363933  150655.97459714]
Best Test R2 with best beta: 0.9147434800538763


In [30]:
#Q3
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

column_names = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration", "num_doors", "body_style",
    "drive_wheels", "engine_location", "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke",
    "compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"
]

df = pd.read_csv("C:/Users/kanav/Downloads/imports-85.data.txt", 
                 names=column_names, na_values='?')

for col in df.columns:
    if df[col].dtype == 'O':
        df[col] = df[col].fillna(df[col].mode()[0])
    else:
        df[col] = df[col].fillna(df[col].median())

df.dropna(subset=['price'], inplace=True)
df['price'] = pd.to_numeric(df['price']) 

doors_map = {'two': 2, 'four': 4}
cyl_map = {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12}
df['num_doors'] = df['num_doors'].map(doors_map).astype(int)
df['num_cylinders'] = df['num_cylinders'].map(cyl_map).astype(int)

df = pd.get_dummies(df, columns=['body_style', 'drive_wheels'])

for col in ['make', 'aspiration', 'engine_location', 'fuel_type']:
    df[col] = LabelEncoder().fit_transform(df[col])

df['fuel_system'] = df['fuel_system'].apply(lambda x: 1 if 'pfi' in x else 0)
df['engine_type'] = df['engine_type'].apply(lambda x: 1 if 'ohc' in x else 0)

for col in df.columns:
    if df[col].dtype == 'O':
        df[col] = pd.to_numeric(df[col], errors='coerce')

X = df.drop(['price'], axis=1).values
y = df['price'].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
r2_initial = r2_score(y_test, y_pred)
print("R2 score without PCA:", r2_initial)

pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pca, y, test_size=0.3, random_state=42
)

lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train_pca)
y_pred_pca = lr_pca.predict(X_test_pca)
r2_pca = r2_score(y_test_pca, y_pred_pca)
print("R2 score with PCA:", r2_pca)


R2 score without PCA: 0.7962231220908706
R2 score with PCA: 0.7579526472850189
