In [9]:
# Google Colab Setup
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

#Q1
df_usahouse = pd.read_csv('/content/drive/MyDrive/USA_Housing.csv')
# a)
X = df_usahouse.drop('Price', axis=1).values
y = df_usahouse['Price'].values
# b)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# c)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores = []
beta_matrices = []
y_preds_folds = []
for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
    beta = np.linalg.inv(X_train_b.T.dot(X_train_b)).dot(X_train_b.T).dot(y_train)
    y_pred = X_test_b.dot(beta)

    # R2 Score
    score = r2_score(y_test, y_pred)
    r2_scores.append(score)
    beta_matrices.append(beta)
    y_preds_folds.append(y_pred)

print("R2 scores for each fold:", r2_scores)
best_index = np.argmax(r2_scores)
print("Best Fold index (max R2):", best_index)

# e) Use best beta to train on 70% and test on 30%
X_train_70, X_test_30, y_train_70, y_test_30 = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

X_train_70_b = np.c_[np.ones((X_train_70.shape[0], 1)), X_train_70]
X_test_30_b = np.c_[np.ones((X_test_30.shape[0], 1)), X_test_30]

best_beta = beta_matrices[best_index]
y_pred_70_30 = X_test_30_b.dot(best_beta)
test_r2 = r2_score(y_test_30, y_pred_70_30)

print("Test R2 score on 30% data using best beta:", test_r2)

# Q2
def gradient_descent(X, y, learning_rate=0.01, epochs=1000):
    m, n = X.shape
    theta = np.zeros(n)
    r2_scores = []

    for epoch in range(epochs):
        y_pred = X.dot(theta)
        error = y_pred - y
        gradient = (2 / m) * X.T.dot(error)
        theta = theta - learning_rate * gradient
        score = 1 - (np.sum(error**2) / np.sum((y - np.mean(y))**2))
        r2_scores.append(score)
    return theta, r2_scores

for lr in [0.01, 0.1, 0.001]:
    print(f"Learning rate: {lr}")
    theta, scores = gradient_descent(X_train_70_b, y_train_70, learning_rate=lr, epochs=1000)
    print(f"Final R2 score on training data after 1000 epochs: {scores[-1]}")
y_test_pred = X_test_30_b.dot(theta)
print("Test R2 score using GD:", r2_score(y_test_30, y_test_pred))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
R2 scores for each fold: [0.9179971706985147, 0.9145677884802818, 0.9116116385364478, 0.9193091764960816, 0.9243869413350316]
Best Fold index (max R2): 4
Test R2 score on 30% data using best beta: 0.9147458156636434
Learning rate: 0.01
Final R2 score on training data after 1000 epochs: 0.9192986578820134
Learning rate: 0.1
Final R2 score on training data after 1000 epochs: 0.9192986579053273
Learning rate: 0.001
Final R2 score on training data after 1000 epochs: 0.6786104877965986
Test R2 score using GD: 0.6435914890564657


In [10]:
# Question 3: Car Price Prediction (Preprocessing + Regression + PCA)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
columns = ["symboling", "normalizedlosses", "make", "fueltype", "aspiration",
           "numdoors", "bodystyle", "drivewheels", "enginelocation", "wheelbase", "length", "width", "height", "curbweight",
           "enginetype", "numcylinders", "enginesize", "fuelsystem", "bore", "stroke", "compressionratio", "horsepower",
           "peakrpm", "citympg", "highwaympg", "price"]
data = pd.read_csv(url, names=columns, na_values='?')

data['price'] = pd.to_numeric(data['price'], errors='coerce')
data = data.dropna(subset=['price'])
for col in data.columns:
    if data[col].dtype == 'O':
        mode = data[col].mode()
        if not mode.empty:
            data[col] = data[col].fillna(mode[0])
    else:
        data[col] = data[col].fillna(data[col].mean())

doors_dict = {'two':2, 'four':4}
cyl_dict = {'two':2, 'three':3, 'four':4, 'five':5, 'six':6, 'eight':8, 'twelve':12}
data['numdoors'] = data['numdoors'].map(doors_dict)
data['numcylinders'] = data['numcylinders'].map(cyl_dict)

data = pd.get_dummies(data, columns=['bodystyle', 'drivewheels'], drop_first=True)

for col in ['make', 'aspiration', 'enginelocation', 'fueltype']:
    data[col] = LabelEncoder().fit_transform(data[col].astype(str))

data['fuelsystem'] = data['fuelsystem'].apply(lambda x: 1 if 'pfi' in str(x) else 0)
data['enginetype'] = data['enginetype'].apply(lambda x: 1 if 'ohc' in str(x) else 0)

X = data.drop('price', axis=1).values
y = data['price'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
result1 = r2_score(y_test, y_pred)
print("R2 score without PCA:", result1)

pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

model_pca = LinearRegression()
model_pca.fit(X_train_pca, y_train)
y_pred_pca = model_pca.predict(X_test_pca)
result2 = r2_score(y_test, y_pred_pca)
print("R2 score with PCA:", result2)


R2 score without PCA: 0.8732775682086293
R2 score with PCA: 0.8567121623341896
