In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA

## Q1 - Step a: Load USA House Price Dataset and Split Features/Target


In [2]:
# Load dataset
url_house = "https://drive.google.com/uc?id=1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX"
house_data = pd.read_csv(url_house)

# Input features (all except Price)
X = house_data.drop('Price', axis=1).values
y = house_data['Price'].values.reshape(-1,1)

print("Input shape:", X.shape)
print("Output shape:", y.shape)

Input shape: (5000, 5)
Output shape: (5000, 1)


## Q1 - Step b: Scale Input Features


In [3]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Q1 - Step c & d: 5-Fold Cross Validation using Least Squares Fit


In [4]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

r2_scores = []
beta_list = []

for i, (train_index, test_index) in enumerate(kf.split(X_scaled)):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Add bias term
    X_train_bias = np.hstack((np.ones((X_train.shape[0],1)), X_train))
    X_test_bias = np.hstack((np.ones((X_test.shape[0],1)), X_test))

    # Least Squares: β = (X^T X)^(-1) X^T y
    beta = np.linalg.inv(X_train_bias.T @ X_train_bias) @ (X_train_bias.T @ y_train)

    # Predict
    y_pred = X_test_bias @ beta
    r2 = r2_score(y_test, y_pred)

    r2_scores.append(r2)
    beta_list.append(beta)

    print(f"Fold {i+1}: R² score = {r2:.4f}")

Fold 1: R² score = 0.9180
Fold 2: R² score = 0.9146
Fold 3: R² score = 0.9116
Fold 4: R² score = 0.9193
Fold 5: R² score = 0.9244


## Q1 - Step e: Select Best Beta and Train on Full Dataset


In [5]:
best_index = np.argmax(r2_scores)
best_beta = beta_list[best_index]
print(f"Best R2_score (Fold): {r2_scores[best_index]:.4f}")

# Train on full dataset using best beta
X_bias_full = np.hstack((np.ones((X_scaled.shape[0],1)), X_scaled))
y_pred_full = X_bias_full @ best_beta
final_r2 = r2_score(y, y_pred_full)
print(f"R2 score on full dataset: {final_r2:.4f}")

Best R2_score (Fold): 0.9244
R2 score on full dataset: 0.9180


## Q2 - Step 1: Split Dataset into Training (56%), Validation (14%), Test (30%)


In [6]:
# Split train_temp (70%) and test (30%)
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# Split train_temp into train (56%) and validation (14%)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, test_size=0.2, random_state=42
)

print("Train set:", X_train.shape)
print("Validation set:", X_val.shape)
print("Test set:", X_test.shape)

Train set: (2800, 5)
Validation set: (700, 5)
Test set: (1500, 5)


## Q2 - Step 2: Define Gradient Descent Function


In [7]:
def gradient_descent(X, y, lr=0.01, iterations=1000):
    m, n = X.shape
    beta = np.zeros((n,1))

    for _ in range(iterations):
        y_pred = X @ beta
        gradient = (1/m) * (X.T @ (y_pred - y))
        beta -= lr * gradient

    return beta

## Q2 - Step 3: Add Bias Term to Input Features


In [8]:
X_train_bias = np.hstack((np.ones((X_train.shape[0],1)), X_train))
X_val_bias = np.hstack((np.ones((X_val.shape[0],1)), X_val))
X_test_bias = np.hstack((np.ones((X_test.shape[0],1)), X_test))

## Q2 - Step 4: Train Using Gradient Descent for Different Learning Rates


In [9]:
learning_rates = [0.001, 0.01, 0.1, 1]
results = []

for lr in learning_rates:
    beta = gradient_descent(X_train_bias, y_train, lr=lr, iterations=1000)

    y_val_pred = X_val_bias @ beta
    y_test_pred = X_test_bias @ beta

    r2_val = r2_score(y_val, y_val_pred)
    r2_test = r2_score(y_test, y_test_pred)

    results.append({'learning_rate': lr, 'beta': beta, 'R2_val': r2_val, 'R2_test': r2_test})

    print(f"Learning rate: {lr}, R² val: {r2_val:.4f}, R² test: {r2_test:.4f}")

Learning rate: 0.001, R² val: -0.8125, R² test: -0.9914
Learning rate: 0.01, R² val: 0.9098, R² test: 0.9147
Learning rate: 0.1, R² val: 0.9098, R² test: 0.9148
Learning rate: 1, R² val: 0.9098, R² test: 0.9148


## Q2 - Step 5: Find Best Learning Rate Based on Validation Set


In [10]:
best_result = max(results, key=lambda x: x['R2_val'])
print("Best learning rate:", best_result['learning_rate'])
print("Validation R²:", best_result['R2_val'])
print("Test R²:", best_result['R2_test'])
print("Regression coefficients (β):\n", best_result['beta'])

Best learning rate: 0.01
Validation R²: 0.9098183094422969
Test R²: 0.9147434800538763
Regression coefficients (β):
 [[1232562.51254919]
 [ 230048.76664688]
 [ 163686.93503606]
 [ 121406.94107918]
 [   3117.47363933]
 [ 150655.97459714]]


## Q3 - Step 1 & 2: Load Dataset and Handle Missing Values


In [11]:
# Column names
columns_car = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
               "num_doors", "body_style", "drive_wheels", "engine_location", "wheel_base",
               "length", "width", "height", "curb_weight", "engine_type", "num_cylinders",
               "engine_size", "fuel_system", "bore", "stroke", "compression_ratio",
               "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]

# Load dataset
url_car = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
car_data = pd.read_csv(url_car, names=columns_car, na_values='?')

# Convert numeric columns to float
numeric_cols_car = ["symboling", "normalized_losses", "wheel_base", "length", "width", "height",
                    "curb_weight", "engine_size", "bore", "stroke", "compression_ratio",
                    "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]

for col in numeric_cols_car:
    car_data[col] = pd.to_numeric(car_data[col], errors='coerce')

# Impute missing numeric values
car_data[numeric_cols_car] = car_data[numeric_cols_car].fillna(car_data[numeric_cols_car].mean())

# Drop rows with NaN price
car_data = car_data.dropna(subset=['price'])
car_data['price'] = car_data['price'].astype(float)

## Q3 - Step 3: Convert Non-Numeric Columns to Numeric


In [12]:
# num_doors and num_cylinders
word_to_num = {"two":2, "three":3, "four":4, "five":5, "six":6, "eight":8, "twelve":12}
car_data['num_doors'] = car_data['num_doors'].map(word_to_num)
car_data['num_cylinders'] = car_data['num_cylinders'].map(word_to_num)

# Dummy encoding for body_style and drive_wheels
car_data = pd.get_dummies(car_data, columns=['body_style', 'drive_wheels'], drop_first=True)

# Label encoding for make, aspiration, engine_location, fuel_type
label_cols = ['make', 'aspiration', 'engine_location', 'fuel_type']
le = LabelEncoder()
for col in label_cols:
    car_data[col] = le.fit_transform(car_data[col])

# Fuel system: pfi -> 1, else 0
car_data['fuel_system'] = car_data['fuel_system'].apply(lambda x: 1 if 'pfi' in str(x).lower() else 0)

# Engine type: ohc -> 1, else 0
car_data['engine_type'] = car_data['engine_type'].apply(lambda x: 1 if 'ohc' in str(x).lower() else 0)

## Q3 - Step 4: Divide into Input Features (X) and Output (y) and Scale Features


In [13]:
# Input features and target
X_car = car_data.drop('price', axis=1).values
y_car = car_data['price'].values.reshape(-1,1)

# Scale features
X_car_scaled = scaler.fit_transform(X_car)  # reuse the scaler from Q1/Q2

## Q3 - Step 5: Train Linear Regression and Test Performance


In [14]:
# Ensure all numeric columns in car_data
numeric_cols_car = car_data.drop('price', axis=1).columns

for col in numeric_cols_car:
    car_data[col] = pd.to_numeric(car_data[col], errors='coerce')

# Drop any rows with NaN (safest)
car_data = car_data.dropna(subset=numeric_cols_car)

# Now separate X and y
X_car = car_data.drop('price', axis=1).values
y_car = car_data['price'].values.reshape(-1,1)

# Scale features
X_car_scaled = scaler.fit_transform(X_car)

X_train_car, X_test_car, y_train_car, y_test_car = train_test_split(
    X_car_scaled, y_car, test_size=0.3, random_state=42
)

lr_car = LinearRegression()
lr_car.fit(X_train_car, y_train_car)

y_pred_car = lr_car.predict(X_test_car)
r2_car = r2_score(y_test_car, y_pred_car)
print("Linear Regression R² on test set:", r2_car)

Linear Regression R² on test set: 0.7718101387808747


## Q3 - Step 6: PCA Dimensionality Reduction and Train Linear Regression


In [15]:
# Apply PCA to retain 95% variance
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_car)
X_test_pca = pca.transform(X_test_car)

# Train Linear Regression on PCA features
lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train_car)

# Predict and evaluate
y_pred_pca = lr_pca.predict(X_test_pca)
r2_pca = r2_score(y_test_car, y_pred_pca)
print("Linear Regression R² on test set after PCA:", r2_pca)

Linear Regression R² on test set after PCA: 0.7296545680543669
