In [1]:
# Q1 - Ridge Regression using Gradient Descent

import numpy as np
from sklearn.metrics import r2_score

# Generate correlated dataset
np.random.seed(42)
X = np.random.rand(100, 7)
X[:, 1] = X[:, 0] + np.random.normal(0, 0.02, 100)
X[:, 2] = X[:, 0] + np.random.normal(0, 0.03, 100)
y = 3*X[:,0] + 2*X[:,1] + np.random.normal(0, 0.1, 100)

# Scale features for stable gradient descent
X = (X - X.mean(axis=0)) / X.std(axis=0)
y = (y - y.mean()) / y.std()

# Ridge Regression using Gradient Descent
def ridge_regression(X, y, lr, lam, epochs=1000):
    n, m = X.shape
    X_b = np.c_[np.ones((n,1)), X]
    theta = np.zeros((m+1,1))
    y = y.reshape(-1,1)
    for _ in range(epochs):
        gradients = (1/n)*X_b.T.dot(X_b.dot(theta)-y) + (lam/n)*np.r_[[[0]],theta[1:]]
        theta -= lr * gradients
        theta = np.clip(theta, -1e5, 1e5)
    y_pred = X_b.dot(theta)
    cost = np.mean((y_pred - y)**2) + lam*np.sum(theta[1:]**2)
    return theta, cost, r2_score(y, y_pred)

learning_rates = [0.0001, 0.001, 0.01, 0.1]
lambdas = [1e-5, 1e-3, 0, 1, 10]

best = (None, float('inf'), -1)
for lr in learning_rates:
    for lam in lambdas:
        theta, cost, r2 = ridge_regression(X, y, lr, lam)
        if np.isfinite(cost) and np.isfinite(r2):
            if cost < best[1] and r2 > best[2]:
                best = (f"lr={lr}, lam={lam}", cost, r2)

print("# Q1 Result:")
print("Best Params:", best[0])
print("Min Cost:", best[1])
print("Max R2:", best[2])

# Q1 Result:
Best Params: lr=0.1, lam=0
Min Cost: 0.004108117326630319
Max R2: 0.9958918826733697


In [None]:
# Q2 - Linear, Ridge, and Lasso Regression on Hitters Dataset

# (a) Pre-process the data (null values, noise, categorical to numerical encoding)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error

# Load the Hitters dataset from Google Drive
# (Note: The dataset link provided in the assignment is no longer accessible)

url = "https://drive.google.com/uc?id=1qzCKF6JKKMB0p7ul_lLy8tdmRk3vE_bG"
data = pd.read_csv(url)

# (a) Handle missing values and convert categorical to numeric
data = data.dropna()
for col in data.select_dtypes(include='object'):
    data[col] = LabelEncoder().fit_transform(data[col])

# (b) Separate input and output features and perform scaling
X = data.drop('Salary', axis=1)
y = data['Salary']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# (c) Fit Linear, Ridge, and Lasso regression models
# Regularization parameter (alpha) = 0.5748
linear_model = LinearRegression()
ridge_model = Ridge(alpha=0.5748)
lasso_model = Lasso(alpha=0.5748)

linear_model.fit(X_train, y_train)
ridge_model.fit(X_train, y_train)
lasso_model.fit(X_train, y_train)

# (d) Evaluate model performance
linear_pred = linear_model.predict(X_test)
ridge_pred = ridge_model.predict(X_test)
lasso_pred = lasso_model.predict(X_test)

linear_r2 = r2_score(y_test, linear_pred)
ridge_r2 = r2_score(y_test, ridge_pred)
lasso_r2 = r2_score(y_test, lasso_pred)

linear_mse = mean_squared_error(y_test, linear_pred)
ridge_mse = mean_squared_error(y_test, ridge_pred)
lasso_mse = mean_squared_error(y_test, lasso_pred)

# Print the results (you don't need to run this cell)
print("# Q2 Results:")
print("Linear Regression -> R2:", linear_r2, " MSE:", linear_mse)
print("Ridge Regression  -> R2:", ridge_r2, " MSE:", ridge_mse)
print("Lasso Regression  -> R2:", lasso_r2, " MSE:", lasso_mse)

In [2]:
# Q3 - Cross Validation for Ridge and Lasso Regression

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import pandas as pd

# Load California Housing dataset (modern replacement for Boston)
data = fetch_california_housing()
X, y = data.data, data.target

# Split and scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# RidgeCV
ridgecv = RidgeCV(alphas=[0.01, 0.1, 1, 10, 100], cv=5)
ridgecv.fit(X_train, y_train)
ridge_y_pred = ridgecv.predict(X_test)

# LassoCV
lassocv = LassoCV(alphas=[0.01, 0.1, 1, 10, 100], cv=5, max_iter=10000)
lassocv.fit(X_train, y_train)
lasso_y_pred = lassocv.predict(X_test)

# Evaluation
results = {
    "Model": ["RidgeCV", "LassoCV"],
    "Best Alpha": [ridgecv.alpha_, lassocv.alpha_],
    "R2 Score": [r2_score(y_test, ridge_y_pred), r2_score(y_test, lasso_y_pred)],
    "MSE": [mean_squared_error(y_test, ridge_y_pred), mean_squared_error(y_test, lasso_y_pred)]
}

print("# Q3 Results:")
print(pd.DataFrame(results))

# Q3 Results:
     Model  Best Alpha  R2 Score       MSE
0  RidgeCV        0.01  0.575788  0.555891
1  LassoCV        0.01  0.581615  0.548255


In [3]:
# Q4 - Multiclass Logistic Regression using One-vs-Rest

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split and scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train One-vs-Rest Logistic Regression
model = LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate
print("# Q4 Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=iris.target_names))

# Q4 Results:
Accuracy: 0.9666666666666667

Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      0.89      0.94         9
   virginica       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



