<a href="https://colab.research.google.com/github/Gurjot-Singh-2002/UML501-Lab-Assignments/blob/main/Assignment%204/102203582_3CO14_ODD2024_A4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Q1

In [27]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [28]:
# 1. Generate the dataset
np.random.seed(42)
X, y = make_regression(n_samples=100, n_features=7, noise=0.1, random_state=42)

In [29]:
# Introduce high correlation between features
X[:, 1] = X[:, 0] * 0.5 + np.random.normal(0, 0.1, X[:, 1].shape[0])
X[:, 2] = X[:, 0] * 0.3 + np.random.normal(0, 0.1, X[:, 2].shape[0])
X[:, 3] = X[:, 1] * 0.5 + np.random.normal(0, 0.1, X[:, 3].shape[0])
X[:, 4] = X[:, 2] * 0.4 + np.random.normal(0, 0.1, X[:, 4].shape[0])
X[:, 5] = X[:, 3] * 0.2 + np.random.normal(0, 0.1, X[:, 5].shape[0])
X[:, 6] = X[:, 0] * 0.4 + np.random.normal(0, 0.1, X[:, 6].shape[0])


In [30]:
# Check if any NaNs are in the dataset
if np.any(np.isnan(X)) or np.any(np.isnan(y)):
    raise ValueError("Dataset contains NaN values. Please check the data.")

In [31]:
# Split dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# 2. Ridge Regression using Gradient Descent
def ridge_gradient_descent(X, y, lambda_reg=0.1, learning_rate=0.01, iterations=1000):
    m, n = X.shape
    X = np.c_[np.ones((m, 1)), X]  # Add bias term (x0 = 1)
    theta = np.zeros((n+1, 1))  # Initialize theta (including bias term)
    y = y.reshape(-1, 1)  # Convert y to column vector

    for i in range(iterations):
        y_pred = X.dot(theta)  # Predicted values
        gradient = (1/m) * X.T.dot(y_pred - y) + (lambda_reg/m) * np.r_[[[0]], theta[1:]]  # Ridge gradient
        theta -= learning_rate * gradient  # Update theta

        # Preventing NaNs due to large updates
        if np.any(np.isnan(theta)):
            raise ValueError(f"NaN encountered in theta at iteration {i}. Likely instability due to high learning rate or lambda.")

    return theta

In [37]:
# 3. Evaluate the model using R² Score and Cost Function
def evaluate_model(X_train, y_train, X_test, y_test, lambda_reg, learning_rate, iterations=1000):
    theta = ridge_gradient_descent(X_train, y_train, lambda_reg, learning_rate, iterations)

    # Adding bias term for predictions
    X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

    y_train_pred = X_train_bias.dot(theta)
    y_test_pred = X_test_bias.dot(theta)

    # R² score
    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)

    # Cost function (MSE + regularization term)
    cost = (1/(2*X_train.shape[0])) * np.sum((y_train_pred - y_train) ** 2) + (lambda_reg/(2*X_train.shape[0])) * np.sum(theta[1:] ** 2)

    return r2_train, r2_test, cost


In [39]:
# 4. Experiment with different hyperparameters
learning_rates = [0.0001, 0.001, 0.01, 0.1, 1]
lambda_values = [1e-15, 1e-10, 1e-5, 1e-3, 0, 1, 10, 20]

best_r2_train = -np.inf
best_r2_test = -np.inf
best_cost = np.inf
best_params = {}

for lr in learning_rates:
    for lambda_reg in lambda_values:
        try:
            r2_train, r2_test, cost = evaluate_model(X_train, y_train, X_test, y_test, lambda_reg, lr)

            if cost < best_cost:
                best_r2_train = r2_train
                best_r2_test = r2_test
                best_cost = cost
                best_params = {'learning_rate': lr, 'lambda_reg': lambda_reg}
        except ValueError as e:
            print(f"Error for lr={lr} and lambda={lambda_reg}: {e}")
            continue

# Output the best parameters
print("Best parameters found:")
print(f"Learning rate: {best_params['learning_rate']}")
print(f"Regularization parameter (lambda): {best_params['lambda_reg']}")
print(f"R^2 on training data: {best_r2_train}")
print(f"R^2 on test data: {best_r2_test}")
print(f"Cost function value: {best_cost}")

Best parameters found:
Learning rate: 0.0001
Regularization parameter (lambda): 20
R^2 on training data: 0.0972728687301243
R^2 on test data: 0.04966834120804697
Cost function value: 602408.559835469


#Q2

Part (a)

In [43]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import requests
from io import BytesIO
import zipfile


In [44]:
# Step 2: Load the dataset
hitters_df = pd.read_csv('Hitters.csv')

In [45]:
# Step 3: Inspect the dataset
print(hitters_df.head())
print(hitters_df.info())


   AtBat  Hits  HmRun  Runs  RBI  Walks  Years  CAtBat  CHits  CHmRun  CRuns  \
0    293    66      1    30   29     14      1     293     66       1     30   
1    315    81      7    24   38     39     14    3449    835      69    321   
2    479   130     18    66   72     76      3    1624    457      63    224   
3    496   141     20    65   78     37     11    5628   1575     225    828   
4    321    87     10    39   42     30      2     396    101      12     48   

   CRBI  CWalks League Division  PutOuts  Assists  Errors  Salary NewLeague  
0    29      14      A        E      446       33      20     NaN         A  
1   414     375      N        W      632       43      10   475.0         N  
2   266     263      A        W      880       82      14   480.0         A  
3   838     354      N        E      200       11       3   500.0         N  
4    46      33      N        E      805       40       4    91.5         N  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3

In [46]:
# Step 4: Handle missing values
# Check for missing values in each column
print(hitters_df.isnull().sum())

AtBat         0
Hits          0
HmRun         0
Runs          0
RBI           0
Walks         0
Years         0
CAtBat        0
CHits         0
CHmRun        0
CRuns         0
CRBI          0
CWalks        0
League        0
Division      0
PutOuts       0
Assists       0
Errors        0
Salary       59
NewLeague     0
dtype: int64


In [49]:
#  Remove missing values rows
hitters_df.dropna(inplace=True)

In [51]:
# Step 5: Handle noise
hitters_df = hitters_df[hitters_df['Salary'] < 20000]

In [52]:
# Step 6: Categorical to numerical encoding (One-Hot Encoding)
# List the categorical columns (usually those with object type)
categorical_cols = hitters_df.select_dtypes(include=['object']).columns
print(f"Categorical columns: {categorical_cols}")

Categorical columns: Index(['League', 'Division', 'NewLeague'], dtype='object')


In [53]:
# Apply One-Hot Encoding for categorical columns
hitters_df = pd.get_dummies(hitters_df, columns=categorical_cols, drop_first=True)

In [55]:
# Step 7: Check the final pre-processed data
print(hitters_df.head())
print(hitters_df.info())

   AtBat  Hits  HmRun  Runs  RBI  Walks  Years  CAtBat  CHits  CHmRun  CRuns  \
1    315    81      7    24   38     39     14    3449    835      69    321   
2    479   130     18    66   72     76      3    1624    457      63    224   
3    496   141     20    65   78     37     11    5628   1575     225    828   
4    321    87     10    39   42     30      2     396    101      12     48   
5    594   169      4    74   51     35     11    4408   1133      19    501   

   CRBI  CWalks  PutOuts  Assists  Errors  Salary  League_N  Division_W  \
1   414     375      632       43      10   475.0      True        True   
2   266     263      880       82      14   480.0     False        True   
3   838     354      200       11       3   500.0      True       False   
4    46      33      805       40       4    91.5      True       False   
5   336     194      282      421      25   750.0     False        True   

   NewLeague_N  
1         True  
2        False  
3         True  


Part (b)

In [56]:
from sklearn.preprocessing import StandardScaler


In [57]:
# Step 1: Separate the input (X) and output (y) features
# The target variable is typically 'Salary', so we separate it
X = hitters_df.drop(columns=['Salary'])  # Input features
y = hitters_df['Salary']  # Output feature (target)

In [58]:
# Step 2: Perform Scaling on input features (Standard Scaling)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # This will scale the input features

In [59]:
# Convert scaled data back to a DataFrame for easier inspection
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

In [61]:
# Step 3: Check the scaled input features
print(X_scaled_df.head())

      AtBat      Hits     HmRun      Runs       RBI     Walks     Years  \
0 -0.602900 -0.595675 -0.528551 -1.206112 -0.522063 -0.097527  1.397893   
1  0.512542  0.492260  0.729966  0.441515  0.794060  1.609373 -0.901200   
2  0.628167  0.736490  0.958788  0.402286  1.026317 -0.189792  0.770868   
3 -0.562092 -0.462459 -0.185319 -0.617673 -0.367225 -0.512719 -1.110209   
4  1.294712  1.358167 -0.871783  0.755349 -0.018840 -0.282057  0.770868   

     CAtBat     CHits    CHmRun     CRuns      CRBI    CWalks   PutOuts  \
0  0.346791  0.174373 -0.002920 -0.121671  0.258966  0.435334  1.221499   
1 -0.452865 -0.409892 -0.076054 -0.415105 -0.199590  0.010373  2.109109   
2  1.301558  1.318174  1.898565  1.412051  1.572666  0.355654 -0.324661   
3 -0.990935 -0.960153 -0.697693 -0.947521 -0.881228 -0.862315  1.840678   
4  0.766993  0.634985 -0.612370  0.422846  0.017294 -0.251434 -0.031177   

    Assists    Errors  League_N  Division_W  NewLeague_N  
0 -0.523191  0.213352  1.058758    0.98

Part (c)

In [62]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [63]:
# Step 1: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y, test_size=0.2, random_state=42)

In [64]:
# Step 2: Initialize the models
linear_model = LinearRegression()
ridge_model = Ridge(alpha=0.5748)
lasso_model = Lasso(alpha=0.5748)

In [65]:
# Step 3: Fit the models on the training data
linear_model.fit(X_train, y_train)
ridge_model.fit(X_train, y_train)
lasso_model.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [66]:
# Step 4: Predict using the trained models
y_pred_linear = linear_model.predict(X_test)
y_pred_ridge = ridge_model.predict(X_test)
y_pred_lasso = lasso_model.predict(X_test)

In [68]:
# Step 5: Calculate R² and MSE for each model
r2_linear = r2_score(y_test, y_pred_linear)
r2_ridge = r2_score(y_test, y_pred_ridge)
r2_lasso = r2_score(y_test, y_pred_lasso)

mse_linear = mean_squared_error(y_test, y_pred_linear)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)

In [69]:
# Step 6: Display the results
print("Model Performance Comparison:")
print(f"Linear Regression: R² = {r2_linear:.4f}, MSE = {mse_linear:.4f}")
print(f"Ridge Regression (alpha=0.5748): R² = {r2_ridge:.4f}, MSE = {mse_ridge:.4f}")
print(f"Lasso Regression (alpha=0.5748): R² = {r2_lasso:.4f}, MSE = {mse_lasso:.4f}")

Model Performance Comparison:
Linear Regression: R² = 0.2907, MSE = 128284.3455
Ridge Regression (alpha=0.5748): R² = 0.2998, MSE = 126648.5942
Lasso Regression (alpha=0.5748): R² = 0.2991, MSE = 126779.4664


Part (d)

In [70]:
# Step 1: Evaluate the performance of each trained model on the test set
r2_linear_test = r2_score(y_test, y_pred_linear)
r2_ridge_test = r2_score(y_test, y_pred_ridge)
r2_lasso_test = r2_score(y_test, y_pred_lasso)

mse_linear_test = mean_squared_error(y_test, y_pred_linear)
mse_ridge_test = mean_squared_error(y_test, y_pred_ridge)
mse_lasso_test = mean_squared_error(y_test, y_pred_lasso)

In [72]:
# Step 2: Print the performance metrics for the test set
print("Test Set Performance Comparison:")
print(f"Linear Regression: R² = {r2_linear_test:.4f}, MSE = {mse_linear_test:.4f}")
print(f"Ridge Regression (alpha=0.5748): R² = {r2_ridge_test:.4f}, MSE = {mse_ridge_test:.4f}")
print(f"Lasso Regression (alpha=0.5748): R² = {r2_lasso_test:.4f}, MSE = {mse_lasso_test:.4f}")

Test Set Performance Comparison:
Linear Regression: R² = 0.2907, MSE = 128284.3455
Ridge Regression (alpha=0.5748): R² = 0.2998, MSE = 126648.5942
Lasso Regression (alpha=0.5748): R² = 0.2991, MSE = 126779.4664


In [73]:
# Step 3: Determine which model performs the best based on R² and MSE
performance_comparison = {
    "Linear Regression": {"R²": r2_linear_test, "MSE": mse_linear_test},
    "Ridge Regression": {"R²": r2_ridge_test, "MSE": mse_ridge_test},
    "Lasso Regression": {"R²": r2_lasso_test, "MSE": mse_lasso_test}
}

In [74]:
best_model = min(performance_comparison, key=lambda model: performance_comparison[model]["MSE"])
best_model_r2 = max(performance_comparison, key=lambda model: performance_comparison[model]["R²"])

In [75]:
print("\nBest model based on R²:", best_model_r2)
print("Best model based on MSE:", best_model)


Best model based on R²: Ridge Regression
Best model based on MSE: Ridge Regression


#Q3

In [77]:
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [80]:
# Step 2: Load the dataset
boston_df = pd.read_csv('Boston_Housing.csv')

In [89]:
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [81]:
# Step 2: Separate input features and target variable
X = boston_df.iloc[:, :-1].values  # All columns except the last one (input features)
y = boston_df.iloc[:, -1].values   # Last column (target variable - house prices)

In [82]:
# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [83]:
# Step 4: Perform scaling of the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [84]:
# Step 5: Implement RidgeCV (Ridge Regression with Cross-Validation)
ridge_model = RidgeCV(alphas=np.logspace(-6, 6, 13), store_cv_values=True)  # Log scale for alpha values
ridge_model.fit(X_train_scaled, y_train)



In [85]:
# Step 6: Implement LassoCV (Lasso Regression with Cross-Validation)
lasso_model = LassoCV(alphas=np.logspace(-6, 6, 13), cv=10)  # 10-fold cross-validation
lasso_model.fit(X_train_scaled, y_train)

In [86]:
# Step 7: Predict using the trained models
y_pred_ridge = ridge_model.predict(X_test_scaled)
y_pred_lasso = lasso_model.predict(X_test_scaled)

In [87]:
# Step 8: Calculate performance metrics (R² and MSE)
r2_ridge = r2_score(y_test, y_pred_ridge)
r2_lasso = r2_score(y_test, y_pred_lasso)

mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)

In [88]:
# Step 9: Print the results
print("RidgeCV Model Performance:")
print(f"Best alpha (Ridge): {ridge_model.alpha_}")
print(f"R² (Ridge): {r2_ridge:.4f}")
print(f"MSE (Ridge): {mse_ridge:.4f}")

print("\nLassoCV Model Performance:")
print(f"Best alpha (Lasso): {lasso_model.alpha_}")
print(f"R² (Lasso): {r2_lasso:.4f}")
print(f"MSE (Lasso): {mse_lasso:.4f}")

RidgeCV Model Performance:
Best alpha (Ridge): 10.0
R² (Ridge): 0.6660
MSE (Ridge): 24.4958

LassoCV Model Performance:
Best alpha (Lasso): 0.01
R² (Lasso): 0.6682
MSE (Lasso): 24.3335
