In [3]:
#Que 1

import numpy as np
import pandas as pd

# Step 1: Set a random seed for reproducibility
np.random.seed(42)

# Step 2: Define parameters
n_samples = 500      # number of data points
n_features = 7       # number of features

# Step 3: Create a covariance matrix to make features correlated
base_corr = 0.9
cov = np.full((n_features, n_features), base_corr)
np.fill_diagonal(cov, 1.0)  # diagonal = 1 (self-correlation)

# Step 4: Generate multivariate normal data (highly correlated)
mean = np.zeros(n_features)
X = np.random.multivariate_normal(mean, cov, size=n_samples)

# Step 5: Define true coefficients and bias
true_weights = np.array([2.5, -1.8, 1.2, 0.8, 0.5, 1.5, -0.7])
bias = 3.0

# Step 6: Generate target variable with some noise They define how the target variable (y) 
#is generated from your features (X) using a linear relationship + randomness
noise = np.random.normal(0, 1.5, size=n_samples)
y = X.dot(true_weights) + bias + noise

# Step 7: Create DataFrame
columns = [f'Feature_{i+1}' for i in range(n_features)]
df = pd.DataFrame(X, columns=columns)
df['Target'] = y

# Step 8: Display correlation matrix
print("Feature Correlation Matrix:")
print(df.corr())

# Step 9: Save dataset
df.to_csv("highly_correlated_dataset.csv", index=False)
print("\nDataset saved as 'highly_correlated_dataset.csv'")

Feature Correlation Matrix:
           Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  \
Feature_1   1.000000   0.898282   0.898364   0.896560   0.903394   0.897403   
Feature_2   0.898282   1.000000   0.900876   0.894252   0.905883   0.893929   
Feature_3   0.898364   0.900876   1.000000   0.906104   0.898374   0.896277   
Feature_4   0.896560   0.894252   0.906104   1.000000   0.899405   0.903239   
Feature_5   0.903394   0.905883   0.898374   0.899405   1.000000   0.917638   
Feature_6   0.897403   0.893929   0.896277   0.903239   0.917638   1.000000   
Feature_7   0.890928   0.901957   0.897089   0.904769   0.910133   0.902587   
Target      0.898816   0.790093   0.865008   0.863328   0.856112   0.875903   

           Feature_7    Target  
Feature_1   0.890928  0.898816  
Feature_2   0.901957  0.790093  
Feature_3   0.897089  0.865008  
Feature_4   0.904769  0.863328  
Feature_5   0.910133  0.856112  
Feature_6   0.902587  0.875903  
Feature_7   1.000000  0.81345

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Step 10: Load the dataset you just saved
df = pd.read_csv("highly_correlated_dataset.csv")
X = df.drop("Target", axis=1).values
y = df["Target"].values

# Step 11: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 12: Feature scaling (important for gradient descent stability)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 13: Add bias column
def add_bias(X):
    return np.hstack([np.ones((X.shape[0], 1)), X])

X_train_b = add_bias(X_train)
X_test_b = add_bias(X_test)

# Step 14: Ridge cost function
def ridge_cost(X, y, w, lam):
    n = X.shape[0]
    residuals = X @ w - y
    mse = (residuals @ residuals) / (2 * n)
    reg = (lam / (2 * n)) * np.sum(w[1:] ** 2)  # exclude bias from regularization
    return mse + reg

# Step 15: Gradient function
def ridge_grad(X, y, w, lam):
    n = X.shape[0]
    residuals = X @ w - y
    grad = (X.T @ residuals) / n
    grad[1:] += (lam / n) * w[1:]  # exclude bias term
    return grad

# Step 16: Gradient Descent implementation
def ridge_gd(X, y, alpha=0.001, lam=1.0, iters=5000, tol=1e-8):
    w = np.zeros(X.shape[1])
    history = []
    for t in range(iters):
        cost = ridge_cost(X, y, w, lam)
        history.append(cost)
        grad = ridge_grad(X, y, w, lam)
        w -= alpha * grad
        if t > 0 and abs(history[-2] - history[-1]) < tol:
            break
    return w, history

# Step 17: Hyperparameter sweep
learning_rates = [0.0001, 0.001, 0.01, 0.1]  # safer set
lambdas = [1e-15, 1e-10, 1e-5, 1e-3, 0, 1, 10, 20]

best_result = None
results = []

for alpha in learning_rates:
    for lam in lambdas:
        w, hist = ridge_gd(X_train_b, y_train, alpha=alpha, lam=lam, iters=10000)
        y_pred_train = X_train_b @ w
        y_pred_test = X_test_b @ w
        r2_train = r2_score(y_train, y_pred_train)
        r2_test = r2_score(y_test, y_pred_test)
        final_cost = ridge_cost(X_train_b, y_train, w, lam)
        results.append((alpha, lam, final_cost, r2_train, r2_test))
        if best_result is None or r2_test > best_result[4]:
            best_result = (alpha, lam, final_cost, r2_train, r2_test)

# Step 18: Print best parameters
print("\nBest Parameters:")
print(f"Learning Rate: {best_result[0]}")
print(f"Lambda: {best_result[1]}")
print(f"Final Cost: {best_result[2]:.4f}")
print(f"Train R2: {best_result[3]:.4f}")
print(f"Test R2: {best_result[4]:.4f}")


Best Parameters:
Learning Rate: 0.01
Lambda: 1
Final Cost: 1.2196
Train R2: 0.8704
Test R2: 0.8766


In [5]:
#Que 2

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error

# Step 1: Load dataset
df = pd.read_csv("Hitters (1).csv")

# Step 2: Handle missing values
df = df.dropna()   # simplest approach: drop rows with nulls
# Alternatively: df.fillna(df.mean(), inplace=True) for numeric columns

# Step 3: Separate features and target
X = df.drop("Salary", axis=1)   # assuming 'Salary' is the target column
y = df["Salary"]

# Step 4: Identify categorical and numeric columns
categorical_cols = X.select_dtypes(include=["object"]).columns
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns

# Step 5: Preprocessing (encoding + scaling)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(drop="first"), categorical_cols)
    ]
)

# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Define models
models = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(alpha=0.5748),
    "Lasso": Lasso(alpha=0.5748)
}

# Step 8: Train and evaluate
results = {}
for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor),
                           ("regressor", model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    results[name] = {"R2": r2, "MSE": mse}

# Step 9: Display results
print("\nModel Performance on Test Set:")
for name, metrics in results.items():
    print(f"{name}: R2={metrics['R2']:.4f}, MSE={metrics['MSE']:.2f}")


Model Performance on Test Set:
Linear: R2=0.2907, MSE=128284.35
Ridge: R2=0.3007, MSE=126484.39
Lasso: R2=0.3006, MSE=126504.31


  model = cd_fast.enet_coordinate_descent(
