In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Kept ahead of the scikit-learn imports, as in the original cell order.
warnings.filterwarnings("ignore")

from sklearn.datasets import fetch_california_housing, load_diabetes, fetch_openml
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split


In [None]:
# California Housing dataset (1990 Census), loaded from OpenML because
# sklearn.datasets.fetch_california_housing failed with HTTP 403 in Colab.
# The OpenML copy exposes per-block totals (total_rooms, total_bedrooms,
# population, households), so the per-household averages are derived below to
# mirror the feature set of fetch_california_housing.
# NOTE: the target stays in raw dollars here (e.g. 452600), whereas
# fetch_california_housing reports it in units of $100,000 -- MSE values are
# therefore on a much larger scale than with the sklearn loader.
X, y = fetch_openml(name="california_housing", version=1, as_frame=True, return_X_y=True)

df = X.copy()
df["MedHouseVal"] = y

# Per-household averages derived from the block totals.
df["AveRooms"]  = df["total_rooms"] / df["households"]
df["AveBedrms"] = df["total_bedrooms"] / df["households"]
df["AveOccup"]  = df["population"] / df["households"]

# Keep only the eight modelling features plus the target. The explicit .copy()
# makes the result an independent frame, so later in-place edits do not hit
# pandas' chained-assignment (SettingWithCopy) path.
feature_cols = [
    "median_income",
    "housing_median_age",
    "AveRooms",
    "AveBedrms",
    "population",
    "AveOccup",
    "latitude",
    "longitude",
]
df = df[feature_cols + ["MedHouseVal"]].copy()

In [None]:
# Preview the assembled frame: eight feature columns plus the MedHouseVal target.
df

Unnamed: 0,median_income,housing_median_age,AveRooms,AveBedrms,population,AveOccup,latitude,longitude,MedHouseVal
0,8.3252,41,6.984127,1.023810,322,2.555556,37.88,-122.23,452600
1,8.3014,21,6.238137,0.971880,2401,2.109842,37.86,-122.22,358500
2,7.2574,52,8.288136,1.073446,496,2.802260,37.85,-122.24,352100
3,5.6431,52,5.817352,1.073059,558,2.547945,37.85,-122.25,341300
4,3.8462,52,6.281853,1.081081,565,2.181467,37.85,-122.25,342200
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25,5.045455,1.133333,845,2.560606,39.48,-121.09,78100
20636,2.5568,18,6.114035,1.315789,356,3.122807,39.49,-121.21,77100
20637,1.7000,17,5.205543,1.120092,1007,2.325635,39.43,-121.22,92300
20638,1.8672,18,5.329513,1.171920,741,2.123209,39.43,-121.32,84700


In [None]:
# Drop every row that contains a missing value. Rebinding `df` (rather than
# using inplace=True) keeps the cell free of in-place mutation and avoids
# pandas' chained-assignment warnings on frames derived by selection.
df = df.dropna()

In [None]:
# Separate the regression target from the predictor columns.
y = df["MedHouseVal"]
X = df.drop(columns="MedHouseVal")

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline: ordinary least squares on the features as-is (no scaling step).
lr = LinearRegression().fit(X_train, y_train)

# Predictions on the training split and on the held-out split.
train_pred, test_pred = lr.predict(X_train), lr.predict(X_test)

# MSE is in squared target units (the target is in dollars), hence the large values.
print("Training MSE:", mean_squared_error(y_train, train_pred))
print("Test MSE:", mean_squared_error(y_test, test_pred))
print("Coefficients:", lr.coef_)


Training MSE: 5162710048.331072
Test MSE: 5645922500.077655
Coefficients: [ 4.44694449e+04  9.58332354e+02 -1.20980333e+04  7.78406984e+04
 -5.20319679e-01 -3.44070850e+02 -4.20293189e+04 -4.32443933e+04]


In [None]:
# Ridge regression: pick alpha by 5-fold cross-validation on negated MSE.
# NOTE(review): the features are not standardized before the penalized fit, and
# the recorded run selected the smallest alpha in the grid -- consider adding a
# scaling step and/or extending the grid before drawing conclusions.
params = {"alpha": [0.01, 0.1, 1, 10, 100]}
ridge = Ridge()

ridge_cv = GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring="neg_mean_squared_error",
    cv=5,
)
ridge_cv.fit(X_train, y_train)

# With the default refit behaviour, best_estimator_ is the winning
# configuration refit on the whole training split.
best_ridge = ridge_cv.best_estimator_

print("Best alpha (Ridge):", ridge_cv.best_params_["alpha"])
for label, features, target in (
    ("Train MSE:", X_train, y_train),
    ("Test MSE:", X_test, y_test),
):
    print(label, mean_squared_error(target, best_ridge.predict(features)))


Best alpha (Ridge): 0.01
Train MSE: 5162710048.420575
Test MSE: 5645913864.000843


In [None]:
# Lasso regression: pick alpha by 5-fold cross-validation on negated MSE.
# max_iter is raised from the default for the coordinate-descent fit
# (presumably to avoid convergence warnings -- confirm).
# NOTE(review): the features are not standardized before the penalized fit, and
# the recorded run selected the smallest alpha in the grid -- consider adding a
# scaling step and/or extending the grid before drawing conclusions.
params = {"alpha": [0.001, 0.01, 0.1, 1, 10]}
lasso = Lasso(max_iter=10000)

lasso_cv = GridSearchCV(
    estimator=lasso,
    param_grid=params,
    scoring="neg_mean_squared_error",
    cv=5,
)
lasso_cv.fit(X_train, y_train)

# With the default refit behaviour, best_estimator_ is the winning
# configuration refit on the whole training split.
best_lasso = lasso_cv.best_estimator_

print("Best alpha (Lasso):", lasso_cv.best_params_["alpha"])
for label, features, target in (
    ("Train MSE:", X_train, y_train),
    ("Test MSE:", X_test, y_test),
):
    print(label, mean_squared_error(target, best_lasso.predict(features)))


Best alpha (Lasso): 0.001
Train MSE: 5162710048.331128
Test MSE: 5645922299.485113


In [None]:
# Print the fitted weights of the tuned Ridge and Lasso models, one heading
# line followed by the coefficient array for each.
print("Ridge Coefficients:", best_ridge.coef_, sep="\n")
print("\nLasso Coefficients:", best_lasso.coef_, sep="\n")


Ridge Coefficients:
[ 4.44692826e+04  9.58334040e+02 -1.20977262e+04  7.78389796e+04
 -5.20315366e-01 -3.44070425e+02 -4.20293109e+04 -4.32443645e+04]

Lasso Coefficients:
[ 4.44694410e+04  9.58332412e+02 -1.20980258e+04  7.78406569e+04
 -5.20319487e-01 -3.44070834e+02 -4.20293176e+04 -4.32443914e+04]


CLASSIFICATION (BREAST CANCER)

In [None]:
# Breast-cancer classification data, returned as (features, labels).
# load_breast_cancer is imported in the notebook's top import cell.
# Note: this rebinds X, y and the four split variables that the regression
# cells used, so those cells must be re-run before revisiting the housing models.
X, y = load_breast_cancer(return_X_y=True)

# Same 80/20 hold-out and seed as the regression section.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Baseline logistic regression with default regularization; only max_iter is
# raised (presumably so the solver converges on the unscaled features -- confirm).
log_reg = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Class predictions on the training split and on the held-out split.
train_pred, test_pred = log_reg.predict(X_train), log_reg.predict(X_test)

print("Training Accuracy:", accuracy_score(y_train, train_pred))
print("Test Accuracy:", accuracy_score(y_test, test_pred))
print("Coefficients:", log_reg.coef_)


Training Accuracy: 0.9582417582417583
Test Accuracy: 0.956140350877193
Coefficients: [[ 1.0274368   0.22145051 -0.36213488  0.0254667  -0.15623532 -0.23771256
  -0.53255786 -0.28369224 -0.22668189 -0.03649446 -0.09710208  1.3705667
  -0.18140942 -0.08719575 -0.02245523  0.04736092 -0.04294784 -0.03240188
  -0.03473732  0.01160522  0.11165329 -0.50887722 -0.01555395 -0.016857
  -0.30773117 -0.77270908 -1.42859535 -0.51092923 -0.74689363 -0.10094404]]


In [None]:
# Tune the inverse regularization strength C and the penalty type with 5-fold
# cross-validation on accuracy. The solver is pinned to liblinear, which
# accepts both the l1 and l2 penalties searched here.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=["l1", "l2"],
    solver=["liblinear"],
)

grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=10000),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
)
grid.fit(X_train, y_train)

# Winning configuration, refit on the whole training split by GridSearchCV.
best_model = grid.best_estimator_

print("Best Parameters are:", grid.best_params_)
print("Test Accuracy:", accuracy_score(y_test, best_model.predict(X_test)))


Best Parameters are: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Test Accuracy: 0.9824561403508771


In [None]:
# Compare L1 and L2 penalties at the same C (the value chosen by the grid search).
# NOTE(review): C was tuned jointly with the penalty and the recorded winner
# was the l1 setting, so this C is not necessarily the best one for l2.
shared_kwargs = dict(
    C=grid.best_params_["C"],
    solver="liblinear",
    max_iter=10000,
)

# fit() returns the estimator itself, so construction and fitting are chained.
log_l1 = LogisticRegression(penalty="l1", **shared_kwargs).fit(X_train, y_train)
log_l2 = LogisticRegression(penalty="l2", **shared_kwargs).fit(X_train, y_train)

# Report train and test accuracy for each penalty, L1 first.
for clf, train_label, test_label in (
    (log_l1, "L1 Train Accuracy:", "L1 Test Accuracy:"),
    (log_l2, "L2 Train Accuracy:", "L2 Test Accuracy:"),
):
    print(train_label, accuracy_score(y_train, clf.predict(X_train)))
    print(test_label, accuracy_score(y_test, clf.predict(X_test)))


L1 Train Accuracy: 0.989010989010989
L1 Test Accuracy: 0.9824561403508771
L2 Train Accuracy: 0.9692307692307692
L2 Test Accuracy: 0.956140350877193


In [None]:
# Print the fitted weights for both penalties so they can be compared by eye
# (in the recorded run the L1 model has several weights at exactly zero).
print("L1 Coefficients:", log_l1.coef_, sep="\n")
print("\nL2 Coefficients:", log_l2.coef_, sep="\n")



L1 Coefficients:
[[ 7.70243294e-01 -1.08635267e-01  1.02354746e-01 -2.83567013e-03
   0.00000000e+00  4.71692791e+01 -1.13884800e+01 -1.36601720e+02
   1.97449587e+01  0.00000000e+00  0.00000000e+00  1.72741076e+00
   0.00000000e+00 -1.97754037e-01  0.00000000e+00  0.00000000e+00
   4.99901466e+01  0.00000000e+00  1.85133706e+01  0.00000000e+00
   1.67887791e-01 -4.38501624e-01  5.30416456e-02 -2.03706039e-02
  -2.18085107e+01  8.05710976e+00 -1.47626505e+01 -2.51696219e+01
  -2.52434340e+01  0.00000000e+00]]

L2 Coefficients:
[[ 5.40272741  0.26573732 -0.52651203 -0.02095479 -2.29922151 -0.2169419
  -3.56980218 -5.0110607  -2.26418385  0.36762747 -0.58036525  3.84169255
  -0.63875637 -0.10714305 -0.40591246  3.57237002  4.36577514 -0.26636925
   0.39354765  0.62630698 -0.20718582 -0.68999884  0.17661962 -0.01813087
  -4.67716843 -0.0140846  -4.46493229 -7.61271125 -6.83571871  0.57463274]]
