In [None]:
# Import all required libraries
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# create synthetic data

In [6]:
# Generate a reproducible synthetic regression problem. With coef=True the call
# returns a third value, which is printed below.
X, y, coef = make_regression(
    n_samples=500,
    n_features=8,
    n_informative=5,
    noise=25,
    coef=True,
    random_state=42,
)

# Wrap the feature matrix in a DataFrame (columns x0, x1, ...) and append the
# target as column "y" in a single expression.
df = pd.DataFrame(X, columns=[f"x{i}" for i in range(X.shape[1])]).assign(y=y)

print("True informative coefficients (first 8):")
print(coef)
df.head()

True informative coefficients (first 8):
[97.246139    0.         51.80104314 61.41862443  0.          8.54030302
 48.50179875  0.        ]


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,y
0,-0.483061,-0.891192,0.146793,1.612221,-0.268531,0.895038,-1.581191,0.896839,-11.518604
1,-1.840078,-1.299216,-0.032281,0.640543,-0.113128,2.511557,0.34671,0.123078,-64.071427
2,-2.189922,1.074318,1.194592,0.958386,0.229075,-0.756764,0.232787,0.051661,-74.380493
3,-0.478837,0.617006,-0.862776,0.693479,1.059936,1.097153,-2.153343,-0.392013,-174.745102
4,0.568103,-1.123494,-1.049655,1.362563,3.152057,1.447306,0.506241,1.640615,133.700454


1. **n_samples**: 
    - The number of samples (rows) in the dataset.
    - Example: `n_samples=500` creates 500 data points.

2. **n_features**: 
    - The number of features (columns) in the dataset.
    - Example: `n_features=8` creates 8 features.

3. **n_informative**: 
    - The number of informative features that contribute to the target variable (`y`).
    - Example: `n_informative=5` means 5 features are used to compute `y`.

4. **noise**: 
    - The standard deviation of the Gaussian noise added to the target variable (`y`).
    - Example: `noise=25` adds noise to make the regression problem more challenging.

5. **random_state**: 
    - Controls the randomness of the dataset generation for reproducibility.
    - Example: `random_state=42` ensures the same dataset is generated every time.

6. **coef**: 
    - If `True`, the function also returns the coefficients of the underlying linear model, one per feature (non-informative features get a coefficient of `0`).
    - Example: `coef=True` returns the coefficients alongside the dataset.

#### Returns:
1. **X**: 
    - A numpy array of shape `(n_samples, n_features)` containing the feature matrix.

2. **y**: 
    - A numpy array of shape `(n_samples,)` containing the target variable.

3. **coef** (optional): 
    - A numpy array of shape `(n_features,)` containing the coefficients of the underlying linear model; entries for non-informative features are `0` (returned only if `coef=True`).


# splitting the data

In [8]:
# Hold out 25% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    df.drop(columns="y"), df["y"], test_size=0.25, random_state=42
)

# Baseline: predict the training-set mean for every validation row.
# np.full with an explicit float dtype is used instead of np.full_like(y_val, ...):
# full_like inherits y_val's dtype, so an integer-valued target would silently
# truncate the mean and distort the baseline.
y_pred_mean = np.full(len(y_val), y_train.mean(), dtype=float)
mse_baseline = mean_squared_error(y_val, y_pred_mean)
print(f"Baseline MSE: {mse_baseline:.2f}")

Baseline MSE: 18899.50


In [11]:
# Shapes before splitting
# (generator expressions keep the helper names out of the notebook namespace)
print("\n".join(
    f"Shape of {label} before splitting: {arr.shape}"
    for label, arr in (("X", X), ("y", y))
))

# Shapes after splitting
print("\n".join(
    f"Shape of {label}: {arr.shape}"
    for label, arr in (
        ("X_train", X_train),
        ("X_val", X_val),
        ("y_train", y_train),
        ("y_val", y_val),
    )
))

Shape of X before splitting: (500, 8)
Shape of y before splitting: (500,)
Shape of X_train: (375, 8)
Shape of X_val: (125, 8)
Shape of y_train: (375,)
Shape of y_val: (125,)


# OLS vs Ridge Regression

## build pipeline

In [13]:
# Two pipelines with the same StandardScaler preprocessing step; they differ
# only in the final estimator (LinearRegression vs. Ridge with alpha=10.0).
ols = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("lr", LinearRegression()),
])

ridge = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("ridge", Ridge(alpha=10.0, random_state=42)),
])

## fit the data

In [14]:
# Fit both pipelines on the training split only; X_val / y_val are not used
# here and are reserved for the evaluation cell.
ols.fit(X_train, y_train)
ridge.fit(X_train, y_train)

## predict and evaluate

In [None]:
# Score each fitted pipeline on the training and validation splits and print
# one R² summary line per model.
for name, model in {"OLS": ols, "Ridge(α=10)": ridge}.items():
    y_tr_pred = model.predict(X_train)
    y_va_pred = model.predict(X_val)
    train_r2 = r2_score(y_train, y_tr_pred)
    val_r2 = r2_score(y_val, y_va_pred)
    print(f"{name} -> Train R²: {train_r2:.3f}  Val R²: {val_r2:.3f}")

OLS -> Train R²: 0.967  Val R²: 0.969
Ridge(α=10) -> Train R²: 0.966  Val R²: 0.968
