In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tools import add_constant

In [2]:
# Load the exercise workbook into a DataFrame using pandas' default read options.
# NOTE(review): the path is absolute and rooted at "/" — confirm it resolves on
# the machine that re-runs this notebook.
df = pd.read_excel(io="/Exercise 9.3 MLR Data.xlsx")

In [11]:
# Separate the response column from the candidate predictors:
# `y` is the "Y" column, `X` is every other column of `df`.
target = "Y"
y = df[target]
X = df.drop(target, axis=1)

In [17]:
# Predictor subset used by the models below, with the matching response series.
selected_features = [
    'x1', 'x8', 'x10', 'x12',
    'x13', 'x16', 'x18', 'x23',
]
X_selected = df.loc[:, selected_features]
y = df.loc[:, 'Y']

In [20]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# statsmodels OLS does not add an intercept on its own, so add one explicitly.
# has_constant="add" forces the column to be added: the default ("skip") returns
# the data unchanged when some column happens to be constant within the subset,
# which would leave the validation design matrix one column short of the
# training one and break model.predict.
X_train_const = sm.add_constant(X_train, has_constant="add")
X_val_const = sm.add_constant(X_val, has_constant="add")

# Fit on the training rows only, then score on the held-out rows.
model = sm.OLS(y_train, X_train_const).fit()
y_pred = model.predict(X_val_const)

r2 = r2_score(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print("Validation R²:", r2)
print("Validation RMSE:", rmse)
# The summary reports in-sample (training) statistics, not validation ones.
print(model.summary())

Validation R²: 0.45699264483834157
Validation RMSE: 10.353680603123577
                            OLS Regression Results                            
Dep. Variable:                      Y   R-squared:                       0.398
Model:                            OLS   Adj. R-squared:                  0.334
Method:                 Least Squares   F-statistic:                     6.204
Date:                Thu, 26 Jun 2025   Prob (F-statistic):           3.92e-06
Time:                        22:26:24   Log-Likelihood:                -318.02
No. Observations:                  84   AIC:                             654.0
Df Residuals:                      75   BIC:                             675.9
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------

In [21]:
# Refit the OLS model on every row (no hold-out) with the selected predictors.
# Note that this rebinds `model`, replacing the training-split fit.
X1 = add_constant(X_selected, prepend=True)
full_fit = sm.OLS(endog=y, exog=X1)
model = full_fit.fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      Y   R-squared:                       0.439
Model:                            OLS   Adj. R-squared:                  0.399
Method:                 Least Squares   F-statistic:                     10.86
Date:                Thu, 26 Jun 2025   Prob (F-statistic):           3.32e-11
Time:                        22:26:37   Log-Likelihood:                -450.93
No. Observations:                 120   AIC:                             919.9
Df Residuals:                     111   BIC:                             945.0
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        115.9309     60.920      1.903      0.0

In [19]:
# Lasso for feature selection
from sklearn.linear_model import LassoCV

# Fit on the training rows of the full candidate matrix X (every column except
# the target). Reusing X_train here would be circular: that frame is built from
# X_selected, so it already contains only the features this step is meant to
# choose. The split uses the same test_size/random_state as the model cells, so
# the training rows line up with that split.
X_train_all, _X_val_all, y_train_all, _y_val_all = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# NOTE(review): predictors are not standardized before the L1 fit, so the
# penalty acts differently on columns with different scales — confirm intended.
lasso = LassoCV(cv=5).fit(X_train_all, y_train_all)

# Keep the columns whose Lasso coefficient was not shrunk to exactly zero.
selected = X_train_all.columns[lasso.coef_ != 0]
print("Selected features:", selected.tolist())

Selected features: ['x1', 'x8', 'x10', 'x12', 'x13', 'x16', 'x18', 'x23']


In [25]:
# Ridge
from sklearn.linear_model import RidgeCV

# Candidate penalties on a log grid (0.2-decade steps). A narrower 1e-3..1e3
# grid selected its largest value (1000.0), i.e. the search was cut off by the
# grid edge, so the range extends to 1e6.
ridge_alphas = np.logspace(-3, 6, 46)
ridge = RidgeCV(alphas=ridge_alphas, cv=5)
ridge.fit(X_train, y_train)

# A best alpha on either edge of the grid means the true optimum may lie outside it.
if np.isclose(ridge.alpha_, ridge_alphas.min()) or np.isclose(ridge.alpha_, ridge_alphas.max()):
    print("Warning: best alpha is on the edge of the search grid; widen `ridge_alphas`.")

# Score the cross-validated model on the held-out validation rows.
y_pred_ridge = ridge.predict(X_val)

r2_val = r2_score(y_val, y_pred_ridge)
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_ridge))

print("Ridge Regression Results:")
print("Best Alpha:", ridge.alpha_)
print("Validation R²:", round(r2_val, 4))
print("Validation RMSE:", round(rmse_val, 4))

Ridge Regression Results:
Best Alpha: 1000.0
Validation R²: 0.4314
Validation RMSE: 10.5948
