In [37]:
# Libraries

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tools import add_constant

import warnings
warnings.filterwarnings("ignore")

In [10]:
# Read the exercise workbook into a DataFrame using pandas' default options.
# NOTE(review): this is an absolute path (looks like a Colab upload location);
# it has to be adjusted when the notebook runs anywhere else.
DATA_PATH = "/content/Exercise 11.4 MLR Data.xlsx"
df = pd.read_excel(DATA_PATH)

In [20]:
# Separate the response column from the predictors.
# y is the "Strength" column; X is every other column of df.
target = "Strength"
y = df[target]
X = df.drop(columns=[target])

In [38]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

# Feature selection with cross-validated Lasso.
#
# The train/validation split is created in this cell. Previously the cell read
# X_train / y_train that are only defined by a cell further down, so it raised
# NameError on a fresh kernel ("Restart & Run All"). The split uses the same
# test_size and random_state as the modelling cells, so the features are chosen
# from exactly the rows those cells later train on.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the data: the scaler is fitted on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Lasso with CV (5 folds, fixed seed)
lasso = LassoCV(cv=5, random_state=42, max_iter=10000)
lasso.fit(X_train_scaled, y_train)

# Get selected features: keep the columns whose fitted coefficient is non-zero.
selected_cols = X_train.columns[lasso.coef_ != 0]
print("Selected features:", selected_cols.tolist())

# Filter original DataFrame: all rows of df, selected predictor columns only.
X_selected = df[selected_cols]

Selected features: ['X4', 'X5', 'X6', 'X7', 'X8', 'X10', 'X13', 'X15', 'X16', 'X17', 'X19', 'X20', 'X21', 'X22', 'X23', 'X25', 'X26', 'X29', 'X32', 'X35', 'X37', 'X41', 'X46', 'X47', 'X53', 'X59', 'X62', 'X63', 'X73']


In [46]:
# MLR with Validation (All features)

# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Prepend the intercept column to both design matrices.
X_train_const = sm.add_constant(X_train)
X_val_const = sm.add_constant(X_val)

# Fit on the training rows only.
model = sm.OLS(y_train, X_train_const).fit()

# In-sample and held-out predictions from the same fitted model.
y_train_pred = model.predict(X_train_const)
y_val_pred = model.predict(X_val_const)

r2_train = r2_score(y_train, y_train_pred)
r2_val = r2_score(y_val, y_val_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))

# Report every metric rounded to four decimals, then the full OLS table.
for label, value in (
    ("Training R²:", r2_train),
    ("Training RMSE:", rmse_train),
    ("Validation R²:", r2_val),
    ("Validation RMSE:", rmse_val),
):
    print(label, round(value, 4))
print(model.summary())

Training R²: 0.8176
Training RMSE: 7.754
Validation R²: 0.7779
Validation RMSE: 8.7367
                            OLS Regression Results                            
Dep. Variable:               Strength   R-squared:                       0.818
Model:                            OLS   Adj. R-squared:                  0.802
Method:                 Least Squares   F-statistic:                     50.77
Date:                Thu, 26 Jun 2025   Prob (F-statistic):               0.00
Time:                        23:04:30   Log-Likelihood:                -4105.1
No. Observations:                1184   AIC:                             8404.
Df Residuals:                    1087   BIC:                             8897.
Df Model:                          96                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------

In [39]:
# MLR with Validation (Selected features)

# Same test_size / random_state as the other modelling cells, applied to the
# Lasso-selected columns.
X_train, X_val, y_train, y_val = train_test_split(X_selected, y, test_size=0.3, random_state=42)

X_train_const = sm.add_constant(X_train)
X_val_const = sm.add_constant(X_val)

model = sm.OLS(y_train, X_train_const).fit()
y_train_pred = model.predict(X_train_const)
y_val_pred = model.predict(X_val_const)

# Compute all four metrics from THIS model. The cell used to compute only
# `r2` / `rmse` and then print r2_train, rmse_train, r2_val and rmse_val, which
# were leftovers from whichever cell ran before it, so the printed numbers
# described a different model.
r2_train = r2_score(y_train, y_train_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
r2_val = r2_score(y_val, y_val_pred)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))

# Names this cell defined before the fix, kept in case another cell reads them.
y_pred, r2, rmse = y_val_pred, r2_val, rmse_val

print("Training R²:", round(r2_train, 4))
print("Training RMSE:", round(rmse_train, 4))
print("Validation R²:", round(r2_val, 4))
print("Validation RMSE:", round(rmse_val, 4))
print(model.summary())

Validation R²: 0.770105657657624
Validation RMSE: 8.889384465905565
                            OLS Regression Results                            
Dep. Variable:               Strength   R-squared:                       0.779
Model:                            OLS   Adj. R-squared:                  0.774
Method:                 Least Squares   F-statistic:                     140.6
Date:                Thu, 26 Jun 2025   Prob (F-statistic):               0.00
Time:                        22:53:00   Log-Likelihood:                -4217.7
No. Observations:                1184   AIC:                             8495.
Df Residuals:                    1154   BIC:                             8648.
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------

In [30]:
# MLR without Validation

# Refit on every row (no hold-out) using only the selected predictors.
# sm.add_constant is the same function as the `add_constant` imported from
# statsmodels.tools at the top of the notebook.
X1 = sm.add_constant(X_selected)
model = sm.OLS(endog=y, exog=X1).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               Strength   R-squared:                       0.782
Model:                            OLS   Adj. R-squared:                  0.777
Method:                 Least Squares   F-statistic:                     160.4
Date:                Thu, 26 Jun 2025   Prob (F-statistic):               0.00
Time:                        22:47:57   Log-Likelihood:                -6029.0
No. Observations:                1692   AIC:                         1.213e+04
Df Residuals:                    1654   BIC:                         1.234e+04
Df Model:                          37                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        181.0979     27.459      6.595      0.0

In [45]:
# Ridge
from sklearn.linear_model import RidgeCV

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 20 candidate penalties, log-spaced between 1e-3 and 1e3; 5-fold CV picks one.
alpha_grid = np.logspace(-3, 3, 20)
ridge = RidgeCV(alphas=alpha_grid, cv=5)
# NOTE(review): the predictors are passed in unscaled here, whereas the Lasso
# cell standardises them first -- confirm this difference is intended.
ridge.fit(X_train, y_train)

# Training-split fit quality.
y_train_pred = ridge.predict(X_train)
r2_train = r2_score(y_train, y_train_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

# Validation-split fit quality.
y_val_pred = ridge.predict(X_val)
r2_val = r2_score(y_val, y_val_pred)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))

print("Ridge Regression Results:")
print("Best Alpha:", ridge.alpha_)
for label, value in (
    ("Training R²:", r2_train),
    ("Training RMSE:", rmse_train),
    ("Validation R²:", r2_val),
    ("Validation RMSE:", rmse_val),
):
    print(label, round(value, 4))

Ridge Regression Results:
Best Alpha: 0.0379269019073225
Training R²: 0.8164
Training RMSE: 7.7795
Validation R²: 0.7827
Validation RMSE: 8.6426


In [42]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree regressor (depth capped at 5) on all features.
#
# The split is built in this cell instead of reusing whatever X_train / X_val
# the most recently executed cell left in the kernel. Before, the feature set
# the tree was trained on depended on execution order. Run top to bottom, the
# preceding cell splits X (all features) with these same settings, so this
# reproduces that split explicitly.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

tree = DecisionTreeRegressor(max_depth=5, random_state=42)
tree.fit(X_train, y_train)

# Predict
y_train_pred = tree.predict(X_train)
y_val_pred = tree.predict(X_val)

# Evaluate
r2_train = r2_score(y_train, y_train_pred)
r2_val = r2_score(y_val, y_val_pred)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))

print("Training R²:", round(r2_train, 4))
print("Training RMSE:", round(rmse_train, 4))
print("Validation R²:", round(r2_val, 4))
print("Validation RMSE:", round(rmse_val, 4))

Training R²: 0.782
Training RMSE: 8.477
Validation R²: 0.7041
Validation RMSE: 10.0856
