In [31]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_absolute_error, mean_squared_error ,mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import get_scorer_names
from tabulate import tabulate


Utilize the diabetes dataset from lab 4.

In [32]:
# Load the diabetes dataset
X, y = datasets.load_diabetes(as_frame=True, scaled=False, return_X_y=True)

Perform cross-validation on nine polynomial models, ranging from degree 0 to 8

In [33]:
d = np.arange(0, 9)
cvd = []
cv_results = []
for degree in range(9):
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())    

    scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    mean_r2 = scores.mean()
    std_r2 = scores.std()
    mae = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')
    mean_mae = mae.mean()
    std_mae = mae.std()
    cvd.append((degree, mean_r2, std_r2, mean_mae, std_mae))
    print(cvd)

[(0, -0.02750604135376733, 0.03677220036298531, 66.04562360767684, 3.474660324845233)]
[(0, -0.02750604135376733, 0.03677220036298531, 66.04562360767684, 3.474660324845233), (1, 0.4823164359086422, 0.049268577511903694, 44.276499233214956, 2.1001103035211095)]
[(0, -0.02750604135376733, 0.03677220036298531, 66.04562360767684, 3.474660324845233), (1, 0.4823164359086422, 0.049268577511903694, 44.276499233214956, 2.1001103035211095), (2, -0.0978150645318944, 0.4232014428635416, 58.6074164272149, 11.75494973474989)]
[(0, -0.02750604135376733, 0.03677220036298531, 66.04562360767684, 3.474660324845233), (1, 0.4823164359086422, 0.049268577511903694, 44.276499233214956, 2.1001103035211095), (2, -0.0978150645318944, 0.4232014428635416, 58.6074164272149, 11.75494973474989), (3, -199.79691539603033, 226.9769925116093, 330.4327451894503, 147.82067659449095)]
[(0, -0.02750604135376733, 0.03677220036298531, 66.04562360767684, 3.474660324845233), (1, 0.4823164359086422, 0.049268577511903694, 44.27649

Construct a table summarizing the cross-validation results. Each model should have a separate row in the table. Include the R-Squared, Mean Absolute Error (MAE) and MAPE metrics for each model. Calculate the mean value and standard deviation of these metrics from the cross-validation. Include both values. (2 points)

In [44]:
final = pd.DataFrame(cvd)
final.columns = ['degree', 'mean_R_squared', 'std_R_squared', 'mean_MAE', 'std_MAE']
print(tabulate(final, headers=final.columns,tablefmt='psql'))

+----+----------+------------------+-----------------+------------+-----------+
|    |   degree |   mean_R_squared |   std_R_squared |   mean_MAE |   std_MAE |
|----+----------+------------------+-----------------+------------+-----------|
|  0 |        0 |       -0.027506  |       0.0367722 |    66.0456 |   3.47466 |
|  1 |        1 |        0.482316  |       0.0492686 |    44.2765 |   2.10011 |
|  2 |        2 |       -0.0978151 |       0.423201  |    58.6074 |  11.7549  |
|  3 |        3 |     -199.797     |     226.977     |   330.433  | 147.821   |
|  4 |        4 |     -571.083     |     369.892     |   657.26   | 159.476   |
|  5 |        5 |     -436.857     |     379.1       |   562.994  |  59.9172  |
|  6 |        6 |    -1695.48      |    2632.42      |   742.561  | 191.043   |
|  7 |        7 |    -5530.89      |    9518.59      |  1032.68   | 393.44    |
|  8 |        8 |   -16076.3       |   28050         |  1475.66   | 706.28    |
+----+----------+------------------+----

Identification of the Best Model: Identify the model that exhibits the highest performance based on the R-Squared and MAE metrics. Provide an explanation for choosing this specific model. (2 points)

In [47]:
r2_index = np.argmax(final.iloc[:, 1])
mae_index = np.argmin(final.iloc[:, 3])

r2_degree,r2, std_r2 = final.iloc[r2_index, :3]
mae_degree, mae, std_mae = final.iloc[mae_index, 0], final.iloc[mae_index, 3], final.iloc[mae_index, 4]

print("R_squred based best model:")
print(f"Degree: {r2_degree}, R-Squared: {r2:.4f}, Std R-Squared: {std_r2:.4f}")

print("MAE based best model")
print(f"Degree: {mae_degree}, MAE: {mae:.4f}, Std MAE: {std_mae:.4f}")

R_squred based best model:
Degree: 1.0, R-Squared: 0.4823, Std R-Squared: 0.0493
MAE based best model
Degree: 1, MAE: 44.2765, Std MAE: 2.1001
