# Step 1: Import Libraries and Load the Dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score


# Load the diabetes dataset as pandas objects.
# `load_diabetes` (imported above) is itself the loader function, so it is
# called directly. `as_frame=True` returns pandas objects and `scaled=False`
# asks for the features in their original, unstandardised units.
diabetes = load_diabetes(as_frame=True, scaled=False)

x = diabetes.data    # feature table
y = diabetes.target  # target values (disease progression)

# Preview the first rows of the features and of the target
print(x.head())
print(y.head())

#optional
# print(diabetes.DESCR)


# Step 2: Frame the Problem

Predict **diabetes progression one year after baseline**

- **Target variable:** disease progression (continuous)
- **Features:** 10 physiological measurements (age, sex, BMI, blood pressure, blood serum metrics)
- **Problem type:** Regression
- **Evaluation metrics:** R², MAE, MAPE

**TODO:** review this section in more detail.



In [None]:
# Report how many rows and columns the features and the target contain
feature_shape = x.shape
target_shape = y.shape

print("Feature matrix shape:", feature_shape)
print("Target vector shape:", target_shape)


# Step 3: Exploratory Data Analysis

Objectives:
1. Summary statistics
2. Feature distributions
3. Feature-target relationships
4. Correlation matrix
5. Insights for modeling

In [1]:
# Summary statistics

print(x.describe())
print("\nTarget variable statistics:")
print(y.describe())

# Histograms: one panel per feature
x.hist(bins=15, figsize=(15, 10))
plt.suptitle("Feature Distributions")
plt.show()



# Scatter plots of features vs target (one figure per feature)

for col in x.columns:
    plt.figure()
    plt.scatter(x[col], y)
    plt.title(f"{col} vs Disease Progression")
    plt.xlabel(col)
    plt.ylabel("Disease Progression")
    plt.show()


# Correlation matrix

# NOTE: despite its name, `corr_matrix` holds the feature data plus a
# "target" column; the correlations themselves are computed by `.corr()`
# inside the heatmap call. A copy is used so `x` is not given a target column.
corr_matrix = x.copy()
corr_matrix["target"] = y
plt.figure(figsize=(12,8))
sns.heatmap(corr_matrix.corr(), annot=True, cmap="coolwarm", fmt=".2f")
# Title and show the figure so the cell renders the plot, not the Axes repr
plt.title("Correlation Matrix (features and target)")
plt.show()

NameError: name 'x' is not defined

# Step 4: Data Cleaning

Checks:
- Missing values
- Outliers
- Feature scaling

Rationale: Ensure reliable modeling and reproducibility

In [None]:
# Check for missing values.
# `isna()` flags every missing entry; summing the flags gives the number of
# missing values per feature column and for the target.
print("Missing values in features:\n", x.isna().sum())
print("Missing values in target:\n", y.isna().sum())


# Boxplots to inspect outliers (one figure per feature)

for col in x.columns:
    plt.figure()
    sns.boxplot(x=x[col])
    plt.title(f"Boxplot of {col}")
    plt.show()

# Step 5: Split dataset

- Training: 75%
- Validation: 10%
- Test: 15%

`train_test_split` is used twice to obtain these proportions, with `random_state=42` for reproducibility.

In [None]:
# TODO: try to find a way without random_state=42

# Stage 1: hold out 25% of the rows; the remaining 75% is the training set.
x_train, x_temp, y_train, y_temp = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

# Stage 2: divide the held-out 25% into validation (10% of all rows) and
# test (15% of all rows). `val_size` is the validation share of the hold-out;
# the rest of the hold-out goes to the test set.
val_size = 0.10/0.25
x_val, x_test, y_val, y_test = train_test_split(
    x_temp,
    y_temp,
    test_size=1-val_size,
    random_state=42,
)

# Report the shape of every split
for split_label, part_x, part_y in (
    ("Training set:", x_train, y_train),
    ("Validation set:", x_val, y_val),
    ("Test set:", x_test, y_test),
):
    print(split_label, part_x.shape, part_y.shape)


# Part 2: Univariate Polynomial Regression

# Step 6: Univariate Polynomial Regression on BMI

Use BMI to predict diabetes progression

Polynomial degrees 0–5 (six models).
Fit each model and store the results for comparison.

In [None]:
# Extract the BMI feature. Double brackets keep each selection as a
# one-column table, the 2-D input the transformer and model are given.
x_train_bmi = x_train[["bmi"]]
x_val_bmi = x_val[["bmi"]]
x_test_bmi = x_test[["bmi"]]


def _mape(y_true, y_pred):
    """Return the mean absolute percentage error of the predictions, in percent."""
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


# Store models and results
models = {}    # degree -> (fitted LinearRegression, fitted PolynomialFeatures)
results = []   # one dict of train/validation metrics per degree


for degree in range(6):
    poly = PolynomialFeatures(degree=degree, include_bias=True)
    # Fit the feature expansion on the training data only ...
    x_train_poly = poly.fit_transform(x_train_bmi)
    # ... and reuse that fitted transformer for validation; held-out data
    # must never be used to (re)fit a preprocessing step.
    x_val_poly = poly.transform(x_val_bmi)

    model = LinearRegression()
    model.fit(x_train_poly, y_train)

    # Keep the transformer with its model so later cells can apply the
    # matching feature expansion before predicting.
    models[degree] = (model, poly)

    y_train_pred = model.predict(x_train_poly)
    y_val_pred = model.predict(x_val_poly)

    results.append({
        "Degree": degree,
        "Train R2": r2_score(y_train, y_train_pred),
        "Validation R2": r2_score(y_val, y_val_pred),
        "Train MAE": mean_absolute_error(y_train, y_train_pred),
        "Validation MAE": mean_absolute_error(y_val, y_val_pred),
        "Train MAPE": _mape(y_train, y_train_pred),
        "Validation MAPE": _mape(y_val, y_val_pred),
    })

# Step 7: Compare Polynomial Models

Summary of performance (R², MAE, MAPE) for each polynomial degree.

In [4]:
# Tabulate the per-degree metrics with the columns in a fixed reporting order
metric_columns = [
    "Degree",
    "Train R2",
    "Validation R2",
    "Train MAE",
    "Validation MAE",
    "Train MAPE",
    "Validation MAPE",
]
results_df = pd.DataFrame(results)[metric_columns]
print(results_df)

NameError: name 'pd' is not defined

# Step 8: Identify the Best Model

Choose the model with the highest validation R² while keeping the errors reasonable.

In [None]:
# Locate the row with the highest validation R2 and read off its degree
best_row = results_df["Validation R2"].idxmax()
best_degree = results_df.loc[best_row, "Degree"]

# Fetch the fitted model and the feature transformer stored for that degree
best_model, best_poly = models[best_degree]

print(f"Best polynomial degree: {best_degree}")



# Step 9: Evaluate on Test set

In [None]:
# Expand the test BMI values with the selected transformer, then predict
x_test_poly = best_poly.transform(x_test_bmi)
y_test_pred = best_model.predict(x_test_poly)

# Test-set metrics for the selected model
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

# MAPE: mean of the absolute relative errors, expressed as a percentage
relative_error = (y_test - y_test_pred) / y_test
test_mape = np.mean(np.abs(relative_error)) * 100

test_summary = f"Test R2: {test_r2:.3f}, Test MAE: {test_mae:.3f}, Test MAPE: {test_mape:.2f}%"
print(test_summary)

# Step 10: Plot Model

Train, validation, and test points with the fitted curve.

In [None]:
# Evenly spaced BMI grid covering the whole dataset, shaped as a single
# column (100 rows x 1 feature) because the transformer needs 2-D input.
bmi_range = np.linspace(x["bmi"].min(), x["bmi"].max(), 100).reshape(-1, 1)

# Label the grid column "bmi" for the transform so it matches the
# one-column "bmi" table the transformer was fitted on.
bmi_range_poly = best_poly.transform(pd.DataFrame(bmi_range, columns=["bmi"]))
pred_curve = best_model.predict(bmi_range_poly)

# Overlay the three data splits and the fitted curve
plt.figure(figsize=(10,6))
plt.scatter(x_train_bmi, y_train, color="blue", label="Train")
plt.scatter(x_val_bmi, y_val, color='green', label='Validation')
plt.scatter(x_test_bmi, y_test, color='red', label='Test')
plt.plot(bmi_range, pred_curve, color='black', linewidth=2, label=f'Polynomial Degree {best_degree} Fit')
plt.xlabel("BMI")
plt.ylabel("Disease Progression")
plt.title("Best Univariate Polynomial Regression Fit")
plt.legend()
plt.show()

# Step 11: Equation of Best Model

In [None]:
# Read the fitted parameters of the selected model
intercept = best_model.intercept_
coefs = best_model.coef_

# The constant term comes from the intercept; coefficient index 0 is skipped
# and only powers 1 and above are written out as terms.
power_terms = [f"{coef:.2f}*x^{power}" for power, coef in enumerate(coefs) if power >= 1]
terms = [f"{intercept:.2f}", *power_terms]

equation = " + ".join(terms)
print(f"Equation: y = {equation}")
#highlight

# Step 12: Predict Diabetes Progression for a BMI value



In [None]:
# BMI value to predict for, as a 2-D array (1 sample x 1 feature):
# the transformer rejects 1-D input.
chosen_bmi = np.array([[30]])

# Label the column "bmi" for the transform so it matches the one-column
# "bmi" table the transformer was fitted on.
chosen_bmi_poly = best_poly.transform(pd.DataFrame(chosen_bmi, columns=["bmi"]))
predicted_progression = best_model.predict(chosen_bmi_poly)

# Report the value actually used rather than a hard-coded number
print(f"Predicted progression for BMI={chosen_bmi[0, 0]}: {predicted_progression[0]:.2f}")

# Step 13: Trainable parameters

Number of trainable coefficients equals number of polynomial features

In [None]:
# Names of the polynomial terms generated by the selected transformer
# (the method is `get_feature_names_out`, singular "feature").
feature_names = best_poly.get_feature_names_out()
print("Polynomial features:", feature_names)
print(f"Number of trainable parameters: {len(feature_names)}")

#highlight

# Step 14: Conclusion

- Best degree: the value of `best_degree` printed in Step 8, with metrics summarized above.
- Limitations: univariate model ignores interactions with other features.
- Errors may be large for extreme BMI values.
- Takeaways: compare train/validation performance, consider multivariate models for better predictions.
#HIghliht do

# Step 15: Multivariate Polynomial Regression

Use features to build polynomial regression models (degrees 2 and 3).
Evaluate R², MAE, and MAPE on train, validation, and test sets.