# Step 1: Import Libraries and Load the Dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error


# Load the diabetes dataset as pandas objects (as_frame=True).
# `load_diabetes` is already the loader function imported from
# sklearn.datasets, so it is called directly rather than as an attribute
# of itself. scaled=False requests the raw (unscaled) feature values.
diabetes = load_diabetes(as_frame=True, scaled=False)

x = diabetes.data    # feature table
y = diabetes.target  # target values

# Preview the first rows of the features and of the target.
print(x.head())
print(y.head())

#optional
# print(diabetes.DESCR)


# Step 2: Frame the Problem

Predict **diabetes progression one year after baseline**

- **Target variable:** disease progression (continuous)
- **Features:** 10 physiological measurements (age, sex, BMI, blood pressure, blood serum metrics)
- **Problem type:** Regression
- **Evaluation metrics:** R², MAE, MAPE

**TODO:** review this section in more detail.



In [None]:
# Report the dimensions of the feature matrix and of the target vector.
for shape_label, dataset_part in (
    ("Feature matrix shape:", x),
    ("Target vector shape:", y),
):
    print(shape_label, dataset_part.shape)


# Step 3: Exploratory Data Analysis

Objectives:
1. Summary statistics
2. Feature distributions
3. Feature-target relationships
4. Correlation matrix
5. Insights for modeling

In [1]:
# Descriptive statistics: features first, then the target.
feature_summary = x.describe()
target_summary = y.describe()

print(feature_summary)
print("\nTarget variable statistics:")
print(target_summary)

# Histogram grid with one panel per feature column.
hist_options = {"bins": 15, "figsize": (15, 10)}
x.hist(**hist_options)
plt.suptitle("Feature Distributions")
plt.show()



# One scatter plot per feature, each plotted against the target on its own figure.
for col, feature_values in x.items():
    plt.figure()
    plt.scatter(feature_values, y)
    plt.title(f"{col} vs Disease Progression")
    plt.xlabel(col)
    plt.ylabel("Disease Progression")
    plt.show()


# Correlation matrix

# `corr_matrix` holds a copy of the features plus a "target" column; the
# pairwise correlations themselves are computed when the heatmap is drawn.
# Copying keeps the original feature table `x` free of the extra column.
corr_matrix = x.copy()
corr_matrix["target"] = y

plt.figure(figsize=(12,8))
sns.heatmap(corr_matrix.corr(), annot=True, cmap="coolwarm", fmt=".2f")
# Title and explicit show(), matching the other plots in this notebook and
# avoiding a bare Axes repr as the cell output.
plt.title("Correlation Matrix (features and target)")
plt.show()

NameError: name 'x' is not defined

# Step 4: Data Cleaning

Checks:
- Missing values
- Outliers
- Feature scaling

Rationale: Ensure reliable modeling and reproducibility

In [None]:
# Count missing values per feature column and in the target.
# The pandas method is spelled `isna()`.
print("Missing values in features:\n", x.isna().sum())
print("Missing values in target:\n", y.isna().sum())


# Draw a separate boxplot for every feature to eyeball potential outliers.
for col, feature_values in x.items():
    plt.figure()
    sns.boxplot(x=feature_values)
    plt.title(f"Boxplot of {col}")
    plt.show()

# Step 5: Split dataset

- Training: 75%
- Validation: 10%
- Test: 15%

`train_test_split` is applied twice to obtain these proportions, with `random_state=42` for reproducibility.

In [None]:
# Two-stage split: 75% training, 10% validation, 15% test.
# Author's note: try to find a way without random_state=42

# Stage 1: keep 75% for training and set aside the remaining 25% as a temporary pool.
x_train, x_temp, y_train, y_temp = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

# Stage 2: divide the temporary pool into validation (10% of all rows)
# and test (15% of all rows). `val_size` is validation's share of the pool.
val_size = 0.10/0.25
x_val, x_test, y_val, y_test = train_test_split(
    x_temp,
    y_temp,
    test_size=1-val_size,
    random_state=42,
)

# Confirm the resulting shapes of each split.
for split_label, split_features, split_target in (
    ("Training set:", x_train, y_train),
    ("Validation set:", x_val, y_val),
    ("Test set:", x_test, y_test),
):
    print(split_label, split_features.shape, split_target.shape)


# Part 2: Univariate Polynomial Regression

# Step 6: Univariate Polynomial Regression on BMI

Use BMI alone to predict diabetes progression.

- Polynomial degrees 0–5 (six models)
- Fit each model and store the results for comparison

In [None]:
# Restrict each split to the single "bmi" column. The double brackets keep
# every result as a one-column DataFrame rather than a Series.
x_train_bmi, x_val_bmi, x_test_bmi = (
    split[["bmi"]] for split in (x_train, x_val, x_test)
)

# Containers filled by the per-degree fitting loop:
#   models  - degree -> (fitted model, polynomial transformer)
#   results - one metrics record per degree
models = dict()
results = list()


# Fit one linear model per polynomial degree (0 through 5) using BMI only,
# and record training/validation metrics for later comparison.
for degree in range(6):
    poly = PolynomialFeatures(degree=degree, include_bias=True)

    # Fit the transformer on the training data only, then reuse that fitted
    # transformer for the validation data. Validation data must never be
    # passed through fit/fit_transform.
    x_train_poly = poly.fit_transform(x_train_bmi)
    x_val_poly = poly.transform(x_val_bmi)

    model = LinearRegression()
    model.fit(x_train_poly, y_train)

    # Store the transformer with its model so the same expansion can be
    # applied to new data before predicting.
    models[degree] = (model, poly)

    y_train_pred = model.predict(x_train_poly)
    y_val_pred = model.predict(x_val_poly)

    # MAPE is reported in percent. It divides by the true target values,
    # so it assumes none of them are zero.
    results.append({
        "Degree": degree,
        "Train R2": r2_score(y_train, y_train_pred),
        "Validation R2": r2_score(y_val, y_val_pred),
        "Train MAE": mean_absolute_error(y_train, y_train_pred),
        "Validation MAE": mean_absolute_error(y_val, y_val_pred),
        "Train MAPE": np.mean(np.abs((y_train - y_train_pred) / y_train)) *100,
        "Validation MAPE": np.mean(np.abs((y_val - y_val_pred) / y_val)) *100,
    })