In [3]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Load the scikit-learn diabetes dataset and unpack the feature matrix,
# the target vector, and the feature names from the returned bunch.
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target
feature_names = diabetes.feature_names

# Build a single DataFrame with one named column per feature and the target
# appended as the last column, so feature subsets can be picked by name.
df = pd.DataFrame(X, columns=feature_names).assign(target=y)


"""
The dataset's feature names are:
- age: Age of the patient
- sex: Gender of the patient
- bmi: Body mass index (already in use)
- bp: Average blood pressure
- s1: Total serum cholesterol (tc)
- s2: Low-density lipoproteins (ldl)
- s3: High-density lipoproteins (hdl)
- s4: Total cholesterol / HDL ratio (tch)
- s5: Possibly the log of serum triglycerides level (ltg; a blood test, already in use)
- s6: Blood sugar level

"""

# Baseline model: linear regression on the `bmi` and `s5` columns only.
X_bmi_s5 = df.loc[:, ['bmi', 's5']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_bmi_s5,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted estimator, so construction and fitting are chained.
model_bmi_s5 = LinearRegression().fit(X_train, y_train)
y_pred_bmi_s5 = model_bmi_s5.predict(X_test)

# Score the predictions on the held-out rows.
mse_bmi_s5 = mean_squared_error(y_true=y_test, y_pred=y_pred_bmi_s5)
r2_bmi_s5 = r2_score(y_true=y_test, y_pred=y_pred_bmi_s5)

"""
Step a: Which variable would you add next? Why?
After analyzing the variable descriptions, `bp` (blood pressure) appears to be a relevant addition because
blood pressure is often associated with diabetes complications and progression.
"""

# Second model: the baseline features plus `bp`.
X_bmi_s5_bp = df.loc[:, ['bmi', 's5', 'bp']]

# Same test fraction and seed as the baseline split.
X_train_bp, X_test_bp, y_train_bp, y_test_bp = train_test_split(
    X_bmi_s5_bp,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted estimator, so construction and fitting are chained.
model_bmi_s5_bp = LinearRegression().fit(X_train_bp, y_train_bp)
y_pred_bmi_s5_bp = model_bmi_s5_bp.predict(X_test_bp)

# Score the predictions on the held-out rows.
mse_bmi_s5_bp = mean_squared_error(y_true=y_test_bp, y_pred=y_pred_bmi_s5_bp)
r2_bmi_s5_bp = r2_score(y_true=y_test_bp, y_pred=y_pred_bmi_s5_bp)

"""
Step b: How does adding `bp` affect the model's performance?
Adding `bp` improves the test-set metrics only slightly: R-squared rises from about 0.452 to 0.454 and the mean squared error (MSE) falls from about 2901.8 to 2891.0, as the comparison table below confirms.
"""

# Extended model: `bmi`, `s5`, and `bp` plus `age` and `s1`.
X_extended = df.loc[:, ['bmi', 's5', 'bp', 'age', 's1']]

# Same test fraction and seed as the earlier splits.
X_train_extended, X_test_extended, y_train_extended, y_test_extended = train_test_split(
    X_extended,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted estimator, so construction and fitting are chained.
model_extended = LinearRegression().fit(X_train_extended, y_train_extended)
y_pred_extended = model_extended.predict(X_test_extended)

# Score the predictions on the held-out rows.
mse_extended = mean_squared_error(y_true=y_test_extended, y_pred=y_pred_extended)
r2_extended = r2_score(y_true=y_test_extended, y_pred=y_pred_extended)

"""
Step d: Does it help if you add even more variables?
Not necessarily. Adding `bp` gave a small improvement, but also adding `age` and `s1` made the test-set metrics slightly worse
(MSE about 2941.9 vs. 2891.0; R-squared about 0.445 vs. 0.454). Beyond a certain point, adding more variables brings only marginal
or even negative changes in held-out performance, which may not be worth the increased complexity.
"""

# One (label, MSE, R-squared) row per fitted model, in the order they were built.
comparison_rows = [
    ("bmi + s5", mse_bmi_s5, r2_bmi_s5),
    ("bmi + s5 + bp", mse_bmi_s5_bp, r2_bmi_s5_bp),
    ("bmi + s5 + bp + age + s1", mse_extended, r2_extended),
]

# Pivot the rows into the column-oriented mapping used to build the table.
results = {
    "Model": [label for label, _, _ in comparison_rows],
    "MSE": [mse for _, mse, _ in comparison_rows],
    "R-squared": [r2 for _, _, r2 in comparison_rows],
}

# Last expression of the cell: rendered as the comparison table.
pd.DataFrame(results)

Unnamed: 0,Model,MSE,R-squared
0,bmi + s5,2901.836942,0.452293
1,bmi + s5 + bp,2891.037211,0.454331
2,bmi + s5 + bp + age + s1,2941.868157,0.444737
