# Predict Diabetes Progression

# 00 Import module, load data

In [1]:
from sklearn.datasets import load_diabetes
import sklearn.metrics as metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 01 Data Preprocessing, train-test split

In [2]:
diabetes = load_diabetes()
features = diabetes['data']
feature_names = diabetes['feature_names']
label = diabetes['target']

In [3]:
df = pd.DataFrame(features, columns=feature_names)
df['target'] = label

In [4]:
df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930,220.0


In [5]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [6]:
X

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286131, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452873, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00422151,  0.00306441]])

In [7]:
y

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)



# 02 kNN

In [14]:
from sklearn.neighbors import KNeighborsRegressor
#1. model instance 만들기

#2. 만든 instance에 training data 넣어서 fit 하기

#3. model instance에 X_test 넣어서 y값 prediction하기

#3. metric 구하기
neigh = KNeighborsRegressor()
neigh.fit(X_train, y_train)
neigh_pred = neigh.predict(X_test)



In [15]:
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
param_grid = {
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size' : [1, 5, 10, 20, 30],
    'metric' : [2, 5, 10, 20, 30, 'minkowski'],
    'metric_params' : [None],
    'n_jobs' : [1, 2, 5, 10, 20, 30],
    'n_neighbors' : [1, 2, 5, 10],
    'p' : [1, 2, 5],
    'weights' : ['uniform', 'distance']
}

rf2 = KNeighborsRegressor()

kfold2 = KFold(n_splits=10)

grid_search = GridSearchCV(estimator = rf2,
                           param_grid = param_grid,
                           cv = kfold2)

grid_search.fit(features, label)

144000 fits failed out of a total of 172800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
28800 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/neighbors/_regression.py", line 215, in fit
    self._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/

In [16]:
grid_search.best_params_

{'algorithm': 'auto',
 'leaf_size': 1,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': 1,
 'n_neighbors': 10,
 'p': 5,
 'weights': 'uniform'}

In [17]:
neigh_best = KNeighborsRegressor(
    algorithm = 'auto',
     leaf_size= 1,
     metric= 'minkowski',
     metric_params= None,
     n_jobs= 1,
     n_neighbors= 10,
     p= 5,
     weights= 'uniform'
)

In [18]:

neigh_best.fit(X_train, y_train)
neigh_best_pred = neigh_best.predict(X_test)

# 02-1 kNN evaluation

In [19]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse_Knn = mean_squared_error(y_test, neigh_pred)
mae_Knn = mean_absolute_error(y_test, neigh_pred)
r2_Knn = r2_score(y_test, neigh_pred)

print("Mean Squared Error: ", mse_Knn)
print("Mean Absolute Error: ", mae_Knn)
print("R^2 Score: ", r2_Knn)

Mean Squared Error:  4676.864719101123
Mean Absolute Error:  53.52808988764045
R^2 Score:  0.13193209463754896


In [20]:
mse_Knn = mean_squared_error(y_test, neigh_best_pred)
mae_Knn = mean_absolute_error(y_test, neigh_best_pred)
r2_Knn = r2_score(y_test, neigh_best_pred)

print("Mean Squared Error: ", mse_Knn)
print("Mean Absolute Error: ", mae_Knn)
print("R^2 Score: ", r2_Knn)

Mean Squared Error:  3984.1984269662917
Mean Absolute Error:  50.24044943820224
R^2 Score:  0.26049715123901995


# 03 decision Tree

In [21]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import graphviz
import os
os.environ['PATH'] += os.pathsep+'/opt/homebrew/bin'

In [22]:
tree_model = DecisionTreeRegressor()

In [23]:
tree_model.fit(X_train, y_train)

In [24]:
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

In [25]:
feature_names = df.columns[:-1]

dot_data = export_graphviz(
    tree_model, 
    out_file=None, 
    feature_names=feature_names,
    rounded=True, 
    filled=True 
)

In [26]:
graph = graphviz.Source(dot_data)

# 03-1 decision tree evaluation

In [27]:
mae_tree = metrics.mean_absolute_error(y_test, tree_pred)
mse_tree = metrics.mean_squared_error(y_test, tree_pred)
r2_tree = metrics.r2_score(y_test, tree_pred)

print("Mean Squared Error: ", mae_tree)
print("Mean Absolute Error: ", mse_tree)
print("R^2 Score: ", r2_tree)

Mean Squared Error:  59.58426966292135
Mean Absolute Error:  5749.29213483146
R^2 Score:  -0.06712002175662035


# 04 Random Forest

In [28]:
from sklearn.ensemble import RandomForestRegressor

In [29]:
rf = RandomForestRegressor()

In [30]:
rf

In [31]:
rf.fit(X_train, y_train)

In [32]:
Rand_pred = rf.predict(X_test)

In [33]:
Rand_pred

array([155.72, 102.57, 233.04, 249.86, 113.25,  81.03, 200.39, 210.42,
        77.39, 113.97, 224.1 , 139.04, 121.49,  69.93,  94.26, 157.58,
       144.23, 127.72,  92.68, 214.52, 152.26, 197.  , 156.45, 225.05,
        96.5 , 205.86, 251.91, 185.48, 142.71,  82.52,  84.74, 112.88,
       189.8 , 133.73, 246.93, 193.38, 137.28, 109.73, 235.8 , 143.68,
        70.91, 123.52, 173.02, 130.99, 262.97, 146.16, 254.54, 105.17,
       116.19, 287.54, 135.85,  82.24, 137.45, 279.17, 156.71, 163.75,
       103.38, 108.52,  79.53, 167.64, 184.39,  84.64, 225.84, 151.38,
       209.73, 160.54, 130.36,  87.17, 174.13, 201.48,  90.17,  90.37,
       160.58, 162.2 ,  90.03, 252.81, 166.41, 117.81, 164.47, 110.8 ,
       110.35, 130.47,  85.89, 127.97, 195.36, 176.45, 101.95, 125.08,
       169.86])

# 04-1 Random Forest evaluation

In [34]:
mae_Rand = metrics.mean_absolute_error(y_test, Rand_pred)
mse_Rand = metrics.mean_squared_error(y_test, Rand_pred)
r2_Rand = metrics.r2_score(y_test, Rand_pred)

print("Mean Squared Error: ", mae_Rand)
print("Mean Absolute Error: ", mse_Rand)
print("R^2 Score: ", r2_Rand)

Mean Squared Error:  51.26898876404494
Mean Absolute Error:  4126.752444943821
R^2 Score:  0.23403785099855068


# 05 XGboost

In [35]:
from xgboost import XGBRegressor

In [36]:
xgb = XGBRegressor()

In [37]:
xgb

In [38]:
xgb.fit(X_train, y_train)

In [39]:
xgb_pred = xgb.predict(X_test)

In [40]:
xgb_pred

array([122.83002 ,  82.674995, 242.28786 , 271.75015 , 126.63893 ,
       100.02073 , 163.32166 , 260.01373 , 101.21188 , 151.44261 ,
       239.27092 ,  75.011856, 143.67567 ,  76.49287 ,  99.78436 ,
       169.09175 , 106.973495, 105.400696,  90.5682  , 196.25833 ,
       195.53218 , 162.17245 , 128.32101 , 230.0978  ,  94.28591 ,
       212.37843 , 249.30421 , 168.77583 , 160.22797 ,  87.25231 ,
        80.47367 ,  88.677734, 153.93669 ,  80.3855  , 275.59525 ,
       228.97536 , 135.28322 , 111.26246 , 282.915   , 174.19736 ,
        67.699715,  74.86253 , 177.56786 , 143.23547 , 192.2294  ,
       116.94037 , 252.24503 ,  88.263115, 124.43905 , 283.75757 ,
        69.79629 ,  90.432304, 138.59074 , 299.88196 , 169.61053 ,
       168.48898 , 100.28019 ,  93.98604 ,  72.45916 , 168.38422 ,
       231.59056 ,  60.374245, 183.46468 , 179.9245  , 223.7297  ,
       129.97034 , 132.74196 ,  88.813324, 195.64511 , 212.59889 ,
        65.32688 ,  88.349724, 140.91386 , 182.98164 , 102.718

# 05-1 XGB evaluation

In [41]:
mae_xgb = metrics.mean_absolute_error(y_test, xgb_pred)
mse_xgb = metrics.mean_squared_error(y_test, xgb_pred)
r2_xgb = metrics.r2_score(y_test, xgb_pred)

print("Mean Squared Error: ", mae_xgb)
print("Mean Absolute Error: ", mse_xgb)
print("R^2 Score: ", r2_xgb)

Mean Squared Error:  55.83969467677427
Mean Absolute Error:  4891.9757563575
R^2 Score:  0.0920055629230766


# 06 Model Comparison

# Week 11 HW

이번 수업 숙제는 HW10에서 hyperparameter tuning까지 하는 거예요!
수업 시간에 알려준 대로 각 모델들의 공식 문서들을 보면서 (구글에 sklearn+모델 이름 치면 나와요) 모델 안에 들어갈 hyperparameter들
뭔지 찾아보고 hyperparameter tuning로 best hyperparameter 찾아서 모델 만들고,
hyperparameter tuning 안 했을 때의 regression metric이랑 비교해볼게요

In [42]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters and their values
knn_params = {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']}

# Initialize the K-Nearest Neighbors model
knn = KNeighborsRegressor()

# Set up GridSearchCV with the KNN model and the parameters
grid_search_knn = GridSearchCV(knn, knn_params, cv=5)

# Fit the model with the training data
grid_search_knn.fit(X_train, y_train)

# Print the best parameters and the best score
print(grid_search_knn.best_params_)
print(grid_search_knn.best_score_)

{'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.5003068819363339
