# Diabaties Prediction Model - Using DTC

In [1]:
from sklearn.datasets import load_diabetes

In [2]:
dataset = load_diabetes()

In [4]:
print(dataset["DESCR"])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

Note: Each of these 10 feature variables have bee

In [5]:
dataset #it has 3 key value pair one is data , target

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [10]:
import pandas as pd
df_diabetes = pd.DataFrame(dataset.data, columns=['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'])
df_diabetes.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [11]:
X= df_diabetes
y = dataset['target']

In [12]:
y

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.25, random_state=34)

In [15]:
X_train.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
142,0.041708,0.05068,0.012117,0.039087,0.054845,0.044406,0.00446,-0.002592,0.045604,-0.001078
289,-0.074533,0.05068,0.055229,-0.040099,0.053469,0.053174,-0.043401,0.07121,0.061238,-0.034215
200,0.056239,-0.044642,-0.057941,-0.007977,0.052093,0.049103,0.056003,-0.021412,-0.028323,0.044485
345,0.081666,0.05068,-0.002973,-0.033213,0.042462,0.057871,-0.010266,0.034309,-0.000612,-0.001078
269,0.009016,-0.044642,-0.032073,-0.026328,0.042462,-0.010395,0.159089,-0.076395,-0.011897,-0.038357


In [16]:
X_test.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
364,0.001751,0.05068,-0.006206,-0.019442,-0.009825,0.004949,-0.039719,0.034309,0.014821,0.098333
84,0.001751,-0.044642,-0.039618,-0.100934,-0.029088,-0.030124,0.044958,-0.050195,-0.068332,-0.129483
236,0.027178,-0.044642,0.006728,0.035644,0.079612,0.07071,0.015505,0.034309,0.040673,0.011349
288,0.070769,0.05068,-0.016984,0.021872,0.043837,0.056305,0.037595,-0.002592,-0.070209,-0.017646
231,0.009016,-0.044642,-0.030996,0.021872,0.008063,0.008707,0.00446,-0.002592,0.009434,0.011349


In [17]:
X_train.corr()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
age,1.0,0.139167,0.183018,0.357224,0.27889,0.196022,-0.007456,0.168069,0.293508,0.284105
sex,0.139167,1.0,0.112424,0.232966,0.031828,0.133613,-0.376716,0.320404,0.154718,0.156909
bmi,0.183018,0.112424,1.0,0.371625,0.275877,0.295822,-0.406494,0.455989,0.465193,0.398641
bp,0.357224,0.232966,0.371625,1.0,0.276923,0.212757,-0.166399,0.25403,0.392423,0.386502
s1,0.27889,0.031828,0.275877,0.276923,1.0,0.890796,0.014899,0.547175,0.525773,0.320689
s2,0.196022,0.133613,0.295822,0.212757,0.890796,1.0,-0.227625,0.65488,0.324159,0.25244
s3,-0.007456,-0.376716,-0.406494,-0.166399,0.014899,-0.227625,1.0,-0.755785,-0.434896,-0.249625
s4,0.168069,0.320404,0.455989,0.25403,0.547175,0.65488,-0.755785,1.0,0.643646,0.382692
s5,0.293508,0.154718,0.465193,0.392423,0.525773,0.324159,-0.434896,0.643646,1.0,0.482637
s6,0.284105,0.156909,0.398641,0.386502,0.320689,0.25244,-0.249625,0.382692,0.482637,1.0


In [19]:
import matplotlib.pyplot as plt
%matplotlib inline

In [20]:
import seaborn as sns
plt.figure(figsize=(15,10))
sns.heatmap(X_train.corr(), annot=True)

<Axes: >

In [21]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)

## Hyperparameter Tuning

In [22]:
params = {
    "criterion" : ["squared_error","friedman_mse", "absolute_error"], 
    "splitter":["best","random"], 
    "max_depth":[1,2,3,4,5, 10, 15, 20, 25],
    "max_features":["auto", "sqrt", "log2"] 
}

In [None]:
regressor = DecisionTreeRegressor()

In [23]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(regressor, param_grid=params, cv = 5, scoring='neg_mean_squared_error')


In [24]:
import warnings
warnings.filterwarnings('ignore')
grid.fit(X_train, y_train)


In [25]:
grid.best_params_

{'criterion': 'squared_error',
 'max_depth': 2,
 'max_features': 'log2',
 'splitter': 'best'}

In [26]:
grid.best_score_

-3990.5328588287375

In [27]:
y_pred = grid.predict(X_test)

In [28]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [29]:
print(r2_score(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))

0.15535023474610188
52.73536565539996
4333.309275155425


In [30]:
selectedModel = DecisionTreeRegressor(criterion='squared_error', max_depth=2, max_features='log2', splitter='best')

In [31]:
selectedModel.fit(X_train, y_train)

In [32]:
from sklearn import tree
plt.figure(figsize=(15,10))
tree.plot_tree(selectedModel, filled=True)

[Text(0.5, 0.8333333333333334, 'x[8] <= -0.008\nsquared_error = 6115.173\nsamples = 331\nvalue = 156.695'),
 Text(0.25, 0.5, 'x[2] <= 0.012\nsquared_error = 3163.007\nsamples = 148\nvalue = 106.784'),
 Text(0.375, 0.6666666666666667, 'True  '),
 Text(0.125, 0.16666666666666666, 'squared_error = 2139.722\nsamples = 122\nvalue = 94.82'),
 Text(0.375, 0.16666666666666666, 'squared_error = 4141.302\nsamples = 26\nvalue = 162.923'),
 Text(0.75, 0.5, 'x[2] <= 0.018\nsquared_error = 4858.69\nsamples = 183\nvalue = 197.06'),
 Text(0.625, 0.6666666666666667, '  False'),
 Text(0.625, 0.16666666666666666, 'squared_error = 3770.979\nsamples = 99\nvalue = 168.697'),
 Text(0.875, 0.16666666666666666, 'squared_error = 4075.083\nsamples = 84\nvalue = 230.488')]