# Diabetes prediction ML Model

## Importing important Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets

### Loading the data from sklearn dataset

In [2]:
# Load the diabetes dataset through sklearn.datasets. The returned object's
# .data, .feature_names and .target attributes are read when the DataFrame is built.
Data = datasets.load_diabetes()

### Converting the dataset into a pandas DataFrame

In [3]:
## Data.data holds the feature values and Data.feature_names the column names.
## Data.target holds the labels, which are appended as the 'target' column.
Diabetes = pd.DataFrame(Data.data, columns=Data.feature_names).assign(target=Data.target)

### Generating the data information

In [4]:
#Diabetes.info()
#Diabetes.shape
#Diabetes.describe()
## You can see there are no null values in our data;
## nothing is missing, hence our data is fine to start with

### Plotting the histogram

In [5]:
#Diabetes.hist(bins = 50, figsize = (10, 15))

## Splitting the data into training and testing sets
### Again sklearn comes to the rescue

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state is fixed at 42
# so the same split is produced on every run.
Train_set, Test_set = train_test_split(
    Diabetes,
    test_size=0.2,
    random_state=42,
)
print(f"Rows in train set : {len(Train_set)}\nRows in test set: {len(Test_set)}\n")

Rows in train set : 353
Rows in test set: 89



### Look for correlations

In [7]:
# Pairwise correlation matrix over every column of Diabetes, including 'target'.
# Note: this is computed on the full frame, not only on Train_set.
Correlation = Diabetes.corr()
# Uncomment to rank the columns by their correlation with the target:
#Correlation['target'].sort_values(ascending = False)

In [8]:
from pandas.plotting import scatter_matrix
# Columns to plot against each other: the target plus three other columns.
Var = ['target', 'bmi', 's5', 'bp']
# Uncomment to draw the pairwise scatter plots for the columns in Var:
#scatter_matrix(Diabetes[Var], figsize = (12, 8))

#### As per the scatter matrix, we can see that bmi is the most correlated feature: as bmi increases, diabetes risk increases

In [9]:
# Separate the training rows into the label column and the remaining feature columns.
Training_labels = Train_set['target'].copy()
Training = Train_set.drop(columns='target')

## The data is already normalised, hence we need not normalise it again

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
#from lightgbm import LGBMRegressor
#from xgboost.sklearn import XGBRegressor
#from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

In [11]:
# Alternative regressors; uncomment exactly one assignment to try a different model.
#model = DecisionTreeRegressor()
#model = LinearRegression()
#model = RandomForestRegressor()
#model = GradientBoostingRegressor()
model = BayesianRidge()
# Fit on the training features and labels. The call is the last expression in
# the cell, so the fitted estimator is shown as the cell output.
model.fit(Training, Training_labels)

BayesianRidge()

In [12]:
# Spot check: predict for the first five training rows.
some_data = Training.head(5)
some_labels = Training_labels.head(5)
model.predict(some_data)

array([183.6250789 , 147.52246328, 232.3683276 , 103.12751859,
        66.26325879])

In [13]:
# Actual target values for the same five rows, shown as a plain list.
list(some_labels)

[144.0, 150.0, 280.0, 125.0, 59.0]

## Evaluating model

In [14]:
from sklearn.metrics import mean_squared_error
# Predictions are made on the same rows the model was fitted on, so the value
# below is a training error, not a measure of performance on unseen data.
predictions = model.predict(Training)
error = mean_squared_error(Training_labels, predictions)
# Square root of the mean squared error (RMSE).
merror = np.sqrt(error)
merror

53.77321054488471

## Using the cross-validation technique

In [15]:
from sklearn.model_selection import cross_val_score

# Cross-validate on the training data with cv=10. The scorer returns the
# negated MSE, so the sign is flipped before taking the square root.
scores = cross_val_score(
    model,
    Training,
    Training_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
rmse_scores = np.sqrt(-scores)

In [17]:
def print_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Parameters
    ----------
    scores : object providing ``.mean()`` and ``.std()``
        The values to summarise (for example a NumPy array of per-fold errors).

    Returns
    -------
    None
        The three summary lines are written to standard output.
    """
    print("Scores: ", scores)
    average = scores.mean()
    print("Mean: ", average)
    spread = scores.std()
    print("Standard deviation: ", spread)

In [18]:
# Print the cross-validation RMSE values with their mean and standard deviation.
print_scores(rmse_scores)

Scores:  [50.40816089 57.86191319 51.52681867 60.31454969 57.80832438 46.98986891
 51.99137564 61.16260384 57.17619809 60.82708069]
Mean:  55.606689399775746
Standard deviation:  4.729390447768342


### Saving the model

In [19]:
from joblib import dump, load
# Write the fitted model to a file at a path relative to the working directory.
# NOTE(review): the '.loblib' extension looks like a typo for '.joblib' — confirm
# before renaming, since any code that loads this file must use the same name.
dump(model, 'Diabetes_prediction.loblib')

['Diabetes_prediction.loblib']

## Testing the model

In [26]:
# Evaluate the fitted model on the held-out test rows.
Testing_label = Test_set['target'].copy()
Testing = Test_set.drop(columns='target')
final_prediction = model.predict(Testing)
# RMSE on the test set; it is computed here but not displayed by this cell.
final_mse = mean_squared_error(Testing_label, final_prediction)
final_rmse = np.sqrt(final_mse)
# Show the predictions next to the true labels.
print(final_prediction, list(Testing_label))

[141.5617479  179.96620906 140.53525685 291.57179878 122.94159211
  95.47131909 255.04724939 187.57930472  86.1364612  112.35726439
  96.11127738 161.45098629  64.16902273 205.57828203 100.7645814
 132.76763569 221.14310461 245.98066464 194.68648192 214.07456256
 205.28792187  90.00453678  73.70817256 188.11287193 155.60992841
 162.21392555 188.85435449 176.84582252  51.24071387 112.39837981
 179.52545636  92.34653105 132.32891287 180.96468991 173.18806678
 190.63760322 123.82718443 119.77420331 147.28318126  62.11014582
  76.56326349 109.22391313 161.54042933 150.90643587 174.48119211
  66.55096445  79.44015776 107.80131061  59.58935236 160.5666605
 155.49248931  66.46358919 114.33903057 108.98074083 170.72617549
 158.43899426  96.06993751 206.473847   118.17323463  70.39417503
 184.71845794 200.98864486 141.48287348 107.02533081 126.88969446
 202.04620341 167.80956319 161.78987853 118.35196773 141.25146549
 179.8105877  195.16286783 234.98178618 143.03571492  83.32076168
 151.9591404