In [100]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the diabetes table from the CSV in the working directory, then print
# the column names, non-null counts and dtypes as a first sanity check.
data = pd.read_csv(filepath_or_buffer='diabetes.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [101]:
# data.hist(bins=50, figsize=(20, 15))
# plt.show()
# Pairwise correlation of every column with every other column.
corr_matrix = data.corr()
# Show only the correlations against the target, strongest positive first.
corr_matrix.loc[:, 'Outcome'].sort_values(ascending=False)

Outcome                     1.000000
Glucose                     0.466581
BMI                         0.292695
Age                         0.238356
Pregnancies                 0.221898
DiabetesPedigreeFunction    0.173844
Insulin                     0.130548
SkinThickness               0.074752
BloodPressure               0.065068
Name: Outcome, dtype: float64

In [102]:
# from pandas.plotting import scatter_matrix
# attributes = ["Outcome", "Glucose"]
# scatter_matrix(data[attributes], figsize = (12,8))
# # data.plot(kind="scatter", x="Glucose", y="Outcome", alpha=0.8)
# # data['Glucose'].value_counts().sort_values(ascending=False)

In [103]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for the final evaluation; the seed keeps the split
# identical between runs.
train_set, test_set = train_test_split(data, random_state=42, test_size=0.2)

# Separate the target column from the predictors of the training portion.
data_labels = train_set["Outcome"].copy()
# NOTE: `data` is rebound here from the full table to the training features
# only, so re-running this cell without reloading the CSV first will fail
# (the "Outcome" column is no longer present in `data`).
data = train_set.drop(columns="Outcome")

In [104]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Preprocessing applied to the features before modelling:
#   1. replace missing values with the per-column median,
#   2. standardise each column with StandardScaler.
# NOTE(review): data.info() above reports no null entries, so the imputer has
# nothing to fill on this dataset as loaded. If missing readings are encoded
# some other way (e.g. as zeros) they are not handled here - TODO confirm.
pipline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler()),
    ]
)

In [105]:
# Learn the imputation/scaling statistics from the training features and
# apply them in one pass; the result is what the model is trained on.
data_tr = pipline.fit_transform(X=data)


In [106]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# model = LinearRegression()
# model = DecisionTreeRegressor()
# A random forest is stochastic (bootstrap samples and feature sub-sampling),
# so without a seed every metric computed below changes from run to run.
# Use the same seed as the train/test split to keep the notebook reproducible.
# NOTE(review): the "Outcome" labels listed further down are all 0 or 1, so
# this is a regressor fitted to what appears to be a binary target; a
# classifier may be the better fit - confirm before relying on the RMSE values.
model = RandomForestRegressor(random_state=42)
model.fit(data_tr, data_labels)


In [107]:
# Spot-check: take the first 100 training rows and their labels ...
some_data = data.head(100)
some_labels = data_labels.head(100)
# ... push them through the already-fitted preprocessing (transform only,
# no refit) and display the model's raw predictions for comparison.
some_data_prepared = pipline.transform(some_data)
model.predict(some_data_prepared)

array([0.  , 0.87, 0.18, 0.21, 0.84, 0.  , 0.82, 0.91, 0.09, 0.  , 0.04,
       0.97, 0.  , 0.21, 0.28, 0.93, 0.18, 0.71, 0.94, 0.21, 0.22, 0.07,
       0.28, 0.18, 0.78, 0.89, 0.11, 0.1 , 0.14, 0.94, 0.87, 0.01, 0.  ,
       0.05, 0.92, 0.2 , 0.65, 0.02, 0.89, 0.05, 0.08, 0.  , 0.01, 0.  ,
       0.7 , 0.96, 0.09, 0.  , 0.79, 0.07, 0.3 , 0.02, 0.03, 0.09, 0.88,
       0.82, 0.15, 0.05, 0.04, 0.01, 0.84, 0.01, 0.19, 0.95, 0.14, 0.25,
       0.91, 0.11, 0.  , 0.04, 0.  , 0.34, 0.78, 0.79, 0.12, 0.14, 0.03,
       0.  , 0.05, 0.74, 0.17, 0.19, 0.04, 0.08, 0.8 , 0.94, 0.02, 0.  ,
       0.84, 0.  , 0.04, 0.09, 0.3 , 0.01, 0.25, 0.05, 0.77, 0.01, 0.74,
       0.03])

In [108]:
# True labels for the same 100 rows, as a plain Python list.
some_labels.tolist()

[0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0]

In [109]:
from sklearn.metrics import mean_squared_error
# RMSE on the very rows the model was fitted on. This is an optimistic figure
# (the forest has already seen these samples); compare it with the
# cross-validated and hold-out RMSE computed further down.
data_predictions = model.predict(data_tr)
mse = mean_squared_error(y_true=data_labels, y_pred=data_predictions)
rmse = np.sqrt(mse)
rmse

np.float64(0.1485790459621504)

In [110]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation on the prepared training features. scikit-learn
# reports the *negated* MSE for this scorer, so flip the sign before taking
# the square root to obtain one RMSE per fold.
scores = cross_val_score(
    estimator=model,
    X=data_tr,
    y=data_labels,
    cv=10,
    scoring="neg_mean_squared_error",
)
rmse_scores = np.sqrt(np.negative(scores))
rmse_scores

array([0.41828567, 0.36882333, 0.36653346, 0.41355461, 0.43491284,
       0.36514538, 0.39814529, 0.41844756, 0.38890346, 0.42284555])

In [111]:
def print_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` methods (the notebook
            passes the NumPy array of per-fold RMSE values).

    Returns:
        None. Three lines are written to standard output.
    """
    # Each value is produced lazily so that a line is printed before the next
    # statistic is computed, exactly like three consecutive print calls.
    report = (
        ("Scores:", lambda: scores),
        ("Mean: ", lambda: scores.mean()),
        ("Standard deviation: ", lambda: scores.std()),
    )
    for label, compute in report:
        print(label, compute())
# Summarise the per-fold RMSE values from the cross-validation run.
print_scores(rmse_scores)

Scores: [0.41828567 0.36882333 0.36653346 0.41355461 0.43491284 0.36514538
 0.39814529 0.41844756 0.38890346 0.42284555]
Mean:  0.3995597151052914
Standard deviation:  0.02455749485612626


In [112]:
from joblib import dump, load
# Persist the fitted estimator to disk.
# NOTE(review): only `model` is saved; the fitted preprocessing pipeline is
# not, so whoever loads this file must reproduce the same scaling themselves.
dump(value=model, filename='Diabetes.joblib')

['Diabetes.joblib']

In [113]:
# Final check on the held-out 20%: split target from predictors ...
y_test = test_set["Outcome"].copy()
x_test = test_set.drop(columns="Outcome")
# ... reuse the preprocessing fitted on the training data (transform only,
# never refit on test rows) and score the model's predictions.
x_test_prepared = pipline.transform(x_test)
final_predictions = model.predict(x_test_prepared)
final_mse = mean_squared_error(y_true=y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

np.float64(0.4155523628061405)