In [1]:
%matplotlib inline
import pandas as pd

In [4]:
# Load the cleaned diabetes dataset from CSV.
# NOTE(review): hard-coded absolute path — consider a configurable data directory.
csv_path = '/content/diabetes_clean.csv'
diabetes_df = pd.read_csv(csv_path)

# Trying a supervised model

In [5]:
# Build a fully numeric copy of the raw frame for modelling:
#   * object (text) columns: fill gaps with the most frequent value, then
#     replace each distinct value by its integer category code;
#   * every other column: fill gaps with the column median.
data = diabetes_df.copy()
for column in data.columns:
    if str(data[column].dtype) == "object":
        # `Series.mode()` returns a Series (ties are possible), so take its
        # first entry as a scalar. Passing the Series itself to `fillna`
        # aligns it on the index and only fills rows labelled 0, 1, ...;
        # the remaining NaNs would then be encoded as -1 by `cat.codes`.
        most_frequent = data[column].mode()
        if not most_frequent.empty:  # empty when the column is entirely NaN
            data[column] = data[column].fillna(most_frequent.iloc[0])
        data[column] = data[column].astype("category").cat.codes
    else:
        data[column] = data[column].fillna(data[column].median())

In [6]:
# Preview the first five rows of the encoded frame (five is the default for `head`).
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0


In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [8]:
# Separate the target column from the feature matrix without mutating `data`.
# Popping the column in place made this cell fail with a KeyError when run a
# second time and left earlier displays of `data` out of date.
y = data['diabetes'].copy()
X = data.drop(columns=['diabetes'])

In [9]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [10]:
# Random forest of 1000 trees, each limited to depth 4. `random_state` pins the
# randomness used while building the forest so the fit and its score are
# reproducible from run to run.
# NOTE(review): the `diabetes` target in the preview looks binary (0/1); a
# classifier may be more appropriate than a regressor — confirm intent.
clf = RandomForestRegressor(n_estimators=1000, max_depth=4, random_state=42)

In [11]:
# Train the forest on the training split.
clf.fit(X=X_train, y=y_train)

In [12]:
from sklearn.metrics import r2_score

In [13]:
# Score the held-out split: R² between the true targets and the forest's predictions.
y_pred = clf.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.667570506679762

# Trying an unsupervised model

In [28]:
import numpy as np
from scipy.stats import norm

In [29]:
# Fit a normal distribution to the `diabetes` column of the whole dataset;
# `norm.fit` returns the estimated (loc, scale) pair.
diabetes_target = diabetes_df['diabetes']
params = norm.fit(diabetes_target)

In [30]:
# Show the fitted (loc, scale) tuple.
params

(0.085, 0.27888169534768686)

In [33]:
# Evaluate the CDF of the normal distribution fitted on the whole dataset at
# every row's `diabetes` value. The frozen distribution is built once and
# applied to the entire column in a single vectorised call instead of being
# re-created for each row.
diabetes_df['prob_total'] = norm(params[0], params[1]).cdf(diabetes_df['diabetes'].to_numpy())

In [34]:
# Rows whose whole-dataset CDF value exceeds 0.95.
# NOTE(review): every row displayed has diabetes == 1 and the same prob_total,
# so this filter looks equivalent to `diabetes == 1` — confirm that is intended.
diabetes_df.loc[diabetes_df['prob_total'].gt(0.95)]

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,prob_total
6,Female,44.0,0,0,0.999483,19.31,6.5,200,1,0.999483
26,Male,67.0,0,1,0.999483,27.32,6.5,200,1,0.999483
38,Male,50.0,1,0,0.999483,27.32,5.7,260,1,0.999483
40,Male,73.0,0,0,0.999483,25.91,9.0,160,1,0.999483
53,Female,53.0,0,0,0.999483,27.32,7.0,159,1,0.999483
...,...,...,...,...,...,...,...,...,...,...
99935,Female,65.0,1,1,0.999483,33.55,8.2,140,1,0.999483
99938,Male,55.0,0,1,0.999483,30.42,6.2,300,1,0.999483
99957,Female,61.0,0,0,0.999483,34.45,6.5,280,1,0.999483
99962,Female,58.0,1,0,0.999483,38.31,7.0,200,1,0.999483


In [38]:
# Fit one normal distribution to the `diabetes` column for each distinct
# `hypertension` value and keep the frozen distributions keyed by that value.
# A comprehension is used so no loop variables leak into the notebook
# namespace; in particular the whole-dataset `params` fitted earlier is no
# longer overwritten by the last subgroup's parameters.
models_hypertension = {
    level: norm(*norm.fit(diabetes_df.loc[diabetes_df['hypertension'] == level, 'diabetes']))
    for level in diabetes_df['hypertension'].unique()
}

In [39]:
# For every row, evaluate the CDF of the distribution fitted for that row's
# `hypertension` value at the row's `diabetes` value. Each group's rows are
# handled with one vectorised `cdf` call instead of iterating row by row.
hypertension_prob = np.full(len(diabetes_df), np.nan)
diabetes_values = diabetes_df['diabetes'].to_numpy()
for level, model in models_hypertension.items():
    # Positional boolean mask of the rows belonging to this hypertension group.
    in_group = (diabetes_df['hypertension'] == level).to_numpy()
    hypertension_prob[in_group] = model.cdf(diabetes_values[in_group])
diabetes_df['PROB_hypertension'] = hypertension_prob

In [40]:
# Rows whose per-hypertension-group CDF value exceeds 0.95.
diabetes_df.loc[diabetes_df['PROB_hypertension'].gt(0.95)]

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,prob_total,PROB_hypertension
6,Female,44.0,0,0,0.999483,19.31,6.5,200,1,0.999483,0.999876
26,Male,67.0,0,1,0.999483,27.32,6.5,200,1,0.999483,0.999876
40,Male,73.0,0,0,0.999483,25.91,9.0,160,1,0.999483,0.999876
53,Female,53.0,0,0,0.999483,27.32,7.0,159,1,0.999483,0.999876
55,Male,50.0,0,0,0.999483,37.16,9.0,159,1,0.999483,0.999876
...,...,...,...,...,...,...,...,...,...,...,...
99919,Female,74.0,0,0,0.999483,29.51,8.2,130,1,0.999483,0.999876
99929,Female,37.0,0,0,0.999483,36.87,8.8,160,1,0.999483,0.999876
99938,Male,55.0,0,1,0.999483,30.42,6.2,300,1,0.999483,0.999876
99957,Female,61.0,0,0,0.999483,34.45,6.5,280,1,0.999483,0.999876
