### Importing Libraries

In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

In [2]:
# Load the diabetes prediction dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("Dataset/diabetes_prediction_dataset.csv")

In [3]:
# Preview the first rows to check the column layout and value formats.
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [4]:
# Preview the last rows to confirm the file was read through to the end.
df.tail()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0
99999,Female,57.0,0,0,current,22.43,6.6,90,0


In [5]:
# Summary statistics for the numeric columns (count, mean, spread, quartiles).
df.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [6]:
# Count missing values per column.
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [7]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
df.duplicated().sum()

3854

In [8]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# This mutates df in place, so the outputs displayed by earlier cells describe the
# pre-deduplication data. The index is not reset, so row labels keep gaps afterwards.
df.drop_duplicates(inplace = True)

In [9]:
# Sanity check: no duplicate rows should remain after the drop.
df.duplicated().sum()

0

### Machine Learning

In [18]:
# Bucket age into four ordinal bands: (0, 18], (18, 36], (36, 72] and (72, inf).
# The helper column is added to df itself and is only used to drive the split.
df['age_cat'] = pd.cut(
    df['age'],
    bins = [0, 18, 36, 72, np.inf],
    labels = [1, 2, 3, 4],
)

# One reproducible 80/20 shuffle split that preserves the age-band proportions.
# NOTE(review): stratification is on the age band, not on the 'diabetes' label — confirm
# this is intended, since the label is the quantity being predicted.
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)

# n_splits is 1, so the first (and only) pair of positional index arrays is taken directly.
train_set, test_set = next(split.split(df, df['age_cat']))

# Positional indexing (iloc) is required because the index labels of df have gaps.
strat_train = df.iloc[train_set]
strat_test = df.iloc[test_set]

### Copying the Training Set

In [26]:
# Work on a copy so later changes to the training frame cannot touch strat_train.
diabetes_train = strat_train.copy()

### Separating Features and Labels

In [27]:
# The prediction target is the 'diabetes' column.
diabetes_label_train = diabetes_train['diabetes']

# Every remaining column is a feature, except 'age_cat', the helper column
# that was added only to stratify the train/test split.
diabetes_features_train = diabetes_train.drop(columns = ['diabetes', 'age_cat'])

### Separating Numerical and Categorical Values

In [31]:
# Columns routed through the numeric preprocessing branch.
diabetes_num_train = [
    'age',
    'hypertension',
    'heart_disease',
    'bmi',
    'HbA1c_level',
    'blood_glucose_level',
]

# Columns routed through the categorical (one-hot) preprocessing branch.
diabetes_cat_train = [
    'gender',
    'smoking_history',
]

### Let's Make Pipelines

In [36]:
# Numeric branch: fill any missing value with the column median, then standardize
# each column to zero mean and unit variance.
num_pipeline = Pipeline(steps = [
    ("impute", SimpleImputer(strategy = "median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: one-hot encode. A category not seen during fit is encoded
# as an all-zero row instead of raising an error.
cat_pipeline = Pipeline(steps = [
    ("onhot", OneHotEncoder(handle_unknown = "ignore")),
])

# Apply each branch to its own column list and concatenate the results.
full_pipeline = ColumnTransformer(transformers = [
    ("num", num_pipeline, diabetes_num_train),
    ("cat", cat_pipeline, diabetes_cat_train),
])

### Transforming the Data

In [37]:
# Fit the preprocessing on the training features and transform them in one step.
# Any held-out data should go through full_pipeline.transform(), not fit_transform(),
# so that it is processed with the statistics learned here.
diabetes_prepared_train = full_pipeline.fit_transform(diabetes_features_train)

### Training the Models

In [42]:
# Fit a logistic regression with default settings on the prepared training data.
print("Training the Model............")
logistic_model = LogisticRegression()
logistic_model.fit(diabetes_prepared_train, diabetes_label_train)

# Predict on the same rows the model was trained on; the metrics below are therefore
# in-sample scores, not an estimate of performance on unseen data.
logistic_predict_train = logistic_model.predict(diabetes_prepared_train)
(
    logistic_accuracy_train,
    logistic_precision_train,
    logistic_recall_train,
    logistic_f1_train,
) = (
    metric(diabetes_label_train, logistic_predict_train)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Logistic Regression Model for Training Data-:")
print(f"\nAccuracy Score:{logistic_accuracy_train:.2f}")
print(f"Precision Score:{logistic_precision_train:.2f}")
print(f"Recall Score:{logistic_recall_train:.2f}")
print(f"F1 Score:{logistic_f1_train:.2f}")

Training the Model............
Logistic Regression Model for Training Data-:

Accuracy Score:0.96
Precision Score:0.86
Recall Score:0.63
F1 Score:0.73


In [46]:
# Fit a random forest (fixed seed for reproducibility) on the prepared training data.
forest_model = RandomForestClassifier(random_state = 42)
print("Training the Model............")
forest_model.fit(diabetes_prepared_train, diabetes_label_train)

# In-sample metrics: predictions are made on the rows the forest was trained on,
# so these scores are optimistic and only useful next to the cross-validated score.
forest_predict_train = forest_model.predict(diabetes_prepared_train)
forest_accuracy_train = accuracy_score(diabetes_label_train, forest_predict_train)
forest_precision_train = precision_score(diabetes_label_train, forest_predict_train)
forest_f1_train = f1_score(diabetes_label_train, forest_predict_train)
forest_recall_train = recall_score(diabetes_label_train, forest_predict_train)

# Cross-validate preprocessing and model together on the raw features.
# Scoring the already-prepared matrix would let every validation fold influence the
# imputer, scaler and encoder, which were fitted on the whole training set. Wrapping
# both stages in one Pipeline makes each fold refit its own preprocessing.
# cross_val_score works on clones, so the fitted full_pipeline and forest_model above
# are left untouched.
forest_cv_pipeline = Pipeline([
    ("prep", full_pipeline),
    ("model", forest_model),
])
forest_CV_train = cross_val_score(
    forest_cv_pipeline,
    diabetes_features_train,
    diabetes_label_train,
    scoring = "accuracy",
    cv = 10,
)

print("Random Forest Classifier Model for Training Data-:")
print(f"\nAccuracy Score:{forest_accuracy_train:.2f}")
print(f"Precision Score:{forest_precision_train:.2f}")
print(f"Recall Score:{forest_recall_train:.2f}")
print(f"F1 Score:{forest_f1_train:.2f}")
print("The CV Score for Random Forest Classifier Model is -: ")
print(pd.Series(forest_CV_train).describe())

Training the Model............
Random Forest Classifier Model for Training Data-:

Accuracy Score:1.00
Precision Score:1.00
Recall Score:0.99
F1 Score:1.00
The CV Score for Random Forest Classifier Model is -: 
count    10.000000
mean      0.969187
std       0.001533
min       0.966069
25%       0.968633
50%       0.969122
75%       0.969805
max       0.971399
dtype: float64
