Importing dataset from kaggle


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay


import warnings
warnings.filterwarnings('ignore')


In [2]:
df = pd.read_csv(r'C:\Users\nares\OneDrive\Documents\PythonPrograms\Diabetes_Prediction\diabetes_prediction_dataset.csv')
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [3]:
df['smoking_history'].unique()

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [4]:
df.describe()
# df.shape
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


## Removing Duplicate rows

In [5]:
df_duplicates = df[df.duplicated() == True]
df_duplicates.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
2756,Male,80.0,0,0,No Info,27.32,6.6,159,0
3272,Female,80.0,0,0,No Info,27.32,3.5,80,0
3418,Female,19.0,0,0,No Info,27.32,6.5,100,0
3939,Female,78.0,1,0,former,27.32,3.5,130,0
3960,Male,47.0,0,0,No Info,27.32,6.0,200,0


In [6]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()


0

# Value Counts

In [7]:
# for i in (df['gender','smoking_history']):
#     print(df[i].value_counts())

print(df['gender'].value_counts()) 
print("\n-------******-------")
print(df['smoking_history'].value_counts())


Female    56161
Male      39967
Other        18
Name: gender, dtype: int64

-------******-------
never          34398
No Info        32887
former          9299
current         9197
not current     6367
ever            3998
Name: smoking_history, dtype: int64


# **Label Encoder for GENDER & SMOKING HISTORY**

In [8]:
# le = LabelEncoder()
# df['gender']= le.fit_transform(df['gender'])
# df['smoking_history']= le.fit_transform(df['smoking_history'])
# df.head()

le_gender = LabelEncoder()
df['gender'] = le_gender.fit_transform(df['gender'])
 
le_hypertension = LabelEncoder()
df['hypertension'] = le_hypertension.fit_transform(df['hypertension'])

le_heart_disease = LabelEncoder()
df['heart_disease'] = le_heart_disease.fit_transform(df['heart_disease'])

le_smoking_history = LabelEncoder()
df['smoking_history'] = le_smoking_history.fit_transform(df['smoking_history'])
df = df.astype(float)
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 96146 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               96146 non-null  float64
 1   age                  96146 non-null  float64
 2   hypertension         96146 non-null  float64
 3   heart_disease        96146 non-null  float64
 4   smoking_history      96146 non-null  float64
 5   bmi                  96146 non-null  float64
 6   HbA1c_level          96146 non-null  float64
 7   blood_glucose_level  96146 non-null  float64
 8   diabetes             96146 non-null  float64
dtypes: float64(9)
memory usage: 7.3 MB


In [9]:
 X = df[df.columns[:-1]].values
 y = df[df.columns[-1]].values

In [10]:
X_train , X_test, y_train, y_test = train_test_split(X,y,random_state=0,train_size=0.8)
print('X_train: ',X_train.shape)
print('X_test: ',X_test.shape)
print('y_train: ',y_train.shape)
print('y_test: ',y_test.shape)

X_train:  (76916, 8)
X_test:  (19230, 8)
y_train:  (76916,)
y_test:  (19230,)


# **Logistic Regression**


In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
lg = LogisticRegression()
lg = lg.fit(X_train, y_train)

In [13]:
y_pred = lg.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98     17562
         1.0       0.84      0.62      0.72      1668

    accuracy                           0.96     19230
   macro avg       0.90      0.81      0.85     19230
weighted avg       0.95      0.96      0.95     19230



In [20]:
features = np.array([[ 0 ,  65 ,    0 ,   1  ,   0  ,  27.32,   6.5 , 200]])
prediction = lg.predict(features)
print("Prediciton: {}".format(prediction))

Prediciton: [1.]


# **KNN**

In [53]:
y_pred = knn_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97     17562
           1       0.86      0.53      0.66      1668

    accuracy                           0.95     19230
   macro avg       0.91      0.76      0.82     19230
weighted avg       0.95      0.95      0.95     19230



# **Neural Network**

In [54]:
import tensorflow as tf

In [55]:
 nn_model = tf.keras.Sequential()
#       tf.keras.layers.Dense(num_nodes, activation='relu', input_shape=(9,)),
#       tf.keras.layers.Dropout(dropout_prob),
#       tf.keras.layers.Dense(num_nodes, activation='relu'),
#       tf.keras.layers.Dropout(dropout_prob),
#       tf.keras.layers.Dense(1, activation='sigmoid')
#   ])

nn_model.add(tf.keras.layers.Dense(units=9,activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=1,activation="sigmoid"))
nn_model.compile(optimizer="adam",loss="binary_crossentropy",metrics=['accuracy'])  
#loss:- specifies which loss function should be used. For binary classification, the value should be binary_crossentropy. For multiclass classification, it should be categorical_crossentropy.


In [56]:
nn_model.fit(X_train,y_train,batch_size=32,epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1ec7d9c2810>

## Input from USER

In [32]:
#0 ,  65 ,    0 ,   1  ,   0  ,  27.32,   6.5 , 200
X = np.array([['Male',65.0,'No','Yes','never',31.94,7.43,200,]])
X

array([['Male', '65.0', 'No', 'Yes', 'never', '31.94', '7.43', '200']],
      dtype='<U32')

In [33]:
X[:, 0]= le_gender.fit_transform(X[:,0])
X[:, 2]= le_hypertension.fit_transform(X[:,2])
X[:, 3]= le_heart_disease.fit_transform(X[:,3])
X[:, 4]= le_smoking_history.fit_transform(X[:,4])
X = X.astype(float)
X

array([[  0.  ,  65.  ,   0.  ,   0.  ,   0.  ,  31.94,   7.43, 200.  ]])

In [36]:
y_predforX = lg.predict(X)
print(y_predforX)
if 0 in y_predforX:
    print("NO")
else:
    print('Yes')


# features = np.array([[0,45,1,0,3,26,4.6,145]])
# prediction = lg.predict(features)
# print("Prediciton: {}".format(prediction))

[1.]
Yes


In [28]:
import pickle

In [29]:
data = {'model': lg, 'le_gender':le_gender, 'le_hypertension': le_hypertension, 'le_heart_disease': le_heart_disease, 'le_smoking_history': le_smoking_history}
with open('saved_steps.pkl', 'wb') as file:
   pickle.dump(data,file)

In [30]:
with open('saved_steps.pkl','rb') as file:
    data = pickle.load(file)

lg_loaded = data['model']
le_gender = data['le_gender']
le_hypertension = data['le_hypertension']
le_heart_disease = data['le_heart_disease']
le_smoking_history = data['le_smoking_history']

In [31]:
y_pred = lg_loaded.predict(X)
y_pred

array([0.])