In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import os
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import kagglehub

In [94]:
# Download the Kaggle diabetes dataset (kagglehub reuses its local cache when
# present); the returned value is the directory that holds the dataset files.
dataset_dir = kagglehub.dataset_download("iammustafatz/diabetes-prediction-dataset")
print("Path to dataset files:", dataset_dir)

# os.listdir returns entries in arbitrary order, so sort for a stable listing.
all_files = sorted(os.listdir(dataset_dir))
display(all_files)

# Select the CSV explicitly instead of trusting all_files[0], and fail loudly
# if the download did not contain one.
csv_files = [name for name in all_files if name.lower().endswith(".csv")]
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in {dataset_dir!r}: {all_files}")

# os.path.join uses the platform separator, avoiding mixed '\\' and '/' paths.
# `path` is the CSV file path consumed by the loading cell that follows.
path = os.path.join(dataset_dir, csv_files[0])
path

Path to dataset files: C:\Users\Maddox\.cache\kagglehub\datasets\iammustafatz\diabetes-prediction-dataset\versions\1


['diabetes_prediction_dataset.csv']

'C:\\Users\\Maddox\\.cache\\kagglehub\\datasets\\iammustafatz\\diabetes-prediction-dataset\\versions\\1/diabetes_prediction_dataset.csv'

In [95]:
# Load the downloaded CSV into a DataFrame.
data = pd.read_csv(path)

# Peek at two randomly chosen rows (different rows on every run).
data.sample(n=2)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
20009,Male,37.0,0,0,not current,43.4,4.5,130,0
57492,Male,18.0,0,0,never,34.4,6.5,130,0


In [96]:
# First five rows of the frame.
data.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [97]:
# Last five rows of the frame.
data.tail(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0
99999,Female,57.0,0,0,current,22.43,6.6,90,0


In [98]:
# Print per-column dtypes and non-null counts to spot missing values and
# non-numeric columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [99]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(100000, 9)

All columns are non-null. However, 2 of the columns (`gender` and `smoking_history`) have dtype `object` rather than a numeric dtype, so they need to be encoded.

In [100]:
# Keep the column labels as a plain Python list.
cols = data.columns.tolist()
cols

['gender',
 'age',
 'hypertension',
 'heart_disease',
 'smoking_history',
 'bmi',
 'HbA1c_level',
 'blood_glucose_level',
 'diabetes']

In [101]:
# Collect every column stored with dtype 'object' and print its distinct
# values, so the categories that need encoding are visible.
object_columns = []
for col in cols:
    if data[col].dtype != 'object':
        continue  # not an object column: nothing to report
    distinct_values = data[col].unique()
    print(f"Unique values in {col}: {distinct_values}")
    object_columns.append(col)
print("Object columns:", object_columns)

Unique values in gender: ['Female' 'Male' 'Other']
Unique values in smoking_history: ['never' 'No Info' 'current' 'former' 'ever' 'not current']
Object columns: ['gender', 'smoking_history']


In [102]:
# One-hot encode `gender` into 0/1 integer indicator columns prefixed with
# "gender_" (one column per category, e.g. gender_Female, gender_Male, gender_Other).
# Passing the whole frame with `columns=` makes get_dummies drop the original
# column and append the indicator columns at the end in a single step.
#
# The guard keeps this cell re-runnable: once `gender` has been encoded it no
# longer exists in `data`, and encoding it a second time would raise KeyError.
if 'gender' in data.columns:
    data = pd.get_dummies(data, columns=['gender'], prefix='gender', dtype=int)
data.sample(5)


Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other
95464,38.0,0,0,No Info,27.32,6.5,200,0,1,0,0
53490,62.0,1,0,never,31.01,4.0,90,0,0,1,0
85480,37.0,0,0,ever,27.32,3.5,159,0,1,0,0
61789,20.0,0,0,ever,30.2,6.0,140,0,0,1,0
42627,21.0,0,0,never,27.32,6.0,159,0,0,1,0


In [103]:
# Distinct smoking_history categories, in order of first appearance in the data.
smoking_history_col = data['smoking_history'].unique().tolist()
smoking_history_col

['never', 'No Info', 'current', 'former', 'ever', 'not current']

In [104]:
# Assign each smoking_history category an integer code equal to its position
# in smoking_history_col (0, 1, 2, ...).
smoking_mapping = dict(zip(smoking_history_col, range(len(smoking_history_col))))
display(smoking_history_col)

# Store the integer codes alongside the original text column.
data['smoking_history_encoded'] = data['smoking_history'].map(smoking_mapping)
data.sample(5)

['never', 'No Info', 'current', 'former', 'ever', 'not current']

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_encoded
91675,23.0,0,0,never,22.88,4.5,158,0,0,1,0,0
53446,47.0,0,0,never,24.82,4.5,126,0,1,0,0,0
4069,47.0,0,0,ever,39.01,6.6,130,0,1,0,0,4
99869,41.0,0,0,No Info,27.32,6.0,126,0,1,0,0,1
31647,6.0,0,0,No Info,15.84,6.6,85,0,0,1,0,1


Checking the unique values in the `diabetes` column.

In [None]:
# List the distinct values present in the diabetes column.
data['diabetes'].unique()

array([0, 1])