## **📦 Step 1: Import Libraries**

In [8]:
import kagglehub, os, shutil
import pandas as pd

## **📥 Step 2: Download Diabetes Dataset from Kaggle**

In [6]:
# Project-local folder (relative to this notebook) that will hold the dataset files
data_folder = os.path.join("..", "data")
# Create the folder if it doesn't exist; exist_ok=True keeps the cell safe to re-run
os.makedirs(data_folder, exist_ok=True)
# dataset_download returns the directory where kagglehub stored the dataset files
dataset_path = kagglehub.dataset_download("imtkaggleteam/diabetes")
print("✅ Dataset downloaded to temporary path:", dataset_path)

✅ Dataset downloaded to temporary path: /home/muhammad/.cache/kagglehub/datasets/imtkaggleteam/diabetes/versions/1


In [9]:
# Copy every entry of the download directory into the project data folder.
# NOTE: although the log line says "Moved", the files are copied — the
# originals in dataset_path are left in place.
for item in os.listdir(dataset_path):
    # Same entry name, resolved once under the source root and once under the target root
    src, dst = (os.path.join(root, item) for root in (dataset_path, data_folder))
    if not os.path.isdir(src):
        # Regular file: copy2 also carries over file metadata
        shutil.copy2(src, dst)
    else:
        # Directory: copy recursively, merging into dst if it already exists
        shutil.copytree(src, dst, dirs_exist_ok=True)
    print(f"Moved {item} to {dst}")

Moved diabetes.csv to ../data/diabetes.csv


## **📂 Step 3: Load CSV File**
- **The dataset file (`diabetes.csv`) was copied into `../data` in Step 2, so it is loaded from there**

In [18]:
# Location of the CSV inside the project data folder
csv_path = os.path.join("..", "data", "diabetes.csv")
# Parse the CSV into the DataFrame used by all following cells
data = pd.read_csv(csv_path)

## **🧠 Step 4: Quick Overview**

In [15]:
# data.shape is a (rows, columns) tuple
print("🔢 Shape of Dataset:", data.shape)

🔢 Shape of Dataset: (403, 19)


In [16]:
# Header and the column names as a plain Python list, one per output line
print("\n📄 Columns:", data.columns.tolist(), sep="\n")


📄 Columns:
['id', 'chol', 'stab.glu', 'hdl', 'ratio', 'glyhb', 'location', 'age', 'gender', 'height', 'weight', 'frame', 'bp.1s', 'bp.1d', 'bp.2s', 'bp.2d', 'waist', 'hip', 'time.ppn']


## **👀 Step 5: Preview the Dataset**

In [17]:
# Last expression of the cell, so the first rows (head() defaults to five) render as the output
data.head()

Unnamed: 0,id,chol,stab.glu,hdl,ratio,glyhb,location,age,gender,height,weight,frame,bp.1s,bp.1d,bp.2s,bp.2d,waist,hip,time.ppn
0,1000,203.0,82,56.0,3.6,4.31,Buckingham,46,female,62.0,121.0,medium,118.0,59.0,,,29.0,38.0,720.0
1,1001,165.0,97,24.0,6.9,4.44,Buckingham,29,female,64.0,218.0,large,112.0,68.0,,,46.0,48.0,360.0
2,1002,228.0,92,37.0,6.2,4.64,Buckingham,58,female,61.0,256.0,large,190.0,92.0,185.0,92.0,49.0,57.0,180.0
3,1003,78.0,93,12.0,6.5,4.63,Buckingham,67,male,67.0,119.0,large,110.0,50.0,,,33.0,38.0,480.0
4,1005,249.0,90,28.0,8.9,7.72,Buckingham,64,male,68.0,183.0,medium,138.0,80.0,,,44.0,41.0,300.0


## **🔍 Step 6: Data Types and Non-Null Counts**

In [19]:
print("\n🧾 Info:")
# info() prints its own report: index range, per-column non-null count and dtype, memory usage
data.info()


🧾 Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403 entries, 0 to 402
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        403 non-null    int64  
 1   chol      402 non-null    float64
 2   stab.glu  403 non-null    int64  
 3   hdl       402 non-null    float64
 4   ratio     402 non-null    float64
 5   glyhb     390 non-null    float64
 6   location  403 non-null    object 
 7   age       403 non-null    int64  
 8   gender    403 non-null    object 
 9   height    398 non-null    float64
 10  weight    402 non-null    float64
 11  frame     391 non-null    object 
 12  bp.1s     398 non-null    float64
 13  bp.1d     398 non-null    float64
 14  bp.2s     141 non-null    float64
 15  bp.2d     141 non-null    float64
 16  waist     401 non-null    float64
 17  hip       401 non-null    float64
 18  time.ppn  400 non-null    float64
dtypes: float64(13), int64(3), object(3)
memory usage: 59.9+ KB


## **📊 Step 7: Summary Statistics**

In [20]:
print("\n📈 Summary Statistics:")
# include='all' also summarises the non-numeric columns (unique / top / freq rows)
display(data.describe(include='all'))


📈 Summary Statistics:


Unnamed: 0,id,chol,stab.glu,hdl,ratio,glyhb,location,age,gender,height,weight,frame,bp.1s,bp.1d,bp.2s,bp.2d,waist,hip,time.ppn
count,403.0,402.0,403.0,402.0,402.0,390.0,403,403.0,403,398.0,402.0,391,398.0,398.0,141.0,141.0,401.0,401.0,400.0
unique,,,,,,,2,,2,,,3,,,,,,,
top,,,,,,,Louisa,,female,,,medium,,,,,,,
freq,,,,,,,203,,234,,,184,,,,,,,
mean,15978.310174,207.845771,106.672457,50.445274,4.521642,5.589769,,46.851117,,66.020101,177.59204,,136.904523,83.321608,152.382979,92.524823,37.900249,43.0399,341.25
std,11881.122124,44.445557,53.076655,17.262626,1.727886,2.242595,,16.312333,,3.918515,40.340666,,22.741033,13.589227,21.712952,11.555198,5.729313,5.656713,309.540953
min,1000.0,78.0,48.0,12.0,1.5,2.68,,19.0,,52.0,99.0,,90.0,48.0,110.0,60.0,26.0,30.0,5.0
25%,4792.5,179.0,81.0,38.0,3.2,4.38,,34.0,,63.0,151.0,,121.25,75.0,138.0,84.0,33.0,39.0,90.0
50%,15766.0,204.0,89.0,46.0,4.2,4.84,,45.0,,66.0,172.5,,136.0,82.0,149.0,92.0,37.0,42.0,240.0
75%,20336.0,230.0,106.0,59.0,5.4,5.6,,60.0,,69.0,200.0,,146.75,90.0,161.0,100.0,41.0,46.0,517.5


## **🧼 Step 8: Check for Missing Values**

In [None]:
# Per-column count of missing entries, printed under the header
print("\n❓ Missing Values:", data.isna().sum(), sep="\n")