# Machine Learning Lab - Diabetes Health Indicators Dataset

**Platform Compatible**: Windows | macOS | Linux | Google Colab

This notebook analyzes the Diabetes Health Indicators Dataset from Kaggle.

## 1. Environment Setup & Package Installation

In [2]:
import sys
import platform
import os

# Detect environment
def detect_environment():
    """Return a short label for the runtime hosting this notebook.

    Google Colab is recognised by the presence of the ``google.colab``
    module in ``sys.modules`` and takes priority over the operating-system
    check. Otherwise the value of ``platform.system()`` is mapped to
    ``'windows'``, ``'mac'`` or ``'linux'``; any other value yields
    ``'unknown'``.
    """
    if 'google.colab' in sys.modules:
        return 'colab'

    labels_by_system = {
        'Windows': 'windows',
        'Darwin': 'mac',
        'Linux': 'linux',
    }
    return labels_by_system.get(platform.system(), 'unknown')

# Resolve the environment once; the plotting setup further down reads `env`.
env = detect_environment()
print(
    f"Running on: {env}",
    f"Python version: {sys.version}",
    f"Platform: {platform.platform()}",
    sep="\n",
)

Running on: windows
Python version: 3.14.3 (tags/v3.14.3:323c59a, Feb  3 2026, 16:04:56) [MSC v.1944 64 bit (AMD64)]
Platform: Windows-11-10.0.26200-SP0


In [3]:
# Install required packages if not already installed
import importlib.util

def _is_importable(import_name):
    """Return True when the import system can locate ``import_name``."""
    # An already-loaded module is importable even if it carries no __spec__
    # (find_spec would raise ValueError for it).
    if sys.modules.get(import_name) is not None:
        return True
    try:
        return importlib.util.find_spec(import_name) is not None
    except (ImportError, ValueError):
        # For dotted names find_spec imports the parent package first and
        # raises ModuleNotFoundError when that parent is absent; treat any
        # such failure as "not installed" rather than crashing the cell.
        return False


def check_and_install(package_name, import_name=None):
    """Install ``package_name`` with pip unless ``import_name`` is importable.

    Args:
        package_name: Distribution name passed to ``pip install``.
        import_name: Module name used for the availability check. Defaults
            to ``package_name`` (the two differ for e.g. scikit-learn/sklearn).

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    if import_name is None:
        import_name = package_name

    if _is_importable(import_name):
        print(f"{package_name} already installed")
        return

    print(f"Installing {package_name}...")
    import subprocess
    # Use the kernel's own interpreter so the package lands in its environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package_name, "-q"])
    # Finders cache directory listings; reset them so the freshly installed
    # package can be imported in this same session.
    importlib.invalidate_caches()
    print(f"{package_name} installed successfully!")

# Core dependencies as (pip distribution name, importable module name) pairs
packages = [
    ("pandas", "pandas"),
    ("numpy", "numpy"),
    ("scikit-learn", "sklearn"),
    ("matplotlib", "matplotlib"),
    ("seaborn", "seaborn"),
]

for pkg, import_name in packages:
    check_and_install(pkg, import_name=import_name)

# Blank separator line, then the completion message
print()
print("All required packages installed!")

Installing pandas...
pandas installed successfully!
numpy already installed
Installing scikit-learn...
scikit-learn installed successfully!
Installing matplotlib...
matplotlib installed successfully!
Installing seaborn...
seaborn installed successfully!

All required packages installed!


## 2. Import Libraries

In [4]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# File handling
import glob

# Pandas display limits: never truncate columns, print at most 100 rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Plot look and feel (the palette is applied after the style sheet)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Default figure size: larger on Colab, standard everywhere else
plt.rcParams['figure.figsize'] = (12, 8) if env == 'colab' else (10, 6)

print("All libraries imported successfully!")

All libraries imported successfully!


## 3. Dataset Overview

**Available datasets** (from Kaggle - Diabetes Health Indicators):
1. `diabetes_012_health_indicators_BRFSS2015.csv` - Multi-class (0=no diabetes, 1=prediabetes, 2=diabetes)
2. `diabetes_binary_health_indicators_BRFSS2015.csv` - Binary classification (original, imbalanced)
3. `diabetes_binary_5050split_health_indicators_BRFSS2015.csv` - Binary classification (balanced)

In [5]:
# Resolve the folder the dataset files are expected in. `__file__` is only
# defined when this code runs as an exported script; in a notebook kernel it
# is absent, so fall back to the current working directory.
if "__file__" in globals():
    notebook_dir = os.path.dirname(os.path.abspath(__file__))
else:
    notebook_dir = os.getcwd()

# List the CSV files in that folder. The directory part is escaped so glob
# metacharacters in the path (e.g. "[") are taken literally, and the result is
# sorted because glob makes no ordering guarantee.
available_files = sorted(glob.glob(os.path.join(glob.escape(notebook_dir), "*.csv")))

print("Available dataset files:")
for i, file in enumerate(available_files, 1):
    file_size = os.path.getsize(file) / (1024 * 1024)  # bytes -> MB
    print(f"{i}. {os.path.basename(file)} ({file_size:.2f} MB)")

if not available_files:
    print("\nNo CSV files found in the notebook directory!")
    print("Please ensure the dataset files are in the same folder as this notebook.")

Available dataset files:
1. diabetes_012_health_indicators_BRFSS2015.csv (21.68 MB)
2. diabetes_binary_5050split_health_indicators_BRFSS2015.csv (6.05 MB)
3. diabetes_binary_health_indicators_BRFSS2015.csv (21.68 MB)


## 4. Load Data

**Choose which dataset to load based on your analysis needs**

In [6]:
# Option 1: Load the balanced binary dataset (recommended for classification)
data_file = "diabetes_binary_5050split_health_indicators_BRFSS2015.csv"

# Option 2: Load the original imbalanced binary dataset
# data_file = "diabetes_binary_health_indicators_BRFSS2015.csv"

# Option 3: Load the multi-class dataset (0, 1, 2)
# data_file = "diabetes_012_health_indicators_BRFSS2015.csv"

# Load the selected dataset
data_path = os.path.join(notebook_dir, data_file)

# Fail fast when the file is missing. Merely printing a message would leave
# `df` undefined (or stale from an earlier run), so later cells would either
# crash with an unrelated NameError or silently analyse the wrong data.
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f"{data_file} not found! Looking in: {notebook_dir}"
    )

df = pd.read_csv(data_path)
print(f"Loaded: {data_file}")
print(f"Shape: {df.shape}")
print(f"Columns: {df.shape[1]}")
print(f"Rows: {df.shape[0]:,}")

Loaded: diabetes_binary_5050split_health_indicators_BRFSS2015.csv
Shape: (70692, 22)
Columns: 22
Rows: 70,692


## 5. Initial Data Exploration

In [7]:
# Preview the first five rows. `df.head()` is left as the cell's final
# expression so the notebook renders the returned frame as a table.
print("First 5 rows:")
df.head()

First 5 rows:


Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [8]:
# Structural overview of the frame: index range, column names,
# non-null counts and dtypes.
print("Dataset Information:")
df.info()

Dataset Information:
<class 'pandas.DataFrame'>
RangeIndex: 70692 entries, 0 to 70691
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Diabetes_binary       70692 non-null  float64
 1   HighBP                70692 non-null  float64
 2   HighChol              70692 non-null  float64
 3   CholCheck             70692 non-null  float64
 4   BMI                   70692 non-null  float64
 5   Smoker                70692 non-null  float64
 6   Stroke                70692 non-null  float64
 7   HeartDiseaseorAttack  70692 non-null  float64
 8   PhysActivity          70692 non-null  float64
 9   Fruits                70692 non-null  float64
 10  Veggies               70692 non-null  float64
 11  HvyAlcoholConsump     70692 non-null  float64
 12  AnyHealthcare         70692 non-null  float64
 13  NoDocbcCost           70692 non-null  float64
 14  GenHlth               70692 non-null  float64
 15  MentHlth 

In [9]:
# Per-column descriptive statistics (count, mean, std, min, quartiles, max).
# Left as the final expression so the result is rendered as a table.
print("Statistical Summary:")
df.describe()

Statistical Summary:


Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0
mean,0.5,0.563458,0.525703,0.975259,29.856985,0.475273,0.062171,0.14781,0.703036,0.611795,0.788774,0.042721,0.95496,0.093914,2.837082,3.752037,5.810417,0.25273,0.456997,8.584055,4.920953,5.698311
std,0.500004,0.49596,0.499342,0.155336,7.113954,0.499392,0.241468,0.354914,0.456924,0.487345,0.408181,0.202228,0.207394,0.291712,1.113565,8.155627,10.062261,0.434581,0.498151,2.852153,1.029081,2.175196
min,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,1.0,25.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,7.0,4.0,4.0
50%,0.5,1.0,1.0,1.0,29.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,9.0,5.0,6.0
75%,1.0,1.0,1.0,1.0,33.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,4.0,2.0,6.0,1.0,1.0,11.0,6.0,8.0
max,1.0,1.0,1.0,1.0,98.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,30.0,30.0,1.0,1.0,13.0,6.0,8.0


In [10]:
# Count nulls per column and report only the columns that contain any
print("Missing Values:")
missing = df.isnull().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("No missing values!")

Missing Values:
No missing values!


## 6. Data Visualization

In [11]:
# Placeholder cell: no figure is produced yet; only the message below is printed.
# TODO: replace the template with real visualizations.

# Template for plotting the distribution of the target variable. Uncomment it
# and set `x` to the target column of the loaded file (the df.info() output in
# section 5 lists `Diabetes_binary` as column 0 of the balanced binary file):
# plt.figure(figsize=(10, 6))
# sns.countplot(data=df, x='target_column_name')
# plt.title('Distribution of Target Variable')
# plt.xlabel('Target')
# plt.ylabel('Count')
# plt.show()

print("Ready for custom visualizations!")

Ready for custom visualizations!
