In [2]:
# Import necessary libraries
import kagglehub
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

# Step 1: Download the dataset from Kaggle.
# The value returned by dataset_download() is used below as the directory to search.
path = kagglehub.dataset_download("alexteboul/diabetes-health-indicators-dataset")

# Step 2: Locate the downloaded CSV file.
# os.listdir() returns names in arbitrary order, so the candidates are sorted
# to make the selection deterministic if the directory holds several CSVs.
csv_file_names = sorted(
    file_name for file_name in os.listdir(path) if file_name.endswith(".csv")
)

# The later cells look for a 'Diabetes_012' column, so prefer the file that
# carries it by name and fall back to the first CSV in alphabetical order.
preferred_csv_name = "diabetes_012_health_indicators_BRFSS2015.csv"
if preferred_csv_name in csv_file_names:
    dataset_file = os.path.join(path, preferred_csv_name)
elif csv_file_names:
    dataset_file = os.path.join(path, csv_file_names[0])
else:
    dataset_file = None

# Step 3: Verify that a CSV file was found.
if dataset_file is None:
    raise FileNotFoundError("No CSV file found in the downloaded dataset.")
print(f"CSV file found: {dataset_file}")


CSV file found: /Users/david/.cache/kagglehub/datasets/alexteboul/diabetes-health-indicators-dataset/versions/1/diabetes_012_health_indicators_BRFSS2015.csv


In [3]:
## Import necessary libraries
import pandas as pd

# Load the dataset into a DataFrame
df = pd.read_csv(dataset_file)

# 1. Explore the dataset

print(f"\nShape of the dataset: {df.shape}")  # (rows, columns)

print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()  # Data types and non-null counts

print("\nMissing values per column:")
print(df.isnull().sum())  # Missing values count

# Kept in a variable because the cleaning step reuses this count.
duplicates_count = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates_count}")  # Duplicate rows count

print("\nDescriptive statistics for numerical columns:")
print(df.describe())  # Summary stats for numerical columns

# Show the class distribution only when the expected target column is present.
if 'Diabetes_012' in df.columns:
    print("\nValue counts for the target column 'Diabetes_012':")
    print(df['Diabetes_012'].value_counts())  # Target column distribution

# 2. Clean the dataset

# Drop exact duplicate rows when the exploration step counted any.
if duplicates_count:
    df = df.drop_duplicates()
    print(f"\nRemoved {duplicates_count} duplicate rows. New shape: {df.shape}")

# Drop every row that still contains at least one missing value.
missing_values_count = df.isna().sum().sum()
if missing_values_count:
    df = df.dropna()
    print(f"\nRemoved rows with missing values. New shape: {df.shape}")

# Normalize column names: lower-case first, then spaces become underscores.
lowercased_columns = df.columns.str.lower()
df.columns = lowercased_columns.str.replace(' ', '_')
print("\nStandardized column names:")
print(df.columns)





Shape of the dataset: (253680, 22)

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_012          253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  G

In [4]:
# Trim outliers from 'bmi' and 'age' (the columns this notebook treats as
# continuous) using the 1.5 * IQR rule.
# Columns are handled one after another, so each column's quartiles are
# computed on the rows that survived the previous column's filter.
columns_to_clean = ['bmi', 'age']

for col in columns_to_clean:
    # Report and skip any requested column that is absent from the DataFrame.
    if col not in df.columns:
        print(f"Column '{col}' does not exist in the DataFrame.")
        continue

    # First and third quartiles, and the interquartile range between them.
    q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    iqr = q3 - q1

    # Bounds located 1.5 * IQR outside the quartiles.
    lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

    # Keep only the rows whose value lies within the bounds (both ends inclusive).
    df = df[df[col].between(lower_bound, upper_bound)]
    print(f"Removed outliers in '{col}'. New shape: {df.shape}")

print(f"Dataset after removing outliers in 'bmi' and 'age': {df.shape}")
 

Removed outliers in 'bmi'. New shape: (224143, 22)
Removed outliers in 'age'. New shape: (224143, 22)
Dataset after removing outliers in 'bmi' and 'age': (224143, 22)


In [5]:
# MinMaxScaler is considered the better choice here for decision-tree-based models.


from sklearn.preprocessing import MinMaxScaler  

# Features to rescale; the notebook treats these as continuous (adjust if needed).
continuous_features = ['age', 'bmi', 'income', 'menthlth', 'physhlth', 'education']

# MinMaxScaler (scales values to the range 0 to 1)
scaler = MinMaxScaler()

# Scale only when every listed column exists; otherwise report the missing ones.
missing_cols = set(continuous_features) - set(df.columns)
if missing_cols:
    print(f"Las siguientes columnas faltan en el DataFrame: {missing_cols}")
else:
    # Write the scaled values back into `df` itself so later cells see them.
    scaled_values = scaler.fit_transform(df[continuous_features])
    df[continuous_features] = scaled_values
    print("\nScaled continuous features to [0, 1] range:")
    print(df[continuous_features].describe())





Scaled continuous features to [0, 1] range:
                 age            bmi         income       menthlth  \
count  224143.000000  224143.000000  224143.000000  224143.000000   
mean        0.592498       0.502413       0.701782       0.114293   
std         0.258385       0.169065       0.297463       0.254099   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.416667       0.375000       0.571429       0.000000   
50%         0.583333       0.468750       0.714286       0.000000   
75%         0.750000       0.593750       1.000000       0.066667   
max         1.000000       1.000000       1.000000       1.000000   

            physhlth      education  
count  224143.000000  224143.000000  
mean        0.151910       0.797090  
std         0.297910       0.198368  
min         0.000000       0.000000  
25%         0.000000       0.600000  
50%         0.000000       0.800000  
75%         0.133333       1.000000  
max         1.000000       1.

In [6]:
# Export the cleaned and processed DataFrame 'df' to a CSV file.
output_file_path = "cleaned_and_scaled_dataset.csv"  # Change the file name if desired

# Write the DataFrame to disk, leaving the row index out of the file.
df.to_csv(path_or_buf=output_file_path, index=False)

print(f"El archivo CSV ha sido guardado exitosamente en: {output_file_path}")
 

El archivo CSV ha sido guardado exitosamente en: cleaned_and_scaled_dataset.csv
