In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import os
from joblib import dump

# Input CSV location and the directory that receives every processed artifact
raw_data_path = '../data/raw/diabetes_012_health_indicators_BRFSS2023.csv'
processed_data_path = '../data/processed/'

# Make sure the output directory is present (no error if it already exists)
os.makedirs(processed_data_path, exist_ok=True)

# Read the raw dataset into memory
df = pd.read_csv(raw_data_path)

# Outlier handling: BMI values above 60 are replaced by 60 (cap chosen from EDA insights);
# values at or below the cap are left untouched
bmi_capped = df['BMI'].clip(upper=60)
df['BMI'] = bmi_capped

# Summary statistics confirm the new maximum
print("BMI Stats after capping:\n", bmi_capped.describe())

# Splitting helper used by this cell
from sklearn.model_selection import train_test_split

# Define feature types
numeric_features = ['BMI', 'MentHlth', 'PhysHlth']
categorical_features = ['Sex', 'AgeGroup', 'Smoker', 'HighBP', 'HighChol', 'CholCheck',
                       'Asthma', 'COPD', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity',
                       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk',
                       'Education', 'Income']

# Separate the features from the target column
X = df.drop(columns=['Diabetes_012'])
y = df['Diabetes_012']

# ColumnTransformer uses remainder='drop' by default, so any column that is not
# listed above is silently discarded. Surface those columns instead of hiding them.
# NOTE(review): the balanced-dataset preprocessing in this file lists 'KidneyDisease'
# as a categorical feature, but it is absent here -- confirm whether this dataset
# has that column and whether it should be included.
listed_features = set(numeric_features) | set(categorical_features)
unlisted_columns = [col for col in X.columns if col not in listed_features]
if unlisted_columns:
    print("Warning: columns not listed as features will be dropped by the preprocessor:",
          unlisted_columns)

# Create preprocessor
# handle_unknown='ignore' keeps transform() from raising on a category that was not
# present in the training split; such a value is encoded as all zeros (the same
# encoding as the dropped first category) and scikit-learn emits a warning.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
         categorical_features)
    ])

# Split the RAW data first so that the test rows never influence the fitted
# scaling statistics or encoder categories (prevents train/test leakage).
# The same test_size / random_state / stratify settings are used as before, so the
# row partition is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# Fit on the training split only, then apply the fitted transform everywhere else
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full dataset in original row order, transformed with the train-fitted preprocessor
X_processed = preprocessor.transform(X)

# Save the preprocessor
dump(preprocessor, os.path.join(processed_data_path, 'preprocessor.joblib'))

# Save processed data
np.save(os.path.join(processed_data_path, 'X_processed.npy'), X_processed)
np.save(os.path.join(processed_data_path, 'y.npy'), y.to_numpy())

print("Shape of X_processed:", X_processed.shape)
print("Shape of y:", y.shape)

# Save the splits
np.save(os.path.join(processed_data_path, 'X_train.npy'), X_train)
np.save(os.path.join(processed_data_path, 'X_test.npy'), X_test)
np.save(os.path.join(processed_data_path, 'y_train.npy'), y_train.to_numpy())
np.save(os.path.join(processed_data_path, 'y_test.npy'), y_test.to_numpy())

print("Shapes - X_train:", X_train.shape, "X_test:", X_test.shape)
print("Shapes - y_train:", y_train.shape, "y_test:", y_test.shape)

BMI Stats after capping:
 count    261589.000000
mean         28.809484
std           6.435822
min          12.000000
25%          24.000000
50%          28.000000
75%          32.000000
max          60.000000
Name: BMI, dtype: float64
Shape of X_processed: (261589, 48)
Shape of y: (261589,)
Shapes - X_train: (183112, 48) X_test: (78477, 48)
Shapes - y_train: (183112,) y_test: (78477,)


In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import os
from joblib import dump

# Input CSV location (balanced 50/50 file) and the directory for processed artifacts
raw_data_path = '../data/raw/diabetes_binary_5050split_health_indicators_BRFSS2023.csv'
processed_data_path = '../data/processed/'

# Make sure the output directory is present (no error if it already exists)
os.makedirs(processed_data_path, exist_ok=True)

# Read the raw dataset into memory
df = pd.read_csv(raw_data_path)

# Outlier handling: BMI values above 60 are replaced by 60 (cap chosen from EDA insights);
# values at or below the cap are left untouched
bmi_capped = df['BMI'].clip(upper=60)
df['BMI'] = bmi_capped

# Summary statistics confirm the new maximum
print("BMI Stats after capping:\n", bmi_capped.describe())

# Splitting helper used by this cell
from sklearn.model_selection import train_test_split

# Define feature types based on the dataset
numeric_features = ['BMI', 'MentHlth', 'PhysHlth']
categorical_features = ['KidneyDisease', 'HighBP', 'HighChol', 'CholCheck', 'Asthma', 'COPD',
                       'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity',
                       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk',
                       'Sex', 'AgeGroup', 'Education', 'Income']

# Separate the features from the target column
X_balanced = df.drop(columns=['Diabetes_binary'])
y_balanced = df['Diabetes_binary']

# ColumnTransformer uses remainder='drop' by default, so any column that is not
# listed above is silently discarded. Surface those columns instead of hiding them.
listed_features = set(numeric_features) | set(categorical_features)
unlisted_columns = [col for col in X_balanced.columns if col not in listed_features]
if unlisted_columns:
    print("Warning: columns not listed as features will be dropped by the preprocessor:",
          unlisted_columns)

# Create preprocessor
# handle_unknown='ignore' keeps transform() from raising on a category that was not
# present in the training split; such a value is encoded as all zeros (the same
# encoding as the dropped first category) and scikit-learn emits a warning.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
         categorical_features)
    ])

# Split the RAW data first so that the test rows never influence the fitted
# scaling statistics or encoder categories (prevents train/test leakage).
# The same test_size / random_state / stratify settings are used as before, so the
# row partition is unchanged.
X_train_raw_balanced, X_test_raw_balanced, y_train_balanced, y_test_balanced = train_test_split(
    X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)

# Fit on the training split only, then apply the fitted transform everywhere else
X_train_balanced = preprocessor.fit_transform(X_train_raw_balanced)
X_test_balanced = preprocessor.transform(X_test_raw_balanced)

# Full dataset in original row order, transformed with the train-fitted preprocessor
X_processed_balanced = preprocessor.transform(X_balanced)

# Save the preprocessor
dump(preprocessor, os.path.join(processed_data_path, 'preprocessor_balanced.joblib'))

# Save processed data
np.save(os.path.join(processed_data_path, 'X_processed_balanced.npy'), X_processed_balanced)
np.save(os.path.join(processed_data_path, 'y_balanced.npy'), y_balanced.to_numpy())

print("Shape of X_processed_balanced:", X_processed_balanced.shape)
print("Shape of y_balanced:", y_balanced.shape)

# Save the splits
np.save(os.path.join(processed_data_path, 'X_train_balanced.npy'), X_train_balanced)
np.save(os.path.join(processed_data_path, 'X_test_balanced.npy'), X_test_balanced)
np.save(os.path.join(processed_data_path, 'y_train_balanced.npy'), y_train_balanced.to_numpy())
np.save(os.path.join(processed_data_path, 'y_test_balanced.npy'), y_test_balanced.to_numpy())

print("Shapes - X_train_balanced:", X_train_balanced.shape, "X_test_balanced:", X_test_balanced.shape)
print("Shapes - y_train_balanced:", y_train_balanced.shape, "y_test_balanced:", y_test_balanced.shape)

BMI Stats after capping:
 count    71814.000000
mean        30.158005
std          6.925495
min         12.000000
25%         25.000000
50%         29.000000
75%         34.000000
max         60.000000
Name: BMI, dtype: float64
Shape of X_processed_balanced: (71814, 51)
Shape of y_balanced: (71814,)
Shapes - X_train_balanced: (50269, 51) X_test_balanced: (21545, 51)
Shapes - y_train_balanced: (50269,) y_test_balanced: (21545,)
