In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib # For saving/loading preprocessors later

# Relative path to the raw diabetes CSV (resolved against the current working directory).
file_path = '../data/raw/diabetes_prediction_dataset.csv'

# Parse the CSV into a DataFrame; the cells below all read from `diabetes_df`.
diabetes_df = pd.read_csv(file_path)

# Confirm the load and report the frame's (rows, columns) shape.
print(
    "Libraries imported and dataset loaded successfully for preprocessing.",
    f"Initial dataset shape: {diabetes_df.shape}",
    sep="\n",
)

Libraries imported and dataset loaded successfully for preprocessing.
Initial dataset shape: (100000, 9)


In [11]:
# Structural overview: column names, non-null counts, dtypes and memory usage.
# DataFrame.info() writes its report straight to stdout, so it is not wrapped in print().
print("\nDataset Info (for preprocessing check):")
diabetes_df.info()

# Per-column count of missing entries, so any gaps are visible before
# the preprocessing transformers are built.
null_counts = diabetes_df.isna().sum()
print("\nMissing values check (should be mostly zeros):")
print(null_counts)


Dataset Info (for preprocessing check):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB

Missing values check (should be mostly zeros):
gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
bloo

In [12]:
# Column the model is trained to predict.
target = 'diabetes'

# Every column other than the target is treated as an input feature.
features = diabetes_df.drop(columns=target)

# Partition the inputs by dtype so each group can get its own preprocessing:
# numeric dtypes on one side, object (text) dtype on the other.
# NOTE: the numeric selection also picks up integer-coded columns such as
# 'hypertension' and 'heart_disease', so they are handled like any other number.
numeric_part = features.select_dtypes(include=np.number)
object_part = features.select_dtypes(include='object')
numerical_features = list(numeric_part.columns)
categorical_features = list(object_part.columns)

# Report the resulting grouping.
print(
    f"Target variable: {target}",
    f"Numerical features: {numerical_features}",
    f"Categorical features: {categorical_features}",
    sep="\n",
)

Target variable: diabetes
Numerical features: ['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level']
Categorical features: ['gender', 'smoking_history']


In [13]:
# Numeric branch: a one-step pipeline that standardises each column
# (subtract the column mean, divide by its standard deviation).
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: a one-step pipeline that expands each text category into
# its own binary indicator column. With handle_unknown='ignore', a category that
# was not present when the encoder was fitted does not raise at transform time.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[('onehot', onehot_encoder)])

print("Preprocessing transformers defined for numerical (StandardScaler) and categorical (OneHotEncoder) data.")

Preprocessing transformers defined for numerical (StandardScaler) and categorical (OneHotEncoder) data.


In [14]:
# Route each column group to its matching pipeline. Every entry is
# (branch name, transformer, columns it applies to):
#   'num' -> numerical_transformer on numerical_features
#   'cat' -> categorical_transformer on categorical_features
# Columns named in neither list are dropped (ColumnTransformer's default remainder).
column_routes = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

print("ColumnTransformer defined to apply specific preprocessing steps to numerical and categorical features.")

ColumnTransformer defined to apply specific preprocessing steps to numerical and categorical features.


In [15]:
# y is the 'diabetes' label column; X is every remaining column.
y = diabetes_df[target]
X = diabetes_df.drop(columns=[target])

# Hold out 20% of the rows for testing and train on the other 80%.
#   random_state=42 -> the same rows land in each set on every run
#   stratify=y      -> both sets keep (approximately) the same 0/1 ratio as y,
#                      which matters because the classes are heavily imbalanced
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

# Report the split sizes and verify the class balance was preserved.
print("Data split into training and testing sets:")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train distribution:\n{y_train.value_counts(normalize=True)}")
print(f"y_test distribution:\n{y_test.value_counts(normalize=True)}")

Data split into training and testing sets:
X_train shape: (80000, 8)
X_test shape: (20000, 8)
y_train distribution:
diabetes
0    0.915
1    0.085
Name: proportion, dtype: float64
y_test distribution:
diabetes
0    0.915
1    0.085
Name: proportion, dtype: float64


In [16]:
# Frequency of each 'smoking_history' category across the full (pre-split) dataset.
# Requires `diabetes_df` to have been loaded by an earlier cell.
smoking_counts = diabetes_df['smoking_history'].value_counts()

print("Distribution of 'smoking_history' in the dataset:")
print(smoking_counts)

Distribution of 'smoking_history' in the dataset:
smoking_history
No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: count, dtype: int64


Preprocessor or model_pipeline not found or not fitted yet. Please run preceding cells.
