# Data Preparation Step

* Import the needed libraries

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

* Load dataset from a CSV file

In [4]:
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path
        Location of the CSV data; passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    dataset = pd.read_csv(file_path)
    return dataset

* Split the dataset into features (X) and target (y)

In [5]:
def split_features_target(df, target_column):
    """Separate a DataFrame into a feature frame and a target Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column. It is not modified.
    target_column : str
        Name of the column holding the label to predict.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without ``target_column`` and ``y`` is
        that column on its own.
    """
    target = df[target_column]
    features = df.drop(columns=[target_column])
    return features, target

* Split dataset into training, validation, and test sets while maintaining class distribution

In [6]:
def split_train_validation_test(X, y, train_size=0.7, test_size=0.2, val_size=0.1, random_state=42):
    """Split features and target into stratified train / validation / test sets.

    The split is done in two stages: first the training set is carved off, then
    the remainder is divided between validation and test. Both stages stratify
    on the target so class proportions are kept in every subset.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like or pandas.Series
        Target labels; also used as the stratification key.
    train_size, test_size, val_size : float
        Fractions of the full dataset for each subset. Each must lie strictly
        between 0 and 1, and together they must sum to 1.
    random_state : int
        Seed passed to both ``train_test_split`` calls for reproducibility.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises
    ------
    ValueError
        If any fraction is outside ``(0, 1)`` or the fractions do not sum to 1.
    """
    fractions = {"train_size": train_size, "test_size": test_size, "val_size": val_size}
    for name, value in fractions.items():
        if not 0 < value < 1:
            raise ValueError(f"{name} must be a fraction strictly between 0 and 1, got {value!r}")

    # Without this check, inconsistent fractions would silently yield subsets
    # whose sizes do not match what the caller asked for.
    total = train_size + test_size + val_size
    if abs(total - 1.0) > 1e-8:
        raise ValueError(
            f"train_size + test_size + val_size must sum to 1, got {total!r}"
        )

    # Stage 1: hold out everything that is not training data.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=(1 - train_size), stratify=y, random_state=random_state
    )

    # Stage 2: the test share is expressed relative to the held-out remainder,
    # not to the full dataset.
    test_share_of_remainder = test_size / (test_size + val_size)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=test_share_of_remainder, stratify=y_temp, random_state=random_state
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

* Print class distributions for training, validation, and test sets

In [11]:
def print_class_distribution(y_train, y_val, y_test):
    """Print the normalized class frequencies of each target subset.

    Parameters
    ----------
    y_train, y_val, y_test : pandas.Series
        Target labels of the training, validation, and test sets.

    Returns
    -------
    None
        The distributions are written to standard output only.
    """
    labelled_targets = (
        ("Training Class Distribution:\n", y_train),
        ("Validation Class Distribution:\n", y_val),
        ("Test Class Distribution:\n", y_test),
    )
    for header, target in labelled_targets:
        proportions = target.value_counts(normalize=True)
        print(header, proportions)

* Perform one-hot encoding for categorical columns

In [7]:
def encode_categorical_columns(df, categorical_columns):
    """One-hot encode the given columns of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    categorical_columns : list of str
        Names of the columns to replace with indicator columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each listed column is replaced by ``uint8``
        indicator columns produced by ``pandas.get_dummies``.
    """
    encoded = pd.get_dummies(data=df, columns=categorical_columns, dtype='uint8')
    return encoded

* Standardize numerical features using StandardScaler

In [8]:
def standardize_features(df):
    """Standardize every column of ``df`` with a freshly fitted ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame to scale.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column labels and the same row index as
        ``df``.

    Notes
    -----
    A new scaler is fitted on whatever frame is passed in, and the fitted
    scaler is not returned.
    NOTE(review): calling this separately on train/validation/test would scale
    each with different statistics -- confirm the intended usage.
    """
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df)
    # Reattach the original row index as well as the column labels. Without it
    # the result gets a fresh 0..n-1 index and no longer lines up with a target
    # Series that still carries the original (e.g. shuffled) index.
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

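* Main: run the full pipeline — load the data, one-hot encode the categorical columns, split into train/validation/test sets, print the class distributions, and pickle the splits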

In [17]:
def main(data_path="heart.csv", output_path="dataset.pkl"):
    """Run the data-preparation pipeline and pickle the resulting splits.

    Steps: load the CSV, separate features from the ``HeartDisease`` target,
    one-hot encode the categorical feature columns, split into stratified
    train/validation/test sets, print the class distribution of each set, and
    write the six resulting objects to a pickle file.

    Parameters
    ----------
    data_path : str
        Path of the input CSV file. Defaults to ``"heart.csv"``.
    output_path : str
        Path of the pickle file to write. Defaults to ``"dataset.pkl"``.

    Returns
    -------
    None
    """
    target_column = 'HeartDisease'
    categorical_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

    # Load dataset
    df = load_data(data_path)

    # Split features and target
    X, y = split_features_target(df, target_column)

    # One-hot encode the categorical feature columns
    X_encoded = encode_categorical_columns(X, categorical_columns)

    # Split data into train, validation, and test sets
    X_train, X_val, X_test, y_train, y_val, y_test = split_train_validation_test(X_encoded, y)

    # Validate class distribution
    print_class_distribution(y_train, y_val, y_test)

    # Standardization is optional and is not applied here
    # (standardize_features is available if it is needed).

    # The pickled tuple pairs each feature set with its target:
    # (X_train, y_train, X_val, y_val, X_test, y_test). This differs from the
    # order returned by split_train_validation_test, so consumers must unpack
    # it in this order.
    with open(output_path, 'wb') as f:
        pickle.dump((X_train, y_train, X_val, y_val, X_test, y_test), f)

# Run the pipeline only when this code is executed directly (not when imported).
if __name__ == "__main__":
    main()

Training Class Distribution:
 HeartDisease
1    0.55296
0    0.44704
Name: proportion, dtype: float64
Validation Class Distribution:
 HeartDisease
1    0.554348
0    0.445652
Name: proportion, dtype: float64
Test Class Distribution:
 HeartDisease
1    0.554348
0    0.445652
Name: proportion, dtype: float64
