In [15]:
import pandas as pd
import kagglehub
from sklearn.model_selection import train_test_split

# Fetch the Kaggle dataset and read the CSV it contains into a DataFrame.
print("Downloading dataset for manual preprocessing...")
path = kagglehub.dataset_download("redwankarimsony/heart-disease-data")
file_path = f'{path}/heart_disease_uci.csv'
df = pd.read_csv(file_path)
print("Dataset loaded successfully for manual preprocessing.")

# 'num' is the prediction target. It is removed from the feature frame
# together with the 'id' and 'dataset' columns in a single drop.
y = df['num']
X = df.drop(columns=['num', 'id', 'dataset'])

# Column groups used by the numerical and categorical preprocessing cells.
numerical_features = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

# 80/20 train/test split with a fixed seed, stratified on the target so both
# splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

Downloading dataset for manual preprocessing...
Using Colab cache for faster access to the 'heart-disease-data' dataset.
Dataset loaded successfully for manual preprocessing.


In [16]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd


# Numerical preprocessing: mean-impute missing values, then standardise.
# All statistics (imputation means, scaling mean/variance) are learned from the
# training split only and applied unchanged to the test split.
num_imputer_train = SimpleImputer(strategy='mean')
X_train_numerical_imputed = pd.DataFrame(
    num_imputer_train.fit_transform(X_train[numerical_features]),
    columns=numerical_features,
    index=X_train.index,
)

# Reuse the imputer fitted above rather than building and fitting a second,
# identical one on the same training data. `num_imputer_test` is kept as an
# alias so existing references to that name continue to work.
num_imputer_test = num_imputer_train
X_test_numerical_imputed = pd.DataFrame(
    num_imputer_test.transform(X_test[numerical_features]),
    columns=numerical_features,
    index=X_test.index,
)

# NOTE: `schaler` is a misspelling of "scaler"; the name is left unchanged so
# any other cell that refers to it keeps working.
schaler = StandardScaler()
X_train_numerical_scaled = pd.DataFrame(
    schaler.fit_transform(X_train_numerical_imputed),
    columns=numerical_features,
    index=X_train.index,
)

# The test split is only transformed, never fitted on.
X_test_numerical_scaled = pd.DataFrame(
    schaler.transform(X_test_numerical_imputed),
    columns=numerical_features,
    index=X_test.index,
)

In [17]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Categorical preprocessing: fill missing values with the most frequent
# training value per column, then one-hot encode. Everything is fitted on the
# training split only.
cat_imputer_train = SimpleImputer(strategy='most_frequent')
X_train_categorical_imputed = pd.DataFrame(
    cat_imputer_train.fit_transform(X_train[categorical_features]),
    columns=categorical_features,
    index=X_train.index,
)

# Reuse the imputer fitted above rather than building and fitting a second,
# identical one on the same training data. `cat_imputer_test` is kept as an
# alias so existing references to that name continue to work.
cat_imputer_test = cat_imputer_train
X_test_categorical_imputed = pd.DataFrame(
    cat_imputer_test.transform(X_test[categorical_features]),
    columns=categorical_features,
    index=X_test.index,
)

# drop='first' omits one indicator column per feature; sparse_output=False
# yields a dense array that can be wrapped in a DataFrame directly.
# NOTE(review): per the scikit-learn docs, with handle_unknown='ignore' a
# category unseen during fit is encoded as all zeros, which is the same
# encoding as the dropped first category -- confirm this is acceptable.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
X_train_categorical_values = encoder.fit_transform(X_train_categorical_imputed)

# Output column names are only available once the encoder is fitted; compute
# them once and share them between the train and test frames.
encoded_feature_names = encoder.get_feature_names_out(categorical_features)

X_train_categorical_encoded = pd.DataFrame(
    X_train_categorical_values,
    columns=encoded_feature_names,
    index=X_train.index,
)

X_test_categorical_encoded = pd.DataFrame(
    encoder.transform(X_test_categorical_imputed),
    columns=encoded_feature_names,
    index=X_test.index,
)

In [18]:
from sklearn.linear_model import LogisticRegression
import pandas as pd

# Build the final feature matrices by placing the scaled numerical columns and
# the one-hot encoded categorical columns side by side.
X_train_processed = pd.concat(
    (X_train_numerical_scaled, X_train_categorical_encoded),
    axis='columns',
)
X_test_processed = pd.concat(
    (X_test_numerical_scaled, X_test_categorical_encoded),
    axis='columns',
)

# Fit logistic regression on the processed training data; the seed is fixed
# and the iteration cap is raised to 1000.
lr_model_manual = LogisticRegression(max_iter=1000, random_state=42)
lr_model_manual.fit(X_train_processed, y_train)

# Class predictions for the held-out test rows.
y_pred_lr_manual = lr_model_manual.predict(X_test_processed)

In [19]:
from sklearn.metrics import classification_report

# Print a per-class precision / recall / F1 summary for the test predictions.
# zero_division=0 reports 0 for a metric whose denominator is zero instead of
# emitting a warning.
print("--- Logistic Regression Performance (Manual Preprocessing) ---")
report_lr_manual = classification_report(
    y_test,
    y_pred_lr_manual,
    zero_division=0,
)
print(report_lr_manual)

--- Logistic Regression Performance (Manual Preprocessing) ---
              precision    recall  f1-score   support

           0       0.80      0.85      0.83        82
           1       0.46      0.57      0.51        53
           2       0.38      0.14      0.20        22
           3       0.22      0.24      0.23        21
           4       0.00      0.00      0.00         6

    accuracy                           0.59       184
   macro avg       0.37      0.36      0.35       184
weighted avg       0.56      0.59      0.57       184

