In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample, shuffle
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, make_scorer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

In [None]:
ordinal = ['_INCOMG', '_LMTSCL1', '_AGE_G', 'GENHLTH', '_SMOKER3', 'USENOW3', '_ASTHMS1', '_BMI5CAT', '_CHLDCNT', '_CHOLCHK', '_EDUCAG', '_LMTACT1', '_LMTWRK1', '_PA150R2', '_PA300R2', '_PACAT1', 'CHECKUP1', 'CHOLCHK', 'EDUCA']
should_be_continuous = ['_AGE80']
should_drop = ['QSTLANG', '_PSU', '_FRT16', '_FRTRESP', '_FRUITEX', '_MISFRTN', '_MISVEGN', 'DISPCODE']

In [2]:
# categorical: missing values will be replaced with the mode.
categorical = [
    "HAVARTH3",
    "HLTHPLN1",
    "MARITAL",
    "MEDCOST",
    "PERSDOC2",
    "CHCCOPD1",
    "_INCOMG",
    "TOLDHI2",
    "DIFFWALK",
    "_LMTSCL1",
    "_RFCHOL",
    "QLACTLM2",
    "PNEUVAC3",
    "CVDSTRK3",
    "DIABETE3",
    "_AGE65YR",
    "_RFHYPE5",
    "_RFHLTH",
    "SEX",
    "_DRDXAR1",
    "BPHIGH4",
    "_AGE_G",
    "_AGE80",
    "GENHLTH",
    "_HCVU651",
    "EMPLOY1",
    "_RACE",
    "_RACEG21",
    "_RACEGR3",
    "_RACE_G1",
    "_RFBING5",
    "_RFBMI5",
    "_RFDRHV5",
    "_RFSEAT2",
    "_RFSEAT3",
    "_RFSMOK3",
    "_SMOKER3",
    "_TOTINDA",
    "_VEGLT1",
    "QSTLANG",
    "RENTHOM1",
    "SMOKE100",
    "USEEQUIP",
    "USENOW3",
    "VETERAN3",
    "_AIDTST3",
    "_ASTHMS1",
    "_BMI5CAT",
    "_CHISPNC",
    "_CHLDCNT",
    "_CHOLCHK",
    "_EDUCAG",
    "_FRT16",
    "_FRTLT1",
    "_FRTRESP",
    "_FRUITEX",
    "_HISPANC",
    "_LMTACT1",
    "_LMTWRK1",
    "_LTASTH1",
    "_MISFRTN",
    "_MISVEGN",
    "_MRACE1",
    "_PA150R2",
    "_PA30021",
    "_PA300R2",
    "_PACAT1",
    "_PAINDX1",
    "_PAREC1",
    "_PASTAE1",
    "_PASTRNG",
    "_PRACE1",
    "ADDEPEV2",
    "ASTHMA3",
    "BLIND",
    "BLOODCHO",
    "CHCKIDNY",
    "CHCOCNCR",
    "CHCSCNCR",
    "CHECKUP1",
    "CHOLCHK",
    "DECIDE",
    "DIFFALON",
    "DIFFDRES",
    "DISPCODE",
    "DRNKANY5",
    "EDUCA",
    "EXERANY2",
    "FLUSHOT6",
]
# continuous: missing values will be replaced with the median. These need to be scaled.
continuous = [
    "GRENDAY_",
    "MENTHLTH",
    "ORNGDAY_",
    "HTIN4",
    "WTKG3",
    "FC60_",
    "CHILDREN",
    "MAXVO2_",
    "PHYSHLTH",
    "_VEGESUM",
    "_DRNKWEK",
    "STRFREQ_",
    "VEGEDA1_",
    "VEGETAB1",
    "STRENGTH",
    "_BMI5",
    "DROCDY3_",
    "BEANDAY_",
    "ALCDAY5",
    "FRUIT1",
]

target = ["_MICHD"]

In [None]:

for item in ordinal:
    if item in categorical:
        categorical.remove(item)
    if item not in continuous:
        continuous.append(item)
for item in should_be_continuous:
    if item not in continuous:
        continuous += [item]
    if item in categorical:
        categorical.remove(item)
categorical

In [None]:
print(continuous)

In [3]:
# file_path = '../heart_disease_health_indicators_BRFSS2015.csv'
file_path = '../../2015_clean.csv'
# file_path = 'data_2015_dropped_filtered_1.csv'
# file_path = 'data_2015_filtered_1.csv'
data = pd.read_csv(file_path)

In [4]:
# TARGET = 'HeartDiseaseorAttack'
TARGET = '_MICHD'
# data = data.drop(['PhysHlth'], axis=1)
# Separate the majority and minority classes for balancing
data.dropna(subset=TARGET, inplace=True)
y = data[TARGET]
X = data.drop([TARGET], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [5]:
X[categorical].dtypes

HAVARTH3    float64
HLTHPLN1    float64
MARITAL     float64
MEDCOST     float64
PERSDOC2    float64
             ...   
DISPCODE    float64
DRNKANY5    float64
EDUCA       float64
EXERANY2    float64
FLUSHOT6    float64
Length: 89, dtype: object

In [None]:
X['_INCOMG'].value_counts()

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

cat_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("cat_encoder", OneHotEncoder(sparse=False, drop='first')),
    ]
)


num_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        # ("std_scaler", StandardScaler()),
    ]
)

full_pipeline = ColumnTransformer(
    [
        ("num", num_pipeline, continuous),
        ("cat", cat_pipeline, categorical),
    ]
)

In [None]:
X_train

In [7]:
full_pipeline.fit(X_train)
# Transform the training data and convert it back to a DataFrame with column names
X_train = pd.DataFrame(
    full_pipeline.transform(X_train),
    columns=continuous + full_pipeline.named_transformers_["cat"].get_feature_names_out(categorical).tolist(),
)
X_test = pd.DataFrame(
    full_pipeline.transform(X_test),
    columns=continuous + full_pipeline.named_transformers_["cat"].get_feature_names_out(categorical).tolist(),
)



In [8]:
X_train

Unnamed: 0,GRENDAY_,MENTHLTH,ORNGDAY_,HTIN4,WTKG3,FC60_,CHILDREN,MAXVO2_,PHYSHLTH,_VEGESUM,_DRNKWEK,STRFREQ_,VEGEDA1_,VEGETAB1,STRENGTH,_BMI5,DROCDY3_,BEANDAY_,ALCDAY5,FRUIT1,HAVARTH3_1.0,HLTHPLN1_1.0,MARITAL_2.0,MARITAL_3.0,MARITAL_4.0,MARITAL_5.0,MARITAL_6.0,MEDCOST_1.0,PERSDOC2_2.0,PERSDOC2_3.0,CHCCOPD1_1.0,_INCOMG_2.0,_INCOMG_3.0,_INCOMG_4.0,_INCOMG_5.0,TOLDHI2_2.0,DIFFWALK_1.0,_LMTSCL1_2.0,_LMTSCL1_3.0,_LMTSCL1_4.0,_RFCHOL_1.0,QLACTLM2_1.0,PNEUVAC3_1.0,CVDSTRK3_1.0,DIABETE3_2.0,DIABETE3_3.0,DIABETE3_4.0,_AGE65YR_2.0,_RFHYPE5_1.0,_RFHLTH_2.0,SEX_1.0,_DRDXAR1_1.0,BPHIGH4_2.0,BPHIGH4_3.0,BPHIGH4_4.0,_AGE_G_2.0,_AGE_G_3.0,_AGE_G_4.0,_AGE_G_5.0,_AGE_G_6.0,_AGE80_19.0,_AGE80_20.0,_AGE80_21.0,_AGE80_22.0,_AGE80_23.0,_AGE80_24.0,_AGE80_25.0,_AGE80_26.0,_AGE80_27.0,_AGE80_28.0,_AGE80_29.0,_AGE80_30.0,_AGE80_31.0,_AGE80_32.0,_AGE80_33.0,_AGE80_34.0,_AGE80_35.0,_AGE80_36.0,_AGE80_37.0,_AGE80_38.0,_AGE80_39.0,_AGE80_40.0,_AGE80_41.0,_AGE80_42.0,_AGE80_43.0,_AGE80_44.0,_AGE80_45.0,_AGE80_46.0,_AGE80_47.0,_AGE80_48.0,_AGE80_49.0,_AGE80_50.0,_AGE80_51.0,_AGE80_52.0,_AGE80_53.0,_AGE80_54.0,_AGE80_55.0,_AGE80_56.0,_AGE80_57.0,_AGE80_58.0,_AGE80_59.0,_AGE80_60.0,_AGE80_61.0,_AGE80_62.0,_AGE80_63.0,_AGE80_64.0,_AGE80_65.0,_AGE80_66.0,_AGE80_67.0,_AGE80_68.0,_AGE80_69.0,_AGE80_70.0,_AGE80_71.0,_AGE80_72.0,_AGE80_73.0,_AGE80_74.0,_AGE80_75.0,_AGE80_76.0,_AGE80_77.0,_AGE80_78.0,_AGE80_79.0,_AGE80_80.0,GENHLTH_2.0,GENHLTH_3.0,GENHLTH_4.0,GENHLTH_5.0,_HCVU651_1.0,EMPLOY1_2.0,EMPLOY1_3.0,EMPLOY1_4.0,EMPLOY1_5.0,EMPLOY1_6.0,EMPLOY1_7.0,EMPLOY1_8.0,_RACE_2.0,_RACE_3.0,_RACE_4.0,_RACE_5.0,_RACE_6.0,_RACE_7.0,_RACE_8.0,_RACEG21_2.0,_RACEGR3_2.0,_RACEGR3_3.0,_RACEGR3_4.0,_RACEGR3_5.0,_RACE_G1_2.0,_RACE_G1_3.0,_RACE_G1_4.0,_RACE_G1_5.0,_RFBING5_2.0,_RFBMI5_2.0,_RFDRHV5_2.0,_RFSEAT2_2.0,_RFSEAT3_2.0,_RFSMOK3_2.0,_SMOKER3_2.0,_SMOKER3_3.0,_SMOKER3_4.0,_TOTINDA_2.0,_VEGLT1_2.0,QSTLANG_2.0,RENTHOM1_2.0,RENTHOM1_3.0,SMOKE100_2.0,USEEQUIP_2.0,USENOW3_2.0,USENOW3_3.0,VETERAN3_2.0,_AIDTST3_2.0,_ASTHMS1_2.0,_ASTHMS1_3.0,_BMI5CAT_2.0,_BMI5CAT_3.0,_BMI5CAT_4.0,_CHISPNC_2.0,_CHLDCNT_2.0,_CHLDCNT_3.0,_CHLDCNT_4.0,_CHLDCNT_5.0,_CHLDCNT_6.0,_CHOLCHK_2.0,_CHOLCHK_3.0,_EDUCAG_2.0,_EDUCAG_3.0,_EDUCAG_4.0,_FRT16_1.0,_FRTLT1_1.0,_FRTRESP_1.0,_FRUITEX_1.0,_FRUITEX_2.0,_HISPANC_1.0,_LMTACT1_2.0,_LMTACT1_3.0,_LMTWRK1_2.0,_LMTWRK1_3.0,_LTASTH1_1.0,_MISFRTN_1.0,_MISFRTN_2.0,_MISVEGN_1.0,_MISVEGN_2.0,_MISVEGN_3.0,_MISVEGN_4.0,_MRACE1_2.0,_MRACE1_3.0,_MRACE1_4.0,_MRACE1_5.0,_MRACE1_6.0,_MRACE1_7.0,_PA150R2_2.0,_PA150R2_3.0,_PA30021_2.0,_PA300R2_2.0,_PA300R2_3.0,_PACAT1_2.0,_PACAT1_3.0,_PACAT1_4.0,_PAINDX1_1.0,_PAREC1_2.0,_PAREC1_3.0,_PAREC1_4.0,_PASTAE1_1.0,_PASTRNG_1.0,_PRACE1_2.0,_PRACE1_3.0,_PRACE1_4.0,_PRACE1_5.0,_PRACE1_6.0,_PRACE1_7.0,_PRACE1_8.0,ADDEPEV2_1.0,ASTHMA3_1.0,BLIND_1.0,BLOODCHO_1.0,CHCKIDNY_1.0,CHCOCNCR_1.0,CHCSCNCR_1.0,CHECKUP1_2.0,CHECKUP1_3.0,CHECKUP1_4.0,CHECKUP1_8.0,CHOLCHK_2.0,CHOLCHK_3.0,CHOLCHK_4.0,DECIDE_1.0,DIFFALON_1.0,DIFFDRES_1.0,DISPCODE_1200.0,DRNKANY5_1.0,EDUCA_2.0,EDUCA_3.0,EDUCA_4.0,EDUCA_5.0,EDUCA_6.0,EXERANY2_1.0,FLUSHOT6_1.0
0,3.000000e+00,0.0,5.397605e-79,71.0,8618.0,519.0,0.0,3030.0,0.0,6.0,5.397605e-79,5.397605e-79,5.397605e-79,555.0,13.0,2650.0,5.397605e-79,3.000000e+00,0.00,60.00,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2.900000e+01,0.0,4.300000e+01,70.0,9525.0,425.0,0.0,2480.0,0.0,172.0,4.700000e+01,5.397605e-79,1.000000e+02,101.0,13.0,3013.0,3.000000e+00,5.397605e-79,1.00,30.00,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,1.000000e+02,0.0,4.000000e+01,66.0,7711.0,373.0,0.0,2173.0,21.0,311.0,5.397605e-79,5.397605e-79,1.000000e+02,330.0,13.0,2744.0,5.397605e-79,7.100000e+01,0.00,0.00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,5.397605e-79,30.0,5.397605e-79,63.0,7348.0,487.0,0.0,2839.0,30.0,43.0,7.000000e+02,5.397605e-79,4.300000e+01,203.0,13.0,2870.0,1.000000e+02,5.397605e-79,30.31,12.99,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
4,5.397605e-79,0.0,5.397605e-79,69.0,7484.0,265.0,0.0,1545.0,0.0,27.0,5.397605e-79,2.330000e+02,1.000000e+01,303.0,1.0,2437.0,5.397605e-79,1.700000e+01,0.00,0.00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54081,5.397605e-79,0.0,5.397605e-79,74.0,10433.0,633.0,0.0,3690.0,30.0,100.0,5.397605e-79,3.000000e+03,5.397605e-79,555.0,13.0,2953.0,5.397605e-79,1.000000e+02,0.00,3.00,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
54082,1.300000e+01,7.0,1.300000e+01,69.0,7484.0,802.0,0.0,4680.0,4.0,149.0,5.397605e-79,9.330000e+02,1.000000e+02,101.0,4.0,2437.0,4.300000e+01,2.300000e+01,12.99,0.00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
54083,6.700000e+01,10.0,4.000000e+01,67.0,7938.0,417.0,0.0,2432.0,0.0,223.0,5.397605e-79,5.397605e-79,8.300000e+01,325.0,13.0,2744.0,5.397605e-79,3.300000e+01,0.00,20.00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
54084,4.300000e+01,0.0,7.100000e+01,73.0,7484.0,453.0,0.0,2645.0,0.0,328.0,2.000000e+02,5.397605e-79,2.000000e+02,102.0,13.0,2177.0,1.400000e+01,1.400000e+01,4.33,30.00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
for col in categorical:
    X_train[col] = pd.factorize(X_train[col])[0]
    X_test[col] = pd.factorize(X_test[col])[0]

In [None]:
X_train

### 

In [None]:
X_train[categorical] = X_train[categorical].round()
X_train[continuous] = X_train[continuous].round(3)
for col in categorical:
    category_order = sorted(X_train[col].unique())
    X_train[col] = pd.Categorical(X_train[col], categories=category_order, ordered=False)

X_test[categorical] = X_test[categorical].round()
X_test[continuous] = X_test[continuous].round(3)
# X_test[categorical] = X_test[categorical].astype('category')
for col in categorical:
    category_order = sorted(X_test[col].unique())
    X_test[col] = pd.Categorical(X_test[col], categories=category_order, ordered=False)
# X_test.dtypes[:-40]

In [None]:
X_train

In [9]:
clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
importances = clf.feature_importances_

In [10]:
feature_list = list(X_train.columns)
feature_importance = pd.DataFrame({'Feature': feature_list, 'Importance': importances})
sorted_importance = feature_importance.sort_values('Importance', ascending=False)
sorted_importance

Unnamed: 0,Feature,Importance
49,_RFHLTH_2.0,0.032672
53,BPHIGH4_3.0,0.032290
48,_RFHYPE5_1.0,0.030920
5,FC60_,0.021209
47,_AGE65YR_2.0,0.021184
...,...,...
63,_AGE80_22.0,0.000052
60,_AGE80_19.0,0.000050
186,_FRT16_1.0,0.000049
61,_AGE80_20.0,0.000037


In [11]:
l = list(sorted_importance['Feature'])
l

['_RFHLTH_2.0',
 'BPHIGH4_3.0',
 '_RFHYPE5_1.0',
 'FC60_',
 '_AGE65YR_2.0',
 '_AGE_G_6.0',
 'MAXVO2_',
 'DIFFWALK_1.0',
 'EMPLOY1_7.0',
 'QLACTLM2_1.0',
 'DIABETE3_3.0',
 'SEX_1.0',
 'CVDSTRK3_1.0',
 'PHYSHLTH',
 'PNEUVAC3_1.0',
 '_RFCHOL_1.0',
 'USEEQUIP_2.0',
 'CHCCOPD1_1.0',
 'TOLDHI2_2.0',
 'HTIN4',
 'WTKG3',
 '_BMI5',
 'GENHLTH_2.0',
 'GENHLTH_3.0',
 'BEANDAY_',
 '_VEGESUM',
 'VEGETAB1',
 'GRENDAY_',
 'ORNGDAY_',
 '_DRDXAR1_1.0',
 'FRUIT1',
 'HAVARTH3_1.0',
 'VEGEDA1_',
 '_AGE80_80.0',
 'GENHLTH_4.0',
 'VETERAN3_2.0',
 '_LMTACT1_3.0',
 '_INCOMG_5.0',
 'FLUSHOT6_1.0',
 'ALCDAY5',
 'DROCDY3_',
 '_DRNKWEK',
 'MENTHLTH',
 '_FRTLT1_1.0',
 'DRNKANY5_1.0',
 '_BMI5CAT_3.0',
 'GENHLTH_5.0',
 '_SMOKER3_3.0',
 '_SMOKER3_4.0',
 'SMOKE100_2.0',
 'EMPLOY1_8.0',
 'STRFREQ_',
 '_BMI5CAT_4.0',
 'MARITAL_3.0',
 'ADDEPEV2_1.0',
 'STRENGTH',
 'CHCOCNCR_1.0',
 '_CHOLCHK_3.0',
 '_VEGLT1_2.0',
 '_AGE_G_5.0',
 'RENTHOM1_2.0',
 'EDUCA_5.0',
 '_EDUCAG_2.0',
 'EDUCA_4.0',
 '_EDUCAG_4.0',
 '_AIDTST3_2.0',
 '

In [None]:
X_train.dtypes

In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

def evaluate_model(model, name, X_train, X_test, y_train, y_test):
    if name == "LogisticRegression":
        X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
        model.fit(X_train_scaled, y_train)
        X_test_scaled = pd.DataFrame(scaler.fit_transform(X_test), columns=X_train.columns)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(f"{name}:", f1)
    return f1

models = {
    "CatBoostClassifier": CatBoostClassifier(
        silent=True, 
        random_seed=42, 
        iterations=1000, 
        depth=4, 
        # learning_rate=0.1,
        # cat_features=categorical
    ),
    "AdaBoostClassifier": AdaBoostClassifier(
        n_estimators=200, 
        # learning_rate=1.0
    ),
    "RandomForestClassifier": RandomForestClassifier(
        n_estimators=1000, 
        # max_features='sqrt', 
        # min_samples_split=10, 
        # min_samples_leaf=4, 
        random_state=42
    ),
    "GradientBoostingClassifier": GradientBoostingClassifier(
        n_estimators=150, 
        # learning_rate=0.1, 
        max_depth=4
    ),
    "LGBMClassifier": LGBMClassifier(
        num_leaves=31, 
        max_depth=-1, 
        learning_rate=0.1, 
        verbose=-1
    ),
    # "LogisticRegression": LogisticRegression(max_iter=1000),
    # "SVC": SVC(),
    # "KNeighborsClassifier": KNeighborsClassifier(),
    # "GaussianNB": GaussianNB(),
    # "MLPClassifier": MLPClassifier(hidden_layer_sizes=(90), random_state=42),
    # "XGBClassifier": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    # "ExtraTreesClassifier": ExtraTreesClassifier(n_estimators=100)
}

results = {name: evaluate_model(model, name, X_train[l], X_test[l], y_train, y_test) for name, model in models.items()}
sorted_results = dict(sorted(results.items(), key=lambda item: item[1], reverse=True))
sorted_results

CatBoostClassifier: 0.7862253650780366
AdaBoostClassifier: 0.7779441331222997
RandomForestClassifier: 0.7839187683163412
GradientBoostingClassifier: 0.7870432450553134
LGBMClassifier: 0.7882882882882883


{'LGBMClassifier': 0.7882882882882883,
 'GradientBoostingClassifier': 0.7870432450553134,
 'CatBoostClassifier': 0.7862253650780366,
 'RandomForestClassifier': 0.7839187683163412,
 'AdaBoostClassifier': 0.7779441331222997}

In [None]:
pydata = X_train
pydata['_MICHD'] = y

In [None]:
y.value_counts()

In [None]:
from pycaret.classification import *
import pandas as pd
from sklearn.model_selection import train_test_split

train_data, unseen_data = train_test_split(pydata, test_size=0.2, random_state=123)
exp_clf = setup(data = train_data, target = TARGET, session_id=123)

best_model = compare_models()

In [None]:
rf = create_model('rf')

tuned_rf = tune_model(rf)

evaluate_model(tuned_rf)

predictions = predict_model(tuned_rf, data = unseen_data)

save_model(tuned_rf, 'my_random_forest_model')

In [None]:
# To load the model, you would use:
loaded_rf = load_model('my_random_forest_model')