In [1]:
# Imports
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [30]:
# Load the survey data. The first column is treated as the label and every
# remaining column as a feature.
df = pd.read_csv('heart_disease_health_indicators_BRFSS2015.csv')
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

# Split BEFORE scaling so that held-out rows never influence the scaler.
# Same random_state and row count as before, so the same rows end up in
# train and test.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn per-column min/max from the training rows only, then apply that same
# mapping to the test rows. Test values outside the training range may fall
# slightly outside [0, 1]; that is expected and is the honest behaviour.
scaler = MinMaxScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X.columns,
    index=X_test_raw.index,
)

# Kept so later cells that reference these names keep working: the complete
# feature matrix expressed on the train-fitted scale.
X_scaled = scaler.transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

In [31]:
# Display the training feature matrix produced by the split above
# (values are the MinMax-scaled features; row labels are the original row positions).
X_train

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
51457,1.0,1.0,1.0,0.267442,0.0,0.0,1.0,1.0,1.0,1.0,...,1.0,0.0,1.00,0.0,0.466667,1.0,0.0,0.916667,0.4,0.000000
104586,1.0,1.0,1.0,0.290698,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.50,0.0,0.066667,0.0,1.0,0.833333,0.8,0.285714
135715,0.0,0.0,1.0,0.186047,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.25,0.0,0.000000,0.0,0.0,0.333333,1.0,1.000000
229271,0.0,0.0,1.0,0.127907,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.50,0.0,0.000000,0.0,1.0,0.083333,1.0,1.000000
107891,0.0,0.0,1.0,0.209302,1.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.25,0.0,0.166667,0.0,1.0,0.500000,0.6,0.857143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,1.0,0.0,1.0,0.383721,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,1.0,0.00,0.5,0.000000,0.0,0.0,0.333333,0.6,0.000000
103694,1.0,1.0,1.0,0.197674,1.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.50,0.0,0.000000,0.0,1.0,0.833333,1.0,0.857143
131932,0.0,1.0,1.0,0.151163,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.25,0.0,0.100000,0.0,0.0,0.666667,1.0,1.000000
146867,0.0,0.0,0.0,0.127907,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.25,0.0,0.000000,0.0,0.0,0.333333,1.0,0.714286


In [36]:
# Recursive feature elimination with a liblinear logistic regression.
#
# RFE with the default step=1 drops exactly one feature per round, and the
# order in which features are dropped does not depend on where we stop. So a
# single fit down to the smallest subset size yields every larger subset too:
# kept features get ranking_ == 1, the last one eliminated gets 2, the one
# before that 3, and so on. This replaces five independent RFE runs with one.
#
# NOTE: after this cell `rfe_lr` is fitted with `min_features` selected
# features; `selected_features` holds the largest (last printed) subset.
min_features, max_features = 4, 8

lr = LogisticRegression(solver="liblinear")
rfe_lr = RFE(estimator=lr, n_features_to_select=min_features)
rfe_lr.fit(X_train, y_train)

for i in range(min_features, max_features + 1):
    # Top-i features = the kept ones plus the (i - min_features) last eliminated.
    top_i_mask = rfe_lr.ranking_ <= i - min_features + 1
    # Take names from the frame the selector was fitted on, not a global that
    # a later cell may have rebound.
    selected_features = X_train.columns[top_i_mask]
    print(selected_features)


Index(['HighChol', 'Stroke', 'GenHlth', 'Age'], dtype='object')
Index(['HighChol', 'Stroke', 'GenHlth', 'Sex', 'Age'], dtype='object')
Index(['HighBP', 'HighChol', 'Stroke', 'GenHlth', 'Sex', 'Age'], dtype='object')
Index(['HighBP', 'HighChol', 'CholCheck', 'Stroke', 'GenHlth', 'Sex', 'Age'], dtype='object')
Index(['HighBP', 'HighChol', 'CholCheck', 'Stroke', 'GenHlth', 'Sex', 'Age',
       'Income'],
      dtype='object')


In [37]:
# Build a 50/50 dataset: keep every positive diagnosis and pair it with an
# equally sized random draw of negative diagnoses, then shuffle the rows.
df = pd.read_csv('heart_disease_health_indicators_BRFSS2015.csv')

diagnosis = df['HeartDiseaseorAttack']
df_ones = df.loc[diagnosis.eq(1)]
df_zeros = df.loc[diagnosis.eq(0)]

# Undersample the negatives down to the number of positives (seeded).
df_zeros_sampled = df_zeros.sample(n=df_ones.shape[0], random_state=42)

# Stack both classes, shuffle (seeded) and renumber the rows from 0.
df_balanced = (
    pd.concat([df_ones, df_zeros_sampled], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [38]:
# Separate the balanced frame into features (all columns but the first) and
# the label (first column).
X = df_balanced.iloc[:, 1:]
y = df_balanced.iloc[:, 0]

# Split BEFORE scaling so that held-out rows never influence the scaler.
# Same random_state and row count as before, so the same rows end up in
# train and test.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn per-column min/max from the training rows only, then apply that same
# mapping to the test rows. Test values outside the training range may fall
# slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X.columns,
    index=X_test_raw.index,
)

# Kept so later cells that reference these names keep working: the complete
# balanced feature matrix expressed on the train-fitted scale.
X_scaled = scaler.transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

In [39]:
# Recursive feature elimination with a liblinear logistic regression, run on
# whatever X_train / y_train are currently in scope.
#
# RFE with the default step=1 drops exactly one feature per round, and the
# order in which features are dropped does not depend on where we stop. So a
# single fit down to the smallest subset size yields every larger subset too:
# kept features get ranking_ == 1, the last one eliminated gets 2, the one
# before that 3, and so on. This replaces five independent RFE runs with one.
#
# NOTE: after this cell `rfe_lr` is fitted with `min_features` selected
# features; `selected_features` holds the largest (last printed) subset.
min_features, max_features = 4, 8

lr = LogisticRegression(solver="liblinear")
rfe_lr = RFE(estimator=lr, n_features_to_select=min_features)
rfe_lr.fit(X_train, y_train)

for i in range(min_features, max_features + 1):
    # Top-i features = the kept ones plus the (i - min_features) last eliminated.
    top_i_mask = rfe_lr.ranking_ <= i - min_features + 1
    # Take names from the frame the selector was fitted on, not a global that
    # another cell may have rebound.
    selected_features = X_train.columns[top_i_mask]
    print(selected_features)

Index(['HighChol', 'Stroke', 'GenHlth', 'Age'], dtype='object')
Index(['HighChol', 'Stroke', 'GenHlth', 'Sex', 'Age'], dtype='object')
Index(['HighChol', 'Stroke', 'GenHlth', 'Sex', 'Age', 'Income'], dtype='object')
Index(['HighBP', 'HighChol', 'Stroke', 'GenHlth', 'Sex', 'Age', 'Income'], dtype='object')
Index(['HighBP', 'HighChol', 'CholCheck', 'Stroke', 'GenHlth', 'Sex', 'Age',
       'Income'],
      dtype='object')
