In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing   import StandardScaler

import sys
sys.path.append("C:/Auto_ML/")
from classification.logreg import LogReg
from classification.dtree_clf import DTreeClassifier
from classification.gaussiannb import NB

%matplotlib inline

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the dataset from a CSV file; the relative path is resolved against the current working directory.
heart = pd.read_csv('heart_2020_cleaned.csv')

In [3]:
# Preview the first 15 rows of the loaded frame.
heart.head(15)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
5,Yes,28.87,Yes,No,No,6.0,0.0,Yes,Female,75-79,Black,No,No,Fair,12.0,No,No,No
6,No,21.63,No,No,No,15.0,0.0,No,Female,70-74,White,No,Yes,Fair,4.0,Yes,No,Yes
7,No,31.64,Yes,No,No,5.0,0.0,Yes,Female,80 or older,White,Yes,No,Good,9.0,Yes,No,No
8,No,26.45,No,No,No,0.0,0.0,No,Female,80 or older,White,"No, borderline diabetes",No,Fair,5.0,No,Yes,No
9,No,40.69,No,No,No,0.0,0.0,Yes,Male,65-69,White,No,Yes,Good,10.0,No,No,No


In [4]:
# (number of rows, number of columns)
heart.shape

(319795, 18)

In [5]:
# Per-column dtype and non-null count.
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     319795 non-null  object 
 17  SkinCancer        319795 non-null  object 
dtypes: float64(4), object(14)

In [6]:
# Number of missing values in each column.
heart.isnull().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [7]:
def change_to_binary(x):
    """Encode a 'Yes'/'No' answer as an integer flag.

    Parameters
    ----------
    x : object
        A single cell value.

    Returns
    -------
    int or float
        1 when ``x == 'Yes'``, 0 when ``x == 'No'``, and NaN for any
        other value.
    """
    if x == 'Yes':
        return 1
    elif x == 'No':
        return 0
    else:
        # Use the canonical lowercase spelling: the `np.NaN` alias was
        # removed in NumPy 2.0 and raises AttributeError there.
        return np.nan

# Encode the target column as 1/0. The dtype guard keeps this cell safe to
# re-run: applying change_to_binary to the already-encoded 0/1 values would
# match neither 'Yes' nor 'No' and silently turn every label into NaN.
if not pd.api.types.is_numeric_dtype(heart['HeartDisease']):
    heart['HeartDisease'] = heart['HeartDisease'].apply(change_to_binary)

In [8]:
def is_yes_no(x):
    """Return True when ``x`` equals 'Yes' or 'No', otherwise False."""
    return x in ('Yes', 'No')

# Collect every column whose values are all 'Yes' or 'No'.
columns_to_change = []
for column in heart.columns:
    if all(is_yes_no(value) for value in heart[column]):
        columns_to_change.append(column)

# Recode each of those columns to 1/0 in place.
for column in columns_to_change:
    heart[column] = heart[column].apply(change_to_binary)

# One Hot Encoding

In [9]:
# Column names before the one-hot encoding step below.
heart.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [10]:
# One-hot encode the remaining non-numeric columns; each new indicator
# column is named '<column>_<value>' (e.g. 'Race_White').
heart = pd.get_dummies(heart, prefix_sep='_')
# Show the first rows of the encoded frame.
heart.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,PhysicalActivity,SleepTime,...,Race_White,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy),GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good
0,0,16.6,1,0,0,3.0,30.0,0,1,5.0,...,True,False,False,True,False,False,False,False,False,True
1,0,20.34,0,0,1,0.0,0.0,0,1,7.0,...,True,True,False,False,False,False,False,False,False,True
2,0,26.58,1,0,0,20.0,30.0,0,1,8.0,...,True,False,False,True,False,False,True,False,False,False
3,0,24.21,0,0,0,0.0,0.0,0,0,6.0,...,True,True,False,False,False,False,False,True,False,False
4,0,23.71,0,0,0,28.0,0.0,1,1,8.0,...,True,True,False,False,False,False,False,False,False,True


In [11]:
# Target vector: the HeartDisease label column.
y = heart['HeartDisease']
# Feature matrix: every column except the target.
X = heart.drop(columns=['HeartDisease'])

In [12]:
sc = StandardScaler()
X = sc.fit_transform(X)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
logreg??

Object `logreg` not found.


In [15]:
# logreg = LogReg(X_train=X_train, X_test = X_test, y_train = y_train, y_test = y_test)
# logreg.run_logreg_models()

In [16]:
# dtree = DTreeClassifier(X_train=X_train, X_test = X_test, y_train = y_train, y_test = y_test)
# dtree.run_dtree_models()

In [17]:
# Build the Naive Bayes runner from the train/test splits.
gaussiannb = NB(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
# Run it; this prints the classification reports shown below.
gaussiannb.run_gaussian_models()

Gaussian Naive Bayes before tuning:               precision    recall  f1-score   support

           0       0.97      0.71      0.82     58444
           1       0.20      0.80      0.32      5515

    accuracy                           0.71     63959
   macro avg       0.59      0.75      0.57     63959
weighted avg       0.91      0.71      0.78     63959

Gaussian Naive Bayes after tuning:               precision    recall  f1-score   support

           0       0.95      0.89      0.92     58444
           1       0.30      0.50      0.38      5515

    accuracy                           0.86     63959
   macro avg       0.63      0.69      0.65     63959
weighted avg       0.89      0.86      0.87     63959

