In [None]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.4.1-py2.py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 3.5 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.4.1


In [None]:
# Mount Google Drive into the Colab runtime; the dataset is later read from a
# path underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [141]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Candidate methods for normalizing skewed features: scaling, log transformation, cube-root normalization, Box-Cox transformation
from scipy.stats import skew 
from scipy.stats import boxcox

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [142]:
# Read the dataset CSV from the mounted Drive and show its (rows, columns).
heart_csv_path = '/content/drive/MyDrive/heart_2020_cleaned.csv'
df = pd.read_csv(heart_csv_path)
df.shape

(319795, 18)

In [143]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence of each, then show the remaining (rows, columns).
df = df.drop_duplicates(keep='first')
df.shape

(301717, 18)

In [144]:
# Keep only rows with BMI below 80 and a SleepTime other than 24.
# NOTE(review): presumably these values are treated as implausible entries - confirm.
bmi_in_range = df['BMI'] < 80
sleep_not_full_day = df['SleepTime'] != 24
df = df[bmi_in_range & sleep_not_full_day]
df.shape

(301653, 18)

In [145]:
def yesno_tolabel(x):
    """Encode a yes/no style answer as a binary label.

    Returns 1 when ``x`` equals 'Yes' or 'Yes (during pregnancy)', and 0 for
    every other value.
    """
    positive_answers = ('Yes', 'Yes (during pregnancy)')
    return 1 if x in positive_answers else 0

In [146]:
# Columns holding yes/no style answers; each is converted to 0/1 with yesno_tolabel.
columns = [
    'HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking',
    'Diabetic', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer',
]
sex_map = {'Female': 0, 'Male': 1}
# The three best health ratings share the top level (2); 'Fair' is 1 and 'Poor' is 0.
health_map = {'Excellent': 2, 'Very good': 2, 'Good': 2, 'Fair': 1, 'Poor': 0}
# Each age bracket becomes a single number: its lower bound, except '18-24' which maps to 20.
age_map = {
    '18-24': 20, '25-29': 25, '30-34': 30, '35-39': 35, '40-44': 40,
    '45-49': 45, '50-54': 50, '55-59': 55, '60-64': 60, '65-69': 65,
    '70-74': 70, '75-79': 75, '80 or older': 80,
}

for column in columns:
    df[column] = df[column].map(yesno_tolabel)

# Values missing from a mapping become NaN, as with any dict-based Series.map.
for name, mapping in (('Sex', sex_map), ('GenHealth', health_map), ('AgeCategory', age_map)):
    df[name] = df[name].map(mapping)


In [147]:
def quantile_operation(outlier_df, column):
    """Compute the 1.5 * IQR outlier fences for one column.

    Parameters
    ----------
    outlier_df : DataFrame-like object indexable by ``column``.
    column : label of the column whose quartiles are evaluated.

    Returns
    -------
    tuple
        ``(lower, upper)`` where ``lower = Q1 - 1.5 * IQR`` and
        ``upper = Q3 + 1.5 * IQR``.
    """
    values = outlier_df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    return q1 - 1.5 * spread, q3 + 1.5 * spread

In [148]:
# Box-Cox only accepts strictly positive input, so the two day-count columns
# are shifted up by one before being transformed.
# NOTE(review): this cell is not idempotent - re-running it shifts and
# transforms the already-transformed columns again.
df['PhysicalHealth'] = df['PhysicalHealth'] + 1
df['MentalHealth'] = df['MentalHealth'] + 1
outlier_columns = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']

# Fitted Box-Cox lambda per column, kept so the same transform can later be
# applied to new data or inverted (boxcox returns (transformed, lambda)).
boxcox_lambdas = {}
for column in outlier_columns:
    # NOTE(review): the IQR fences are computed on the pre-transform values but
    # are not used in this cell; kept in case a later cell reads them.
    Lower_tail, Upper_tail = quantile_operation(df, column)
    df[column], boxcox_lambdas[column] = boxcox(df[column])

In [151]:
# Race encoding - binary
# NOTE(review): this cell reads and then drops 'Race'. The other Race-encoding
# cells in this notebook do the same, so only one of them can run per session;
# running this after another one raises KeyError on 'Race'.
import category_encoders as ce
import pandas as pd

ce_binary = ce.BinaryEncoder(cols=['Race'], return_df=True)
ce_binary_encoded = ce_binary.fit_transform(df['Race'])

# Swap the original 'Race' column for the binary-encoded columns.
df = pd.concat([df.drop(columns=['Race']), ce_binary_encoded], axis=1)

KeyError: ignored

In [149]:
# Race encoding - target encoding
# Each race is replaced by the share of its rows where HeartDisease == 1.
# NOTE(review): this cell reads and then drops 'Race'. The other Race-encoding
# cells in this notebook do the same, so only one of them can run per session.
categories = df['Race'].unique()
targets = df['HeartDisease'].unique()
cat_list = []

for cat in categories:
    counts = df.loc[df['Race'] == cat, 'HeartDisease'].value_counts()
    aux_dict = {'category': cat, 'count': counts.sum()}
    for t in targets:
        # A race with no rows for some target value contributes a count of 0
        # instead of raising KeyError.
        aux_dict['Race_' + str(t)] = counts[t] if t in counts.index else 0
    cat_list.append(aux_dict)

cat_list = pd.DataFrame(cat_list)
cat_list['genre_encoded_dumb'] = cat_list['Race_1'] / cat_list['count']

# Build the mapping from the table itself so every rate is tied to its own
# category label, independent of the order in which races first appear in df.
target_dict = dict(zip(cat_list['category'], cat_list['genre_encoded_dumb']))
df['Race_encoder'] = df['Race'].map(target_dict)

df = df.drop(columns=['Race'])

In [None]:
# Race encoding - one-hot encoding
# NOTE(review): this cell reads and then drops 'Race'. The other Race-encoding
# cells in this notebook do the same, so only one of them can run per session.
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# The keyword requesting dense output was renamed from `sparse` to
# `sparse_output` in newer scikit-learn releases; try the new name first and
# fall back to the old one so the cell works on either.
try:
    ohe = OneHotEncoder(sparse_output = False)
except TypeError:
    ohe = OneHotEncoder(sparse = False)
train_race = ohe.fit_transform(df[['Race']])

# categories_ holds one array per encoded feature; take the single 'Race'
# entry so the columns are plain category labels rather than 1-tuples.
ohe_hot_df = pd.DataFrame(train_race, columns = ohe.categories_[0])

# Reset to a 0..n-1 index so rows align with ohe_hot_df's default index in concat.
df = df.reset_index(drop = True)

df = pd.concat([df, ohe_hot_df], axis = 1).drop('Race', axis = 1)

df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,SleepTime,Asthma,KidneyDisease,SkinCancer,"(American Indian/Alaskan Native,)","(Asian,)","(Black,)","(Hispanic,)","(Other,)","(White,)"
0,0,1.403407,1,0,0,0.669364,1.089487,0,0,55,...,3.488639,1,0,1,0.0,0.0,0.0,0.0,0.0,1.0
1,0,1.442302,0,0,1,0.000000,0.000000,0,0,80,...,5.054525,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,1.487128,1,0,0,0.801003,1.089487,0,1,65,...,5.813435,1,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,1.472250,0,0,0,0.000000,0.000000,0,0,75,...,4.280563,0,0,1,0.0,0.0,0.0,0.0,0.0,1.0
4,0,1.468816,0,0,0,0.807551,0.000000,1,0,40,...,5.813435,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301648,1,1.491857,1,0,0,0.755892,0.000000,1,1,60,...,4.280563,1,0,0,0.0,0.0,0.0,1.0,0.0,0.0
301649,0,1.504497,1,0,0,0.000000,0.000000,0,1,35,...,3.488639,1,0,0,0.0,0.0,0.0,1.0,0.0,0.0
301650,0,1.472452,0,0,0,0.000000,0.000000,0,0,45,...,4.280563,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
301651,0,1.517915,0,0,0,0.000000,0.000000,0,0,25,...,8.734516,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
