In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', 50)

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=Warning)

## Load dataset

In [3]:
df = pd.read_csv('data/train.csv')
print(df.shape)
df.head()

(159256, 24)


Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,fasting blood sugar,Cholesterol,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,55,165,60,81.0,0.5,0.6,1,1,135,87,94,172,300,40,75,16.5,1,1.0,22,25,27,0,1
1,1,70,165,65,89.0,0.6,0.7,2,2,146,83,147,194,55,57,126,16.2,1,1.1,27,23,37,1,0
2,2,20,170,75,81.0,0.4,0.5,1,1,118,75,79,178,197,45,93,17.4,1,0.8,27,31,53,0,1
3,3,35,180,95,105.0,1.5,1.2,1,1,131,88,91,180,203,38,102,15.9,1,1.0,20,27,30,1,0
4,4,30,165,60,80.5,1.5,1.0,1,1,121,76,91,155,87,44,93,15.4,1,0.8,19,13,17,0,1


In [5]:
features = df.drop(columns=['id','smoking']).columns
print(features)
print(len(features))

Index(['age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)',
       'eyesight(right)', 'hearing(left)', 'hearing(right)', 'systolic',
       'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
       'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST',
       'ALT', 'Gtp', 'dental caries'],
      dtype='object')
22


## Binning
Explained in 1_dataset_binarizing.ipynb

In [6]:
for col in features:
    
    if df[col].nunique() == 2:
        u = df[col].unique()[1]
        result = df[col] == u
        df[col] = result
    else:
        quantiles = pd.cut(df[col], bins=10, retbins=True)[1]

        quantiles = sorted(set(quantiles))


        for q in quantiles:
            if q != quantiles[0]: 
                df[f'{col}>{q:.0f}'] = df[col] > q

        df = df.drop(columns=[col])
        
        
list(df.columns)

['id',
 'hearing(left)',
 'hearing(right)',
 'dental caries',
 'smoking',
 'age>26',
 'age>33',
 'age>40',
 'age>46',
 'age>52',
 'age>59',
 'age>66',
 'age>72',
 'age>78',
 'age>85',
 'height(cm)>140',
 'height(cm)>146',
 'height(cm)>152',
 'height(cm)>157',
 'height(cm)>162',
 'height(cm)>168',
 'height(cm)>174',
 'height(cm)>179',
 'height(cm)>184',
 'height(cm)>190',
 'weight(kg)>40',
 'weight(kg)>50',
 'weight(kg)>60',
 'weight(kg)>70',
 'weight(kg)>80',
 'weight(kg)>90',
 'weight(kg)>100',
 'weight(kg)>110',
 'weight(kg)>120',
 'weight(kg)>130',
 'waist(cm)>59',
 'waist(cm)>66',
 'waist(cm)>74',
 'waist(cm)>81',
 'waist(cm)>89',
 'waist(cm)>97',
 'waist(cm)>104',
 'waist(cm)>112',
 'waist(cm)>119',
 'waist(cm)>127',
 'eyesight(left)>1',
 'eyesight(left)>2',
 'eyesight(left)>3',
 'eyesight(left)>4',
 'eyesight(left)>5',
 'eyesight(left)>6',
 'eyesight(left)>7',
 'eyesight(left)>8',
 'eyesight(left)>9',
 'eyesight(left)>10',
 'eyesight(right)>1',
 'eyesight(right)>2',
 'eyesight(ri

In [7]:
df.head(1)

Unnamed: 0,id,hearing(left),hearing(right),dental caries,smoking,age>26,age>33,age>40,age>46,age>52,age>59,age>66,age>72,age>78,age>85,height(cm)>140,height(cm)>146,height(cm)>152,height(cm)>157,height(cm)>162,height(cm)>168,height(cm)>174,height(cm)>179,height(cm)>184,height(cm)>190,...,AST>469,AST>546,AST>624,AST>701,AST>778,ALT>292,ALT>584,ALT>875,ALT>1166,ALT>1458,ALT>1749,ALT>2040,ALT>2331,ALT>2623,ALT>2914,Gtp>102,Gtp>201,Gtp>301,Gtp>401,Gtp>500,Gtp>600,Gtp>700,Gtp>800,Gtp>899,Gtp>999
0,0,False,False,False,1,True,True,True,True,True,False,False,False,False,False,True,True,True,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [11]:
df['Gtp>500'].value_counts()

False    159214
True         42
Name: Gtp>500, dtype: int64