In [1]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns

In [11]:
# Load the training data; 'train.csv' is resolved relative to the current working directory.
train = pd.read_csv('train.csv')

In [18]:
# Outlier handling: extreme values are recoded or clipped in place; no rows are removed

def create_extra_features(df):
    """Reorder paired measurements and clip extreme lab values, modifying ``df`` in place.

    Steps, applied in this order:

    1. Hearing: the smaller of the two ear values is stored in
       ``hearing(left)`` and the larger in ``hearing(right)``; both are then
       reduced by 1.
    2. Eyesight: any value above 9 is replaced by 0, after which the smaller
       of the two eye values is stored in ``eyesight(left)`` and the larger in
       ``eyesight(right)``.
    3. ``Gtp``, ``HDL``, ``LDL``, ``ALT``, ``AST`` and ``serum creatinine``
       are clipped to ``[0, cap]`` with a per-column cap.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all of the columns named above.

    Returns
    -------
    None
        ``df`` itself is mutated. Calling the function a second time on the
        same frame subtracts 1 from the hearing columns again.
    """
    # Hearing: one comparison decides, per row, which ear holds the lower value.
    left_is_lower = df['hearing(left)'] < df['hearing(right)']
    lower = np.where(left_is_lower, df['hearing(left)'], df['hearing(right)'])
    higher = np.where(left_is_lower, df['hearing(right)'], df['hearing(left)'])
    df['hearing(left)'] = lower - 1
    df['hearing(right)'] = higher - 1

    # Eyesight: per the original author's note, readings above 9 should count
    # as the worst value, so they are recoded to 0 before the eyes are ordered.
    for eye in ('eyesight(left)', 'eyesight(right)'):
        df[eye] = np.where(df[eye] > 9, 0, df[eye])
    left_is_lower = df['eyesight(left)'] < df['eyesight(right)']
    lower = np.where(left_is_lower, df['eyesight(left)'], df['eyesight(right)'])
    higher = np.where(left_is_lower, df['eyesight(right)'], df['eyesight(left)'])
    df['eyesight(left)'] = lower
    df['eyesight(right)'] = higher

    # Lab values: cap the upper tail; the lower bound is 0 for every column.
    upper_caps = {
        'Gtp': 300,
        'HDL': 110,
        'LDL': 200,
        'ALT': 150,
        'AST': 100,
        'serum creatinine': 3,
    }
    for column, cap in upper_caps.items():
        df[column] = np.clip(df[column], 0, cap)

# Clean `train` in place; the function returns None, so the cell shows no output.
# NOTE: re-running this cell subtracts 1 from the hearing columns again.
create_extra_features(train)

In [19]:
# Derived variables

# Weight divided by squared height, with the height column (labelled cm) first divided by 100.
train['BMI'] = train['weight(kg)'].div(train['height(cm)'].div(100).pow(2))
# Difference between the systolic and relaxation readings.
train['Pulse Pressure'] = train['systolic'].sub(train['relaxation'])
# Height relative to waist and to age.
train['HW_Ratio'] = train['height(cm)'].div(train['waist(cm)'])
train['HA_Ratio'] = train['height(cm)'].div(train['age'])
# Hemoglobin weighted by age expressed as a fraction of the largest age in `train`.
train['hemoglobin_age_product'] = train['hemoglobin'].mul(train['age'].div(train['age'].max()))


In [20]:
# Build five quantile-based age bins; labels=False yields integer bin codes.
quantile_bins = pd.qcut(train['age'], q=5, labels=False, precision=0)

# Derived variables
#
# Per-row mean of each base measurement within the row's age bin. All three
# are computed from one shared groupby before any new column is added.
age_bin_groups = train.groupby(quantile_bins)
grouped_systolic = age_bin_groups['systolic'].transform('mean')
grouped_weight = age_bin_groups['weight(kg)'].transform('mean')
grouped_height = age_bin_groups['height(cm)'].transform('mean')

# The assignments below keep their original order so the column order of
# `train` does not change.

# Systolic pressure relative to the age-bin mean
train['Blood Pressure Index'] = train['systolic'].div(grouped_systolic)

# Cholesterol multiplied by age, scaled down by 100
train['Cholesterol Age Interaction'] = train['Cholesterol'].mul(train['age']).div(100)

# Weight relative to the age-bin mean
train['Weight Adjusted by Age'] = train['weight(kg)'].div(grouped_weight)

# Despite the name, this is height divided by the age-bin mean height
train['Height to Age Ratio'] = train['height(cm)'].div(grouped_height)

# Systolic pressure per unit of BMI
train['Blood Pressure to BMI Ratio'] = train['systolic'].div(train['BMI'])

# Summary statistics of the five features created in this cell
recalculated_features = train[[
    'Blood Pressure Index',
    'Cholesterol Age Interaction',
    'Weight Adjusted by Age',
    'Height to Age Ratio',
    'Blood Pressure to BMI Ratio',
]]

recalculated_features.describe()

Unnamed: 0,Blood Pressure Index,Cholesterol Age Interaction,Weight Adjusted by Age,Height to Age Ratio,Blood Pressure to BMI Ratio
count,159256.0,159256.0,159256.0,159256.0,159256.0
mean,1.0,87.110068,1.0,1.0,5.083508
std,0.102004,27.157253,0.173665,0.04609,0.74886
min,0.628312,21.2,0.400795,0.784018,2.695
25%,0.926842,68.6,0.86839,0.963423,4.568615
50%,0.992771,85.05,0.997042,1.006166,5.019667
75%,1.067369,104.8,1.11342,1.037606,5.5296
max,1.67288,221.25,1.917388,1.194822,10.83555


In [21]:
# Show the full column index, including the features engineered in the cells above.
train.columns

Index(['id', 'age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)',
       'eyesight(right)', 'hearing(left)', 'hearing(right)', 'systolic',
       'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
       'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST',
       'ALT', 'Gtp', 'dental caries', 'smoking', 'BMI', 'Pulse Pressure',
       'HW_Ratio', 'HA_Ratio', 'hemoglobin_age_product',
       'Blood Pressure Index', 'Cholesterol Age Interaction',
       'Weight Adjusted by Age', 'Height to Age Ratio',
       'Blood Pressure to BMI Ratio'],
      dtype='object')

In [22]:
# Remove the 'id' column in place before the correlation and modelling cells.
# errors='ignore' keeps the cell idempotent: without it, running the cell a
# second time raises KeyError because 'id' is already gone.
train.drop(columns='id', inplace=True, errors='ignore')

In [23]:
# Correlation of every column with 'smoking', taken from the full
# correlation matrix and listed from most positive to most negative.
correlation = train.corr().loc[:, 'smoking'].sort_values(ascending=False)

print(correlation)

smoking                        1.000000
hemoglobin                     0.450679
height(cm)                     0.447111
Height to Age Ratio            0.398820
weight(kg)                     0.351748
triglyceride                   0.331975
Gtp                            0.324938
Weight Adjusted by Age         0.304809
serum creatinine               0.277829
waist(cm)                      0.262715
HA_Ratio                       0.224441
ALT                            0.189036
eyesight(right)                0.147209
eyesight(left)                 0.143498
BMI                            0.141723
relaxation                     0.109501
dental caries                  0.106636
fasting blood sugar            0.096534
Blood Pressure Index           0.092298
AST                            0.064617
systolic                       0.058642
Pulse Pressure                -0.028398
Urine protein                 -0.028548
hearing(left)                 -0.033593
hemoglobin_age_product        -0.040100


In [25]:
# NOTE(review): thread_time_ns is not used in any visible cell — it looks like a
# stray auto-import; confirm nothing else needs it before removing.
from time import thread_time_ns

# Write the engineered training frame to CSV; index=False omits the row index.
file_name = 'train2.csv'
train.to_csv(file_name, index=False)

In [24]:
# Feature-importance calculation

from sklearn.ensemble import RandomForestClassifier
import pandas as pd


# Predictor columns passed to the model; the target 'smoking' is not among them.
features = [
    'age', 'height(cm)', 'weight(kg)', 'waist(cm)',
    'eyesight(left)', 'eyesight(right)',
    'hearing(left)', 'hearing(right)',
    'systolic', 'relaxation',
    'fasting blood sugar', 'Cholesterol', 'triglyceride', 'HDL', 'LDL',
    'hemoglobin', 'Urine protein', 'serum creatinine',
    'AST', 'ALT', 'Gtp',
    'dental caries',
    'BMI', 'Pulse Pressure', 'HW_Ratio', 'HA_Ratio', 'hemoglobin_age_product',
    'Blood Pressure Index', 'Cholesterol Age Interaction',
    'Weight Adjusted by Age', 'Height to Age Ratio',
    'Blood Pressure to BMI Ratio',
]


X = train.loc[:, features]
y = train.loc[:, 'smoking']

# Random forest with a fixed random_state, fitted on all rows of `train`
# (no hold-out split). fit() returns the fitted estimator itself.
model = RandomForestClassifier(random_state=42).fit(X, y)

feature_importances = model.feature_importances_

# Pair each feature name with its importance and print the most important first.
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
sorted_importance = feature_importance_df.sort_values('Importance', ascending=False)
print(sorted_importance)


                        Feature  Importance
15                   hemoglobin    0.095441
20                          Gtp    0.079693
1                    height(cm)    0.077177
12                 triglyceride    0.067092
30          Height to Age Ratio    0.049258
28  Cholesterol Age Interaction    0.036944
25                     HA_Ratio    0.035630
13                          HDL    0.034428
26       hemoglobin_age_product    0.033741
14                          LDL    0.033660
19                          ALT    0.031287
11                  Cholesterol    0.030957
17             serum creatinine    0.030601
10          fasting blood sugar    0.030082
31  Blood Pressure to BMI Ratio    0.029262
24                     HW_Ratio    0.028551
18                          AST    0.028498
3                     waist(cm)    0.028398
9                    relaxation    0.024921
27         Blood Pressure Index    0.024911
2                    weight(kg)    0.024727
23               Pulse Pressure 