In [5]:
import math
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [16]:
# Read the semicolon-delimited cardio dataset, discard the 'id' column,
# and preview the first rows.
cvd_df = pd.read_csv('cardio_train.csv', sep=';')
cvd_df = cvd_df.drop(columns=['id'])
cvd_df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [17]:
# The age column is recorded in days, so convert it to years.
# NOTE(review): this overwrites 'age' in place, so re-running the cell
# divides the already-converted values again.
cvd_df['age'] = cvd_df['age'].div(365.24)
cvd_df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,50.358668,2,168,62.0,110,80,1,1,0,0,1,0
1,55.382762,1,156,85.0,140,90,3,1,0,0,1,1
2,51.629066,1,165,64.0,130,70,3,1,0,0,0,1
3,48.250465,2,169,82.0,150,100,1,1,0,0,1,1
4,47.842515,1,156,56.0,100,60,1,1,0,0,0,0


In [18]:
# Summary statistics, transposed so each column of cvd_df becomes one row.
cvd_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,70000.0,53.304309,6.755152,29.564122,48.36272,53.945351,58.391742,64.924433
gender,70000.0,1.349571,0.476838,1.0,1.0,1.0,2.0,2.0
height,70000.0,164.359229,8.210126,55.0,159.0,165.0,170.0,250.0
weight,70000.0,74.20569,14.395757,10.0,65.0,72.0,82.0,200.0
ap_hi,70000.0,128.817286,154.011419,-150.0,120.0,120.0,140.0,16020.0
ap_lo,70000.0,96.630414,188.47253,-70.0,80.0,80.0,90.0,11000.0
cholesterol,70000.0,1.366871,0.68025,1.0,1.0,1.0,2.0,3.0
gluc,70000.0,1.226457,0.57227,1.0,1.0,1.0,1.0,3.0
smoke,70000.0,0.088129,0.283484,0.0,0.0,0.0,0.0,1.0
alco,70000.0,0.053771,0.225568,0.0,0.0,0.0,0.0,1.0


In [21]:
# Blood pressure cannot be negative, and the highest reading on record is 370.
# Outliers like these degrade model performance, so they are removed.
MAX_BLOOD_PRESSURE = 370

# Keep a row only if both readings are in (0, MAX_BLOOD_PRESSURE] and
# ap_hi is at least ap_lo.
valid_pressure = (
    cvd_df['ap_lo'].gt(0) & cvd_df['ap_lo'].le(MAX_BLOOD_PRESSURE)
    & cvd_df['ap_hi'].gt(0) & cvd_df['ap_hi'].le(MAX_BLOOD_PRESSURE)
    & cvd_df['ap_hi'].ge(cvd_df['ap_lo'])
)

# Reset the index once, after every filter has been applied, so the row labels
# are contiguous and identical whether this cell is run once or re-run.
cvd_df = cvd_df[valid_pressure].reset_index(drop=True)
cvd_df

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,50.358668,2,168,62.0,110,80,1,1,0,0,1,0
1,55.382762,1,156,85.0,140,90,3,1,0,0,1,1
2,51.629066,1,165,64.0,130,70,3,1,0,0,0,1
3,48.250465,2,169,82.0,150,100,1,1,0,0,1,1
4,47.842515,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
68706,52.677691,2,168,76.0,120,80,1,1,1,0,1,0
68707,61.879860,1,158,126.0,140,90,2,2,0,0,1,1
68708,52.201292,2,183,105.0,180,90,3,1,0,1,0,1
68709,61.414412,1,163,72.0,135,80,1,2,0,0,0,1


In [22]:
# Separate the target column ('cardio') from the remaining feature columns.
y = cvd_df['cardio']
X = cvd_df.drop(columns=['cardio'])

In [23]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible.
train_input, test_input, train_target, test_target = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [27]:
# Fit a logistic regression on the training split. add_constant supplies the
# intercept column ('const') alongside the feature columns.
log_model = sm.Logit(endog=train_target, exog=sm.add_constant(train_input))
log_result = log_model.fit()
print(log_result.summary2())

Optimization terminated successfully.
         Current function value: 0.560003
         Iterations 6
                         Results: Logit
Model:              Logit            Method:           MLE       
Dependent Variable: cardio           Pseudo R-squared: 0.192     
Date:               2025-01-21 20:46 AIC:              61588.4463
No. Observations:   54968            BIC:              61695.4204
Df Model:           11               Log-Likelihood:   -30782.   
Df Residuals:       54956            LL-Null:          -38098.   
Converged:          1.0000           LLR p-value:      0.0000    
No. Iterations:     6.0000           Scale:            1.0000    
-----------------------------------------------------------------
               Coef.   Std.Err.    z     P>|z|   [0.025   0.975] 
-----------------------------------------------------------------
const         -11.2453   0.2596 -43.3142 0.0000 -11.7541 -10.7364
age             0.0520   0.0015  34.3111 0.0000   0.0491   0.0550


In [28]:
# Exponentiate the fitted logit coefficients (giving odds ratios) and list
# them from largest to smallest.
log_result.params.pipe(np.exp).sort_values(ascending=False)

cholesterol    1.637187
ap_hi          1.058266
age            1.053415
ap_lo          1.010983
weight         1.010721
height         0.995886
gender         0.986722
smoke          0.907957
gluc           0.900557
active         0.802750
alco           0.792081
const          0.000013
dtype: float64

In [34]:
# Per-column population standard deviation (ddof=0) of the training features.
# Calling DataFrame.std directly with axis=0 gives the same values as
# np.std(train_input) but avoids the axis=None call that np.std forwards to
# pandas, which raises a FutureWarning and is slated to return a single scalar.
train_input.std(axis=0, ddof=0)

  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


age             6.754027
gender          0.476672
height          8.178808
weight         14.324324
ap_hi          16.657852
ap_lo           9.553014
cholesterol     0.678428
gluc            0.573655
smoke           0.284056
alco            0.224928
active          0.397792
dtype: float64

In [36]:
# Scale each coefficient by its feature's standard deviation so that the
# magnitudes are comparable across features measured on different scales.
# The intercept and 'gender' are left out of the comparison.
coefs = log_result.params.drop(labels=['const', 'gender'])

# Population standard deviation (ddof=0) per column, matching what
# np.std(train_input) computed, but with an explicit axis=0 so pandas does not
# emit the axis=None FutureWarning (and keeps returning one value per column).
stdv = train_input.std(axis=0, ddof=0).drop(labels='gender')

# Rank the features by the absolute size of coefficient * standard deviation.
(coefs * stdv).abs().sort_values(ascending=False)

  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


ap_hi          0.943358
age            0.351464
cholesterol    0.334451
weight         0.152760
ap_lo          0.104345
active         0.087400
gluc           0.060086
alco           0.052429
height         0.033718
smoke          0.027428
dtype: float64

In [43]:
# Predict on the held-out features and convert the result to a NumPy array.
# has_constant='add' forces the intercept column to be added even if some
# test-set column happens to be constant; the default ('skip') would silently
# omit it in that case and the design matrix would no longer match the fitted
# parameters.
y_pred = log_result.predict(
    sm.add_constant(test_input, has_constant='add')
).to_numpy()
print(y_pred)

[0.89518288 0.61109794 0.66966689 ... 0.32276199 0.43760216 0.66018572]


In [44]:
# Model prediction for the first row of the test split.
y_pred[0]

np.float64(0.8951828755007194)

In [45]:
# Feature values of the first test row (by position), for comparison with
# the first prediction.
test_input.iloc[0, :]

age             56.817435
gender           2.000000
height         172.000000
weight          70.000000
ap_hi          150.000000
ap_lo           90.000000
cholesterol      3.000000
gluc             3.000000
smoke            0.000000
alco             0.000000
active           1.000000
Name: 41747, dtype: float64