# 당뇨와 고혈압 질병 예측
- BTH_G : 연령(그룹)
- SBP : 수축기혈압
- DBP : 이완기혈압
- FBS : 공복혈당
- SEX : 성별(남성:1, 여성:2)
- DIS : 고혈압/당뇨병 진료여부
  - 고혈압/당뇨병 진료내역 있음: 1
  - 고혈압 진료내역 있음: 2
  - 당뇨병 진료내역 있음: 3
  - 고혈압/당뇨병 진료내역 없음: 4
- BMI : 체질량지수

## 정상 vs 비정상
- 4(정상) : 0
- 1 / 2 / 3(비정상) : 1

- 0(정상) 기준으로 1(비정상) 오버샘플링 : RandomOversampling
- 1(비정상) : 1 / 2 / 3 각각 레이블링 됨

- 질병 클래스 1 / 2 / 3 의 원본 비율 = 1 : 3 : 1
  => 1 : 1 : 1 비율이 되도록 오버샘플링

- 비정상(1) / 정상(0) 의 원본 비율 = 1 : 3
  => 1 : 1 비율이 되도록 오버샘플링

# LightGBM
- 데이터 오버샘플링 : RandomOversampling 적용

In [1]:
# Mount Google Drive (Colab only); the CSV files read below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np 
import pandas as pd # Series / DataFrame data handling
import warnings; warnings.filterwarnings(action='ignore') # silence all warning messages
import matplotlib.pyplot as plt # plotting library
import pickle # object serialization

from sklearn.model_selection import train_test_split # train / test split
from sklearn.preprocessing import StandardScaler # feature scaling (standardization)


# LightGBM import
from lightgbm import LGBMClassifier

# NOTE(review): `random_state` comes from a private pandas module and is not
# used in the visible cells - looks like an accidental auto-import; confirm
# before removing.
from pandas.core.common import random_state
from sklearn.ensemble import RandomForestClassifier # random forest classifier
from sklearn.tree import DecisionTreeClassifier # decision tree classifier

import matplotlib.pyplot as plt
import seaborn as sns

# Metrics for model evaluation
# NOTE(review): `plot_confusion_matrix` was removed in scikit-learn 1.2, so this
# line raises ImportError on newer versions (ConfusionMatrixDisplay is the
# replacement) - confirm the pinned scikit-learn version.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, plot_confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## 데이터 불러오기

In [3]:
# Load the oversampled disease-vs-normal dataset (CP949-encoded CSV).
# NOTE(review): the original comment described this file as "oversampled at the
# original 1:3:1 ratio with SMOTE", while the notebook header says
# RandomOversampling - confirm which method produced the file.
csv_path = '/content/drive/MyDrive/health/data/data(over_dis_vs_nor).csv'
df = pd.read_csv(csv_path, encoding='cp949')
df

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS,Dis
0,0,1,90,50,86,20.1,2,1
1,0,1,120,80,90,27.0,3,1
2,0,1,100,60,112,31.8,2,1
3,0,1,170,120,86,23.3,2,1
4,0,1,135,80,104,35.0,2,1
...,...,...,...,...,...,...,...,...
1481319,1,21,130,90,185,26.3,3,1
1481320,1,18,140,100,120,28.8,2,1
1481321,0,12,100,60,101,23.0,3,1
1481322,1,20,134,72,168,23.1,3,1


In [4]:
# Remove, in place, every row whose binary label `Dis` equals 0 so that only
# the Dis != 0 rows remain.
normal_idx = df.loc[df['Dis'] == 0].index
df.drop(index=normal_idx, inplace=True)
df

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS,Dis
0,0,1,90,50,86,20.1,2,1
1,0,1,120,80,90,27.0,3,1
2,0,1,100,60,112,31.8,2,1
3,0,1,170,120,86,23.3,2,1
4,0,1,135,80,104,35.0,2,1
...,...,...,...,...,...,...,...,...
1481319,1,21,130,90,185,26.3,3,1
1481320,1,18,140,100,120,28.8,2,1
1481321,0,12,100,60,101,23.0,3,1
1481322,1,20,134,72,168,23.1,3,1


In [5]:
# Drop the binary (1/0) label column; `DIS` stays as the target.
df.drop(columns='Dis', inplace=True)
df

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS
0,0,1,90,50,86,20.1,2
1,0,1,120,80,90,27.0,3
2,0,1,100,60,112,31.8,2
3,0,1,170,120,86,23.3,2
4,0,1,135,80,104,35.0,2
...,...,...,...,...,...,...,...
1481319,1,21,130,90,185,26.3,3
1481320,1,18,140,100,120,28.8,2
1481321,0,12,100,60,101,23.0,3
1481322,1,20,134,72,168,23.1,3


In [6]:
# Number of non-missing values in each column.
df.count()

SEX      740662
BTH_G    740662
SBP      740662
DBP      740662
FBS      740662
BMI      740662
DIS      740662
dtype: int64

In [7]:
# Print the index range, per-column dtype / non-null count and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 740662 entries, 0 to 1481323
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   SEX     740662 non-null  int64  
 1   BTH_G   740662 non-null  int64  
 2   SBP     740662 non-null  int64  
 3   DBP     740662 non-null  int64  
 4   FBS     740662 non-null  int64  
 5   BMI     740662 non-null  float64
 6   DIS     740662 non-null  int64  
dtypes: float64(1), int64(6)
memory usage: 45.2 MB


In [8]:
# Separate the explanatory variables from the target.
# No scaling is applied in this cell (the scaler lines were commented out).
x = df.drop(columns='DIS')          # every column except the target
y = df['DIS'].to_numpy(copy=True)   # target as a NumPy array

In [9]:
# Hold out 20% of the rows as the test set (80:20 split, shuffled, fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=42
)

# Feature scaling: the scaler is fitted on the training split only and the
# same fitted transform is then applied to both splits.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

# No separate validation split is created in this cell.
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

(592529, 6)
(592529,)
(148133, 6)
(148133,)


In [10]:
# LightGBM baseline without hyper-parameter tuning.
import lightgbm as lgb

# The target has more than two classes, so the multiclass objective is
# requested explicitly. The former objective='binary' was misleading:
# LGBMClassifier.fit replaces it with 'multiclass' whenever it sees more than
# two classes, so the fitted model is unchanged by this edit.
lgb_clf = lgb.LGBMClassifier(num_leaves=31, objective='multiclass')
lgb_clf.fit(x_train, y_train)
y_pred = lgb_clf.predict(x_test)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(accuracy_score(y_test, y_pred))

# Inputs for the native `lgb.train` API (not used by the sklearn-style model
# above). Native multiclass training needs `num_class` and integer labels in
# [0, num_class), so the labels are shifted to start at 0.
# NOTE(review): assumes the class labels are consecutive integers - confirm.
n_classes = len(np.unique(y_train))
d_train = lgb.Dataset(x_train, label=y_train - y_train.min())
param = {
    'num_iterations': 100,
    'learning_rate': 0.1,
    'max_depth': -1,
    'objective': 'multiclass',
    'num_class': n_classes,
    'metric': 'multi_logloss',
}

In [11]:
# Confusion matrix of the test-set predictions
# (rows: true class, columns: predicted class).
# NOTE(review): the original comment said "1 million rows used" - confirm
# against the split sizes printed earlier.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[23245 15219 10982]
 [ 6485 38225  4354]
 [12533 12664 24426]]


In [12]:
# Fit a LightGBM classifier (100 trees, fixed seed) and report its scores.
# The model is named `lgb_model` so that it no longer overwrites the `lgb`
# module alias created by `import lightgbm as lgb`.
lgb_model = LGBMClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)

print("train score : {}".format(lgb_model.score(x_train, y_train)))

# Predict the test set once and reuse the result for both the report and the
# accuracy, instead of running the model over x_test three times.
predict_y = lgb_model.predict(x_test)

print(classification_report(y_test, predict_y))
print("test score : {}".format(accuracy_score(y_test, predict_y)))

train score : 0.5860607666460207
              precision    recall  f1-score   support

           1       0.55      0.47      0.51     49446
           2       0.58      0.78      0.66     49064
           3       0.61      0.49      0.55     49623

    accuracy                           0.58    148133
   macro avg       0.58      0.58      0.57    148133
weighted avg       0.58      0.58      0.57    148133

test score : 0.5796345176294276


# RandomOversampling + 질병 1, 2, 3 비율 => 1 : 1 : 1

In [13]:
# Load the dataset for the "classes 1/2/3 balanced to 1:1:1" experiment.
# NOTE(review): this is the same CSV path that `df` is loaded from, although
# the section title promises data oversampled to a 1:1:1 ratio - confirm the
# intended file name. The original comment also mentions SMOTE while the
# notebook header says RandomOversampling.
csv_path2 = '/content/drive/MyDrive/health/data/data(over_dis_vs_nor).csv'
df2 = pd.read_csv(csv_path2, encoding='cp949')
df2

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS,Dis
0,0,1,90,50,86,20.1,2,1
1,0,1,120,80,90,27.0,3,1
2,0,1,100,60,112,31.8,2,1
3,0,1,170,120,86,23.3,2,1
4,0,1,135,80,104,35.0,2,1
...,...,...,...,...,...,...,...,...
1481319,1,21,130,90,185,26.3,3,1
1481320,1,18,140,100,120,28.8,2,1
1481321,0,12,100,60,101,23.0,3,1
1481322,1,20,134,72,168,23.1,3,1


In [14]:
# Remove, in place, every row whose binary label `Dis` equals 0 so that only
# the Dis != 0 rows remain.
normal_idx2 = df2.loc[df2['Dis'] == 0].index
df2.drop(index=normal_idx2, inplace=True)
df2

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS,Dis
0,0,1,90,50,86,20.1,2,1
1,0,1,120,80,90,27.0,3,1
2,0,1,100,60,112,31.8,2,1
3,0,1,170,120,86,23.3,2,1
4,0,1,135,80,104,35.0,2,1
...,...,...,...,...,...,...,...,...
1481319,1,21,130,90,185,26.3,3,1
1481320,1,18,140,100,120,28.8,2,1
1481321,0,12,100,60,101,23.0,3,1
1481322,1,20,134,72,168,23.1,3,1


In [15]:
# Drop the binary (1/0) label column; `DIS` stays as the target.
df2.drop(columns='Dis', inplace=True)
df2

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS
0,0,1,90,50,86,20.1,2
1,0,1,120,80,90,27.0,3
2,0,1,100,60,112,31.8,2
3,0,1,170,120,86,23.3,2
4,0,1,135,80,104,35.0,2
...,...,...,...,...,...,...,...
1481319,1,21,130,90,185,26.3,3
1481320,1,18,140,100,120,28.8,2
1481321,0,12,100,60,101,23.0,3
1481322,1,20,134,72,168,23.1,3


In [16]:
# Number of non-missing values in each column.
df2.count()

SEX      740662
BTH_G    740662
SBP      740662
DBP      740662
FBS      740662
BMI      740662
DIS      740662
dtype: int64

In [17]:
# Print the index range, per-column dtype / non-null count and memory usage.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 740662 entries, 0 to 1481323
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   SEX     740662 non-null  int64  
 1   BTH_G   740662 non-null  int64  
 2   SBP     740662 non-null  int64  
 3   DBP     740662 non-null  int64  
 4   FBS     740662 non-null  int64  
 5   BMI     740662 non-null  float64
 6   DIS     740662 non-null  int64  
dtypes: float64(1), int64(6)
memory usage: 45.2 MB


In [18]:
# Separate the explanatory variables from the target.
# No scaling is applied in this cell (the scaler lines were commented out).
x = df2.drop(columns='DIS')          # every column except the target
y = df2['DIS'].to_numpy(copy=True)   # target as a NumPy array

In [19]:
# Hold out 20% of the rows as the test set (80:20 split, shuffled, fixed seed).
# No feature scaling and no separate validation split are done in this cell.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=42
)

for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

(592529, 6)
(592529,)
(148133, 6)
(148133,)


In [20]:
# LightGBM baseline without hyper-parameter tuning.
import lightgbm as lgb

# The target has more than two classes, so the multiclass objective is
# requested explicitly. The former objective='binary' was misleading:
# LGBMClassifier.fit replaces it with 'multiclass' whenever it sees more than
# two classes, so the fitted model is unchanged by this edit.
lgb_clf = lgb.LGBMClassifier(num_leaves=31, objective='multiclass')
lgb_clf.fit(x_train, y_train)
y_pred = lgb_clf.predict(x_test)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(accuracy_score(y_test, y_pred))

# Inputs for the native `lgb.train` API (not used by the sklearn-style model
# above). Native multiclass training needs `num_class` and integer labels in
# [0, num_class), so the labels are shifted to start at 0.
# NOTE(review): assumes the class labels are consecutive integers - confirm.
n_classes = len(np.unique(y_train))
d_train = lgb.Dataset(x_train, label=y_train - y_train.min())
param = {
    'num_iterations': 100,
    'learning_rate': 0.1,
    'max_depth': -1,
    'objective': 'multiclass',
    'num_class': n_classes,
    'metric': 'multi_logloss',
}

In [21]:
# Confusion matrix of the test-set predictions
# (rows: true class, columns: predicted class).
# NOTE(review): the original comment said "1 million rows used" - confirm
# against the split sizes printed earlier.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[23257 15227 10962]
 [ 6468 38210  4386]
 [12512 12707 24404]]


In [22]:
# Fit a LightGBM classifier (100 trees, fixed seed) and report its scores.
# NOTE(review): the name `lgb` overwrites the module alias from
# `import lightgbm as lgb`; it is kept here because cells beyond this one may
# still refer to the fitted model by this name - consider renaming.
lgb = LGBMClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)

print("train score : {}".format(lgb.score(x_train, y_train)))

# Predict the test set once and reuse the result for both the report and the
# accuracy, instead of running the model over x_test three times.
predict_y = lgb.predict(x_test)

print(classification_report(y_test, predict_y))
print("test score : {}".format(accuracy_score(y_test, predict_y)))

train score : 0.585772173176334
              precision    recall  f1-score   support

           1       0.55      0.47      0.51     49446
           2       0.58      0.78      0.66     49064
           3       0.61      0.49      0.55     49623

    accuracy                           0.58    148133
   macro avg       0.58      0.58      0.57    148133
weighted avg       0.58      0.58      0.57    148133

test score : 0.5796210162489115
