# 당뇨와 고혈압 질병 예측
- BTH_G : 연령(그룹)
- SBP : 수축기혈압
- DBP : 이완기혈압
- FBS : 공복혈당
- SEX : 성별(남성:1, 여성:2)
- DIS : 고혈압/당뇨병 진료여부
  - 고혈압/당뇨병 진료내역 있음: 1
  - 고혈압 진료내역 있음: 2
  - 당뇨병 진료내역 있음: 3
  - 고혈압/당뇨병 진료내역 없음: 4
- BMI : 체질량지수

## 정상 vs 비정상
- DIS = 4 (정상) → Dis = 0
- DIS = 1 / 2 / 3 (비정상) → Dis = 1

- 정상(0) 건수를 기준으로 비정상(1)을 오버샘플링 : RandomOversampling
- 비정상(1) 데이터에는 DIS 1 / 2 / 3이 각각 레이블링되어 있음

1 / 2 / 3의 원래 비율 = 1 : 3 : 1
=> 1 : 1 : 1 비율로 오버샘플링

1(비정상) / 0(정상)의 원래 비율 = 1 : 3
=> 1 : 1 비율로 오버샘플링

# 로지스틱 회귀
- 데이터 오버샘플링 : RandomOversampling 적용

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# CSV files under /content/drive/MyDrive can be read by the later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Core numeric / tabular libraries.
import numpy as np
import pandas as pd

# Models and data-splitting utilities.
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): `pandas.core` is not part of pandas' public API and
# `random_state` is not referenced in the visible cells - this looks like an
# accidental auto-import; confirm and remove.
from pandas.core.common import random_state
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from sklearn.datasets import make_classification
from collections import Counter

# Oversampling helpers from imbalanced-learn.
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

# Plotting and missing-value visualisation; the IPython magic below asks for
# figures to be rendered inline in the notebook.
%matplotlib inline
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): train_test_split is already imported above and is imported
# again inside the split cells; one import would be enough.
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.svm import SVC 
from sklearn.metrics import classification_report, confusion_matrix

## 데이터 불러오기

In [None]:
# Disease dataset, oversampled while keeping the original 1:3:1 ratio between
# the DIS classes 1 / 2 / 3.
# NOTE(review): the original inline comment credits SMOTE, whereas the
# notebook headings say RandomOversampling - confirm which one produced
# this file.
oversampled_csv = '/content/drive/MyDrive/health/data/data_over_sampling.csv'
df = pd.read_csv(oversampled_csv, encoding='cp949')
df

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS,Dis
0,0,1,116,78,94,16.6,4,0
1,0,1,100,60,79,22.3,4,0
2,0,1,100,60,87,21.9,4,0
3,0,1,111,70,72,20.2,4,0
4,0,1,120,80,98,20.0,4,0
...,...,...,...,...,...,...,...,...
1481319,1,25,120,80,78,27.9,1,1
1481320,1,27,150,74,133,32.0,1,1
1481321,0,25,120,70,118,26.9,3,1
1481322,1,20,136,68,93,21.5,2,1


In [None]:
# Remove the four-valued label column DIS (values 1-4) in place, leaving the
# binary column Dis as the only label.
df.drop(columns='DIS', inplace=True)
df

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,Dis
0,0,1,116,78,94,16.6,0
1,0,1,100,60,79,22.3,0
2,0,1,100,60,87,21.9,0
3,0,1,111,70,72,20.2,0
4,0,1,120,80,98,20.0,0
...,...,...,...,...,...,...,...
1481319,1,25,120,80,78,27.9,1
1481320,1,27,150,74,133,32.0,1
1481321,0,25,120,70,118,26.9,1
1481322,1,20,136,68,93,21.5,1


In [None]:
# Count the non-null entries in every column (quick missing-value check).
df.notnull().sum()

SEX      1481324
BTH_G    1481324
SBP      1481324
DBP      1481324
FBS      1481324
BMI      1481324
Dis      1481324
dtype: int64

In [None]:
# Print the column dtypes, non-null counts and memory usage of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1481324 entries, 0 to 1481323
Data columns (total 7 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   SEX     1481324 non-null  int64  
 1   BTH_G   1481324 non-null  int64  
 2   SBP     1481324 non-null  int64  
 3   DBP     1481324 non-null  int64  
 4   FBS     1481324 non-null  int64  
 5   BMI     1481324 non-null  float64
 6   Dis     1481324 non-null  int64  
dtypes: float64(1), int64(6)
memory usage: 79.1 MB


In [None]:
# Hold out 20% of the rows as a test set, stratified on the binary label Dis.
from sklearn.model_selection import train_test_split

# Every column except the label is a feature.
features = df.drop(columns='Dis')
target = df['Dis']

# NOTE(review): df is loaded from an already-oversampled file, so rows
# duplicated by the oversampling may end up on both sides of this split and
# inflate the test score - confirm how the file was produced.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=True,
    stratify=target,
    random_state=11,
)

# No validation set is used; report the train and test shapes only.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(1185059, 6)
(1185059,)
(296265, 6)
(296265,)


In [None]:
# Fit a logistic-regression classifier on the train split and report its
# accuracy on train and test plus a per-class classification report.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# NOTE(review): default hyper-parameters on unscaled features; if the solver
# emits a ConvergenceWarning, consider scaling the inputs or raising max_iter.
LR = LogisticRegression()
LR.fit(X_train, y_train)

print("train score : {}".format(LR.score(X_train, y_train)))

# Predict the test set once and reuse the result for the report; the original
# cell stored predict_y but then called LR.predict(X_test) a second time.
predict_y = LR.predict(X_test)

print(classification_report(y_test, predict_y))
print("test score : {}".format(LR.score(X_test, y_test)))

train score : 0.7899091943945407
              precision    recall  f1-score   support

           0       0.80      0.78      0.79    148132
           1       0.78      0.80      0.79    148133

    accuracy                           0.79    296265
   macro avg       0.79      0.79      0.79    296265
weighted avg       0.79      0.79      0.79    296265

test score : 0.7895195179990887


# RandomOversampling + 질병 1 / 2 / 3 비율을 1 : 1 : 1로 맞춘 데이터

In [None]:
# Disease dataset that was oversampled after balancing the DIS classes
# 1 / 2 / 3 against each other.
# NOTE(review): the original inline comment credits SMOTE, whereas the
# section heading says RandomOversampling - confirm which one produced
# this file.
balanced_csv = '/content/drive/MyDrive/health/data/data(over_dis_vs_nor).csv'
df2 = pd.read_csv(balanced_csv, encoding='cp949')
df2

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS,Dis
0,0,1,90,50,86,20.1,2,1
1,0,1,120,80,90,27.0,3,1
2,0,1,100,60,112,31.8,2,1
3,0,1,170,120,86,23.3,2,1
4,0,1,135,80,104,35.0,2,1
...,...,...,...,...,...,...,...,...
1481319,1,21,130,90,185,26.3,3,1
1481320,1,18,140,100,120,28.8,2,1
1481321,0,12,100,60,101,23.0,3,1
1481322,1,20,134,72,168,23.1,3,1


In [None]:
# Remove the four-valued label column DIS (values 1-4) in place, leaving the
# binary column Dis as the only label.
df2.drop(columns='DIS', inplace=True)
df2

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,Dis
0,0,1,90,50,86,20.1,1
1,0,1,120,80,90,27.0,1
2,0,1,100,60,112,31.8,1
3,0,1,170,120,86,23.3,1
4,0,1,135,80,104,35.0,1
...,...,...,...,...,...,...,...
1481319,1,21,130,90,185,26.3,1
1481320,1,18,140,100,120,28.8,1
1481321,0,12,100,60,101,23.0,1
1481322,1,20,134,72,168,23.1,1


In [None]:
# Count the non-null entries in every column (quick missing-value check).
df2.notnull().sum()

SEX      1481324
BTH_G    1481324
SBP      1481324
DBP      1481324
FBS      1481324
BMI      1481324
Dis      1481324
dtype: int64

In [None]:
# Print the column dtypes, non-null counts and memory usage of df2.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1481324 entries, 0 to 1481323
Data columns (total 7 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   SEX     1481324 non-null  int64  
 1   BTH_G   1481324 non-null  int64  
 2   SBP     1481324 non-null  int64  
 3   DBP     1481324 non-null  int64  
 4   FBS     1481324 non-null  int64  
 5   BMI     1481324 non-null  float64
 6   Dis     1481324 non-null  int64  
dtypes: float64(1), int64(6)
memory usage: 79.1 MB


In [None]:
# Oversampled data: hold out 20% of the rows as a test set, stratified on the
# binary label Dis.
from sklearn.model_selection import train_test_split

# Every column except the label is a feature.
features_up = df2.drop(columns='Dis')
target_up = df2['Dis']

# NOTE(review): df2 is loaded from an already-oversampled file, so rows
# duplicated by the oversampling may end up on both sides of this split and
# inflate the test score - confirm how the file was produced.
X_up_train, X_up_test, y_up_train, y_up_test = train_test_split(
    features_up,
    target_up,
    test_size=0.2,
    shuffle=True,
    stratify=target_up,
    random_state=11,
)

# No validation set is carved out of the test split; report the train and
# test shapes only.
for part_up in (X_up_train, y_up_train, X_up_test, y_up_test):
    print(part_up.shape)

(1185059, 6)
(1185059,)
(296265, 6)
(296265,)


In [None]:
# Fit a logistic-regression classifier on the train split of the second
# oversampled dataset and report its accuracy on train and test plus a
# per-class classification report.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# NOTE(review): default hyper-parameters on unscaled features; if the solver
# emits a ConvergenceWarning, consider scaling the inputs or raising max_iter.
# This rebinds LR, replacing any model previously stored under that name.
LR = LogisticRegression()
LR.fit(X_up_train, y_up_train)

print("train score : {}".format(LR.score(X_up_train, y_up_train)))

# Predict the test set once and reuse the result for the report; the original
# cell stored predict_y but then called LR.predict(X_up_test) a second time.
predict_y = LR.predict(X_up_test)

print(classification_report(y_up_test, predict_y))
print("test score : {}".format(LR.score(X_up_test, y_up_test)))

train score : 0.8071505300579971
              precision    recall  f1-score   support

           0       0.81      0.80      0.81    148132
           1       0.80      0.81      0.81    148133

    accuracy                           0.81    296265
   macro avg       0.81      0.81      0.81    296265
weighted avg       0.81      0.81      0.81    296265

test score : 0.8064773091657806
