# Health-Care-Data-Analysis

### - Modeling & Evaluating model performance (3) -

## Load Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import ipywidgets as widgets
from IPython.display import display
from IPython.display import set_matplotlib_formats

In [3]:
# 폰트
plt.rc("font",family = "Malgun Gothic")

# 선명하게 글씨체를 출력시킬때 
set_matplotlib_formats('retina')

# 음수 값 출력
plt.rc('axes',unicode_minus=False)

# 최대 출력 column수 지정 
pd.options.display.max_columns = 100 

In [4]:
df = pd.read_csv('health_real_fin.csv', encoding='utf-8')
df['성별'] = df['성별'].astype('object')
df['흡연상태'] = df['흡연상태'].astype('object')
df['단백뇨'] = df['단백뇨'].astype('object')

In [5]:
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.datasets import dump_svmlight_file
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix



In [6]:
dat = df.loc[:, ['가입자일련번호', '허리둘레', '수축기혈압', '이완기혈압', '식전혈당(공복혈당)', '총콜레스테롤', 'HDL콜레스테롤',
                 'LDL콜레스테롤', '혈색소', '단백뇨', '혈청크레아티닌', '(혈청지오티)AST', '(혈청지오티)ALT', '감마지티피',
                 '흡연상태', '성별', '신장', '체중','BMI', '나이', 'eGFR', '기초대사량']]

In [7]:
cell = pd.read_csv('cell_img.csv', encoding='utf-8')
# Mean, Max, SE는 서로 corr이 높아 분리하여 암 발병을 예측한다.

cell = cell.loc[:, ['Image_ID', 'user_ID', 'Diagnosis']]

# cell_mean = df.loc[:, df.columns.str.startswith('Mean')]
# cell_mean = common.join(cell_mean)
# cell_max = df.loc[:, df.columns.str.startswith('Max')]
# cell_max = common.join(cell_max)
# cell_se = df.loc[:, df.columns.str.startswith('SE')]
# cell_se = common.join(cell_se)

In [8]:
cols = cell.columns.to_list()
cols[1] = '가입자일련번호'
cell.columns = cols
cell = pd.merge(cell, dat, how='inner', on='가입자일련번호')
cell.drop(['Image_ID', '가입자일련번호'], axis=1, inplace=True)

In [9]:
cell['Diagnosis'] = np.where(cell['Diagnosis'] == 'M', 1, 0)
Y = cell['Diagnosis']
X = cell.iloc[:, 1:23]
X = pd.get_dummies(X)

## 암 예측 모델

#### XGBoost

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1234)

# features = X_train.columns.values.tolist()

# use DMatrix for xgbosot
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# set xgboost params
param = {
    'max_depth': 10,  # the maximum depth of each tree
    'eta': 0.05,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 4,
    'colsample_bytree': 0.85,
    'subsample': 0.85}  # the number of classes that exist in this datset
num_round = 500  # the number of training iterations

#------------- numpy array ------------------
# training and testing - numpy matrices
# bst = xgb.train(param, dtrain, num_round)
# preds = bst.predict(dtest)

watch_list = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(param, dtrain, num_round, evals=watch_list,
                 early_stopping_rounds=20)

# extracting most confident predictions
# preds = model.predict(dtest)
# best_preds = np.asarray([np.argmax(line) for line in preds])
# print("Numpy array precision:", precision_score(y_test, best_preds, average='macro'))

[0]	train-merror:0.041176	test-merror:0.041096
Multiple eval metrics have been passed: 'test-merror' will be used for early stopping.

Will train until test-merror hasn't improved in 20 rounds.
[1]	train-merror:0.035294	test-merror:0.041096


  if getattr(data, 'base', None) is not None and \


[2]	train-merror:0.035294	test-merror:0.041096
[3]	train-merror:0.035294	test-merror:0.041096
[4]	train-merror:0.029412	test-merror:0.054795
[5]	train-merror:0.029412	test-merror:0.054795
[6]	train-merror:0.029412	test-merror:0.054795
[7]	train-merror:0.023529	test-merror:0.054795
[8]	train-merror:0.023529	test-merror:0.054795
[9]	train-merror:0.023529	test-merror:0.054795
[10]	train-merror:0.023529	test-merror:0.054795
[11]	train-merror:0.023529	test-merror:0.054795
[12]	train-merror:0.023529	test-merror:0.054795
[13]	train-merror:0.023529	test-merror:0.054795
[14]	train-merror:0.023529	test-merror:0.054795
[15]	train-merror:0.023529	test-merror:0.054795
[16]	train-merror:0.023529	test-merror:0.054795
[17]	train-merror:0.023529	test-merror:0.054795
[18]	train-merror:0.023529	test-merror:0.054795
[19]	train-merror:0.023529	test-merror:0.054795
[20]	train-merror:0.023529	test-merror:0.054795
Stopping. Best iteration:
[0]	train-merror:0.041176	test-merror:0.041096



In [11]:
preds = model.predict(dtest, ntree_limit=model.best_ntree_limit)
best_preds = np.asarray([np.argmax(line) for line in preds])
print("Numpy array precision:", precision_score(y_test, best_preds, average='macro'))
# print('Confusion Matrix: \n{}'.format(confusion_matrix(y_test, preds)))
from sklearn.metrics import classification_report
target_names = ['양성 종양', '악성 종양']
print(classification_report(y_test, best_preds, target_names=target_names, digits=3))

Numpy array precision: 0.9741379310344828
              precision    recall  f1-score   support

       양성 종양      0.948     1.000     0.973        55
       악성 종양      1.000     0.833     0.909        18

    accuracy                          0.959        73
   macro avg      0.974     0.917     0.941        73
weighted avg      0.961     0.959     0.958        73



#### GBM

In [12]:
gb_final = GradientBoostingClassifier(random_state = 1234,
                                learning_rate = 0.04, n_estimators = 300,
                                   max_depth = 7, min_samples_leaf = 2, min_samples_split = 4)
gb_final.fit(X_train, y_train)
y_pred = gb_final.predict(X_test)

# print('Accuracy on training set: {:.3f}'.format(gb_final.score(X_train, y_train)))
print('Accuracy on test set: {:.3f}'.format(gb_final.score(X_test, y_test)))
# print('Confusion Matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))
print(classification_report(y_test, y_pred, target_names=target_names, digits=3))

Accuracy on test set: 0.959
              precision    recall  f1-score   support

       양성 종양      0.948     1.000     0.973        55
       악성 종양      1.000     0.833     0.909        18

    accuracy                          0.959        73
   macro avg      0.974     0.917     0.941        73
weighted avg      0.961     0.959     0.958        73



#### RF

In [13]:
rf_final = RandomForestClassifier(random_state = 1234, 
                                max_depth = 5, n_estimators = 500, min_samples_split = 2, 
                                  min_samples_leaf = 1)
rf_final.fit(X_train, y_train)

y_pred = rf_final.predict(X_test)
print('Accuracy: {0:.3f}\n'.format(rf_final.score(X_test, y_test)))
# print('Confusion Matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))
print(classification_report(y_test, y_pred, target_names=target_names, digits=3))

Accuracy: 0.959

              precision    recall  f1-score   support

       양성 종양      0.948     1.000     0.973        55
       악성 종양      1.000     0.833     0.909        18

    accuracy                          0.959        73
   macro avg      0.974     0.917     0.941        73
weighted avg      0.961     0.959     0.958        73



#### Logistic Reg

In [14]:
softmax_reg = LogisticRegression(multi_class = 'multinomial',
                                solver='lbfgs', C=10, random_state=1234)
softmax_reg.fit(X_train, y_train)

y_pred = softmax_reg.predict(X_test)
print('Accuracy: {0:.3f}\n'.format(softmax_reg.score(X_test, y_test)))
# print('Accuracy: {0:.3f}\n'.format(softmax_reg.score(X_test, y_test)))
# print('Confusion Matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))
print(classification_report(y_test, y_pred, target_names=target_names, digits=3))

Accuracy: 0.863

              precision    recall  f1-score   support

       양성 종양      0.909     0.909     0.909        55
       악성 종양      0.722     0.722     0.722        18

    accuracy                          0.863        73
   macro avg      0.816     0.816     0.816        73
weighted avg      0.863     0.863     0.863        73





#### DT

In [15]:
tree_final = DecisionTreeClassifier(random_state = 1234,
                                  max_depth = 3, min_samples_split = 2, min_samples_leaf = 1)
tree_final.fit(X_train, y_train)

y_pred = tree_final.predict(X_test)
print('Accuracy: {0:.3f}\n'.format(tree_final.score(X_test, y_test)))
# print('Confusion Matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))
print(classification_report(y_test, y_pred, target_names=target_names, digits=3))

Accuracy: 0.932

              precision    recall  f1-score   support

       양성 종양      0.946     0.964     0.955        55
       악성 종양      0.882     0.833     0.857        18

    accuracy                          0.932        73
   macro avg      0.914     0.898     0.906        73
weighted avg      0.931     0.932     0.931        73

