# Final Voting Ensemble Model

In [1]:
! pip install xgboost



In [2]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import json
import ast

# Fetch the current working directory with os.getcwd() and show it
path = os.getcwd()
print(path)

c:\Users\medai03\Desktop\bp\Training (240119)


In [3]:
def read_csv(file_path, encoding='cp949'):
    """Load a CSV file into a DataFrame, reporting failures instead of raising.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to read.
    encoding : str, default 'cp949'
        Text encoding passed to ``pd.read_csv``. The default keeps the
        previously hard-coded value, so existing one-argument calls behave
        exactly as before.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or ``None`` when the file does not exist or cannot
        be read. A message is printed in both failure cases.
    """
    try:
        return pd.read_csv(file_path, encoding=encoding)
    except FileNotFoundError:
        print(f"파일을 찾을 수 없습니다: {file_path}")
    except Exception as e:
        # Best-effort loader: any other read/parse error is reported and the
        # caller receives None rather than an exception.
        print(f"CSV 파일 읽기 오류: {e}")
    return None

> 최종 전처리된 csv 사용(Training)
- v1: 중복이 미포함된 데이터 사용 [129 rows × 15 columns]
- v2: 중복이 포함된 데이터를 사용 [316 rows × 15 columns]

In [4]:
# Alternative input (v1 file), kept for switching between dataset versions:
# merged_df=read_csv('(Trainset)preprocessed_data_v1.csv')
merged_df = read_csv('(Trainset)preprocessed_data_v2.csv')

# read_csv reports problems by printing and returning None; stop here with a
# clear error instead of failing later with a TypeError on a None object.
if merged_df is None:
    raise RuntimeError("Training CSV could not be loaded; see the message printed above.")


In [5]:
# Display the loaded training frame for a quick visual check
merged_df

Unnamed: 0,pat_id,pat_age,pat_sex(b),drinking,smoking,hbp_h,hbp_p,sbp,dbp,spo2,temp,weight,height,glucose,pulse
0,4,31.0,0.0,0,0,0,0,148.500000,103.500000,99.900000,37.250000,86.966667,181.533333,100.000000,98.5
1,4,31.0,0.0,0,0,0,0,129.250000,96.500000,96.000000,37.366667,71.200000,174.700000,108.000000,86.0
2,4,31.0,0.0,0,0,0,0,129.250000,96.500000,98.658333,37.366667,71.200000,173.400000,108.000000,86.0
3,4,31.0,0.0,0,0,0,0,129.250000,96.500000,97.450000,37.366667,71.200000,174.700000,108.000000,86.0
4,4,31.0,0.0,0,0,0,0,128.333333,78.333333,99.866667,37.533333,71.200000,174.700000,110.333333,64.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311,192,34.0,0.0,1,0,0,0,99.000000,63.000000,98.000000,36.900000,66.400000,160.000000,101.000000,64.0
312,192,34.0,0.0,1,0,0,0,120.000000,81.000000,97.000000,36.700000,67.400000,160.300000,98.000000,69.0
313,193,20.0,0.0,1,0,0,0,119.000000,69.000000,98.000000,37.200000,44.700000,156.600000,88.000000,87.0
314,194,47.0,1.0,1,1,0,1,171.000000,125.000000,97.000000,36.800000,97.600000,174.100000,149.000000,78.0


> Voting Model

In [6]:
# 모델
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
# 전처리
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, RobustScaler
# 평가 및 유틸
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score, log_loss
from sklearn.utils.class_weight import compute_class_weight
from joblib import dump


# Select the working columns from the loaded data.
# sbp/dbp are included because the label is derived from them below.
col = [
    'height', 'weight', 'dbp', 'glucose', 'pulse', 'sbp',
    'spo2', 'temp', 'pat_age', 'pat_sex(b)', 'drinking', 'smoking',
]

# Independent copy, so later column assignments do not touch merged_df
df = merged_df.loc[:, col].copy()

# Binary hypertension label derived from one row's blood-pressure readings
def label_hypertension(row):
    """Return the binary hypertension label for one record.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'sbp'`` and ``'dbp'`` values.

    Returns
    -------
    int or float
        ``1`` when sbp >= 120 or dbp >= 80 (hypertension), ``0`` when both
        readings are present and below those thresholds (normal), and NaN
        when a reading is missing and the remaining one does not already
        exceed its threshold, i.e. the label cannot be determined.
    """
    sbp, dbp = row['sbp'], row['dbp']

    # A single reading over its threshold is enough, even if the other is missing.
    if sbp >= 120 or dbp >= 80:
        return 1  # hypertension

    # Comparisons against NaN are always False, so without this guard a row
    # with missing readings would silently be labelled "normal".
    if pd.isna(sbp) or pd.isna(dbp):
        return np.nan

    return 0  # normal

# Label every row, then discard rows whose label is NaN (if there are any)
df['hypertension_stage'] = df.apply(label_hypertension, axis=1)
df = df.dropna(subset=['hypertension_stage'])

# Separate the integer target from the feature columns
y = df['hypertension_stage'].astype(int)
X = df.drop(columns='hypertension_stage')

# Numeric feature columns
numeric_features = ['height', 'weight', 'glucose', 'pulse', 'spo2', 'temp', 'pat_age']

# Categorical feature columns
categorical_features = ['pat_sex(b)', 'drinking', 'smoking']

# Numeric branch: fill gaps from the 20 nearest neighbours, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=20)),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' encodes a category that was absent during fit as
# all zeros instead of raising, so a small CV fold or new data fed to the
# saved model cannot crash the transform step.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combined preprocessor. remainder='drop' (the library default, spelled out
# here) discards every column not listed above; that is what keeps sbp and
# dbp, the columns the label is computed from, out of the model inputs.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

# Base learners of the ensemble, each with a fixed random_state
base_estimators = [
    ('xgb', XGBClassifier(n_estimators=90, learning_rate=0.03, max_depth=1, random_state=1)),
    ('lr', LogisticRegression(
        penalty=None,
        fit_intercept=True,
        solver='saga',
        multi_class='multinomial',
        max_iter=1000,
        random_state=1,
    )),
    ('rf', RandomForestClassifier(n_estimators=400, random_state=1)),
]

# Soft-voting ensemble over the base learners
voting_model = VotingClassifier(estimators=base_estimators, voting='soft')

# End-to-end pipeline: preprocessing followed by the ensemble
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', voting_model),
])

# Stratified 5-fold splitter with a fixed shuffle seed
# NOTE(review): the displayed frame shows the same pat_id on several rows; a
# row-level stratified split can then place one patient in both the training
# and the test fold. Consider a group-aware splitter keyed on pat_id - TODO confirm.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metric accumulators
cv_accuracy_scores, cv_precision_scores, cv_recall_scores = [], [], []
cv_f1_scores, cv_log_loss_scores = [], []

# Learning curve
def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw training and cross-validation score curves on a new figure.

    The arguments ``estimator``, ``X``, ``y``, ``cv``, ``n_jobs`` and
    ``train_sizes`` are forwarded to ``learning_curve``; ``title`` becomes the
    figure title. For each training size the mean score across the CV splits
    is plotted as a line, with a shaded band of one standard deviation
    (red = training, green = cross-validation). Nothing is returned and
    ``plt.show()`` is left to the caller.
    """
    plt.figure()
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    plt.grid()

    # (scores per training size, colour, legend label) for each curve
    curves = (
        (fit_scores, "r", "Training score"),
        (val_scores, "g", "Cross-validation score"),
    )

    # Shaded +/- one standard deviation bands first ...
    for scores, colour, _ in curves:
        centre = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.fill_between(sizes, centre - spread, centre + spread, alpha=0.1, color=colour)

    # ... then the mean lines, which carry the legend entries
    for scores, colour, legend_label in curves:
        plt.plot(sizes, np.mean(scores, axis=1), 'o-', color=colour, label=legend_label)

    plt.legend(loc="best")

# k-fold cross-validation
for train_index, test_index in cv.split(X, y):
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Fit the pipeline on this fold's training split. The same pipeline
    # object is refit each iteration, so after the loop it holds only the
    # fit from the final fold.
    pipeline.fit(X_train_fold, y_train_fold)

    # Hard predictions and class probabilities for the held-out split
    y_pred_fold = pipeline.predict(X_test_fold)
    y_prob_fold = pipeline.predict_proba(X_test_fold)

    # Sanity check: the class probabilities of every sample should sum to 1
    y_prob_sum = np.sum(y_prob_fold, axis=1)
    if not np.allclose(y_prob_sum, np.ones_like(y_prob_sum)):
        print("경고: y_prob_fold의 합이 1이 아닌 샘플이 있습니다.")

    # Compute and store this fold's metrics
    cv_accuracy_scores.append(accuracy_score(y_test_fold, y_pred_fold))
    cv_precision_scores.append(precision_score(y_test_fold, y_pred_fold, zero_division=0))
    cv_recall_scores.append(recall_score(y_test_fold, y_pred_fold, zero_division=0))
    # zero_division=0 matches precision/recall above (same value, no warning)
    cv_f1_scores.append(f1_score(y_test_fold, y_pred_fold, zero_division=0))
    # Pass the fitted class labels explicitly so log_loss still works if a
    # held-out split happens to contain only one class.
    cv_log_loss_scores.append(
        log_loss(y_test_fold, y_prob_fold, labels=pipeline.named_steps['model'].classes_))

    # # # Confusion matrix visualisation
    # cm = confusion_matrix(y_test_fold, y_pred_fold, labels=pipeline.named_steps['model'].classes_)
    # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=pipeline.named_steps['model'].classes_)
    # disp.plot(cmap='Reds')
    # plt.title('Confusion Matrix')
    # plt.show()

    # # Learning curve visualisation
    # title = "Learning Curves (XGBOOST)"
    # plot_learning_curve(pipeline, title, X_train_fold, y_train_fold, cv=cv.split(X_train_fold,y_train_fold), n_jobs=-1)
    # plt.show()


# Print the metrics averaged over all folds
for metric_name, fold_scores in (
    ("Accuracy", cv_accuracy_scores),
    ("Precision", cv_precision_scores),
    ("Recall", cv_recall_scores),
    ("F1 Score", cv_f1_scores),
    ("Log Loss", cv_log_loss_scores),
):
    print(f"Average CV {metric_name}: {np.mean(fold_scores):.4f}")



Average CV Accuracy: 0.7976
Average CV Precision: 0.8413
Average CV Recall: 0.8720
Average CV F1 Score: 0.8554
Average CV Log Loss: 0.4668




> 모델 저장

In [8]:
# After the CV loop the pipeline only holds the fit from the last fold's
# training split; refit on the complete training set so the saved model has
# seen all available rows.
pipeline.fit(X, y)

# Save the fitted pipeline (preprocessing + voting ensemble) to disk
dump(pipeline, '(Trainset)voting_model_v2.joblib')

['(Trainset)voting_model_v2.joblib']