In [6]:
# !pip3 install pandas
# !pip3 install numpy
# !pip3 install sklearn
# !pip3 install category_encoders
# !pip3 install xgboost

In [20]:
import pandas as pd
import sklearn

In [65]:
data = pd.read_csv('/Users/hyegwan/codestates/project3/heart.csv')

In [66]:
data.info()
data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


(918, 12)

### Feature 설명

Age: 나이  

Sex: 성별  
- M - 남성 (Male)
- F - 여성 (Female)

ChestPainType: 가슴 통증 유형
- TA - 일반 협심증 (Typical Angina)
- ATA - 비정형 협심증 (Atypical Angina)
- NAP - 협심증이 아닌 통증 (Non-Anginal Pain)
- ASY - 무증상 (Asymptomatic)

RestingBP: 안정시 혈압 (Resting Blood Pressure) [mmHg]  

Cholesterol: 혈중 콜레스테롤 [mg/dl]  

FastingBS: 공복 혈당 (Fasting Blood Sugar)  
- 0 - 120 mg/dl 이하
- 1 - 120 mg/dl 초과  

RestingECG: 휴식 중 심전도 결과 (Resting ElectroCardioGraphic)  
- LVH -  좌심실비대
- Normal - 정상
- ST - ST-T 이상

MaxHR: 최대 심박수 (Maximum Heart Rate Achieved)  
- 60 ~ 202 사이 값  

ExerciseAngina: 활동으로 인한 협심증 (Exercise Induced Angina)
- Y - 있음 (Yes)
- N - 없음 (No)  

Oldpeak: 휴식 대비 활동으로 인한 ST 하강  (ST depression induced by exercise relative to rest) 

ST_Slope: 활동 중 ST 분절의 최고점 기울기(The slope of the peak exercise ST segment) 
- Up - 상승 (Upsloping)
- Flat - 평탄 (Flat)
- Down - 하강 (Downsloping)

HearDisease: 심장 질환 진단 여부
- 0 - 정상  (Normal)
- 1 - 심장 질환 진단 (Heart Disease)

In [67]:
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [68]:
from sklearn.model_selection import train_test_split

# data 복사
df = data.copy()

# train, test로 데이터 분리
train, test = train_test_split(df, test_size=0.2, random_state=2)

print(train.shape, test.shape)

(734, 12) (184, 12)


In [70]:
# data 복사
df = data.copy()

# train, test로 데이터 분리
train, test = train_test_split(df, test_size=0.2, random_state=2)

In [71]:
# 타겟 선정
target = 'HeartDisease'

# 타겟 분포
train['HeartDisease'].value_counts()

1    413
0    321
Name: HeartDisease, dtype: int64

In [73]:
# 피쳐와 타겟 분리
def divide_data(df):

    # target = 'HeartDisease'

    X = df.drop(columns = target)
    y = df[target]

    return X, y

X_train, y_train = divide_data(df)
X_test, y_test = divide_data(df)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(918, 11) (918,) (918, 11) (918,)


In [42]:
from sklearn.pipeline import Pipeline, make_pipeline
from category_encoders import OrdinalEncoder 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [94]:
# 기준 모델
def getBaseModel(df): 
    y_base = train['HeartDisease'] 
    major = y_base.mode()[0] 
    y_pred = [major] * len(y_base) 
    accuracy = accuracy_score(y_base, y_pred) 

    return accuracy

print('기준모델 정확도: ',getBaseModel(df))

기준모델 정확도:  0.5626702997275205


In [92]:
# 모델 학습 - Logistic Regression 
def fit(X_train, y_train):
    
    pipeline = make_pipeline(
        OrdinalEncoder(), 
        LogisticRegression()
    )

    dists = {
        'logisticregression__C': [0.01, 0.1, 1, 10, 100],
        'logisticregression__penalty':['l1','l2']
    }


    clf = RandomizedSearchCV(
        pipeline, 
        param_distributions = dists, 
        random_state = 2, 
        n_iter = 2, 
        cv = 3,
        scoring = 'f1', 
        verbose = 1, 
        n_jobs = -1
    )
    
    clf.fit(X_train, y_train)
    print("Optimal Hyperparameter:", clf.best_params_)
    print("f1 score:", clf.best_score_)

    return clf

In [93]:
clf = fit(X_train, y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
Optimal Hyperparameter: {'logisticregression__penalty': 'l2', 'logisticregression__C': 0.01}
f1 score: 0.7954489954489955


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
3 fits failed out of a total of 6.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failu

In [78]:
y_test_pred = clf.best_estimator_.predict(X_test)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print('Test accuracy :', test_accuracy)
print('Test F1 :', test_f1)

Test accuracy : 0.8311546840958606
Test F1 : 0.8499515972894481
