# 뇌졸증 데이터 활용하여 로지스틱 회귀분석해보기

In [30]:
import os 
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import statsmodels.api as sm
import matplotlib.pyplot as plt
import itertools
import time

# Feature 설명
- hypertension : 고혈압 여부
- avg_glucose_level : 혈중 포도당 수치

In [179]:
stroke = pd.read_csv('./data/healthcare-dataset-stroke-data.csv')
stroke.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [143]:
(stroke['stroke']).value_counts()

0    4861
1     249
Name: stroke, dtype: int64

- 기존 데이터의 결과값으로는 5.12%의 인원이 뇌졸증 환자임.

In [144]:
# 의미 없는 변수 제거
stroke_p = stroke.dropna().drop(['id', 'work_type', 'Residence_type', 'ever_married'], axis=1, inplace=False)
stroke_p.head()

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,105.92,32.5,never smoked,1
3,Female,49.0,0,0,171.23,34.4,smokes,1
4,Female,79.0,1,0,174.12,24.0,never smoked,1
5,Male,81.0,0,0,186.21,29.0,formerly smoked,1


In [145]:
# 상수항 추가
stroke_p = sm.add_constant(stroke_p, has_constant='add')
stroke_p.head()

Unnamed: 0,const,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,1.0,Male,67.0,0,1,228.69,36.6,formerly smoked,1
2,1.0,Male,80.0,0,1,105.92,32.5,never smoked,1
3,1.0,Female,49.0,0,0,171.23,34.4,smokes,1
4,1.0,Female,79.0,1,0,174.12,24.0,never smoked,1
5,1.0,Male,81.0,0,0,186.21,29.0,formerly smoked,1


In [150]:
stroke_p = pd.get_dummies(stroke_p).head()

In [151]:
stroke_p.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 5
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   const                           5 non-null      float64
 1   age                             5 non-null      float64
 2   hypertension                    5 non-null      int64  
 3   heart_disease                   5 non-null      int64  
 4   avg_glucose_level               5 non-null      float64
 5   bmi                             5 non-null      float64
 6   stroke                          5 non-null      int64  
 7   gender_Female                   5 non-null      uint8  
 8   gender_Male                     5 non-null      uint8  
 9   gender_Other                    5 non-null      uint8  
 10  smoking_status_Unknown          5 non-null      uint8  
 11  smoking_status_formerly smoked  5 non-null      uint8  
 12  smoking_status_never smoked     5 non-nu

In [161]:
stroke_p.head()

Unnamed: 0,const,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.0,67.0,0,1,228.69,36.6,1,0,1,0,0,1,0,0
2,1.0,80.0,0,1,105.92,32.5,1,0,1,0,0,0,1,0
3,1.0,49.0,0,0,171.23,34.4,1,1,0,0,0,0,0,1
4,1.0,79.0,1,0,174.12,24.0,1,1,0,0,0,0,1,0
5,1.0,81.0,0,0,186.21,29.0,1,0,1,0,0,1,0,0


In [162]:
stroke_c = stroke_p.dropna().drop(['gender_Other', 'smoking_status_Unknown'], axis=1, inplace=False)

In [163]:
stroke_c.head()

Unnamed: 0,const,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.0,67.0,0,1,228.69,36.6,1,0,1,1,0,0
2,1.0,80.0,0,1,105.92,32.5,1,0,1,0,1,0
3,1.0,49.0,0,0,171.23,34.4,1,1,0,0,0,1
4,1.0,79.0,1,0,174.12,24.0,1,1,0,0,1,0
5,1.0,81.0,0,0,186.21,29.0,1,0,1,1,0,0


### 설명변수(X), 타겟변수(Y) 분리 및 학습데이터와 평가데이터

In [172]:
# stroke 여부 : 1 or 0
feature_columns = stroke_c.columns.difference(['stroke'])
x = stroke_c[feature_columns]
y = stroke_c['stroke']

In [173]:
train_x, test_x, train_y, test_y = train_test_split(x, y, stratify=y, train_size=0.7, test_size=0.3, random_state=42)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(3, 11) (2, 11) (3,) (2,)


In [178]:
### 로지스틱 회귀모형 모델링 y=f(x)
# 로지스틱 모형 적합
model = sm.Logit(train_y, train_x) 
result = model.fit(method='newton')

PerfectSeparationError: Perfect separation detected, results not available