# 분류 : 로지스틱 회귀분석

In [1]:
import pandas as pd
import numpy as np
from statsmodels.api import Logit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
df = pd.read_csv('../datasets/iris.csv')
df.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [3]:
df['Species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [4]:
df['is_setosa'] = (df["Species"] == "setosa") + 0
df.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species,is_setosa
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1


In [6]:
model = Logit(endog=df['is_setosa'], exog=df.iloc[:, :2]).fit()

Optimization terminated successfully.
         Current function value: 0.036374
         Iterations 11


In [7]:
model.params

Sepal.Length    -7.529945
Sepal.Width     13.130734
dtype: float64

In [8]:
model.pvalues

Sepal.Length    0.000828
Sepal.Width     0.000989
dtype: float64

In [9]:
model.tvalues

Sepal.Length   -3.343109
Sepal.Width     3.293594
dtype: float64

In [11]:
pred = model.predict(df.iloc[:3, :2])
pred

0    0.999477
1    0.923824
2    0.998678
dtype: float64

In [13]:
(pred > 0.5) + 0

0    1
1    1
2    1
dtype: int32

In [14]:
model = LogisticRegression(random_state=123)
model.fit(X=df.iloc[:, :2], y=df['is_setosa'])
model

In [15]:
model.coef_

array([[-3.38829757,  3.1645277 ]])

In [18]:
model.intercept_

array([8.32330389])

In [20]:
pred = model.predict_proba(df.iloc[:3, :2])
pred = pred[:, 1]
pred

array([0.89272024, 0.77104635, 0.92586179])

In [21]:
(pred > 0.5) + 0    # 경계값, threshold를 0.5로 설정 -> 0.5 이상이면 1이라는 의미

array([1, 1, 1])

In [22]:
pred = model.predict_proba(df.iloc[:, :2])
pred = pred[:, 1]
pred[:10]

array([0.89272024, 0.77104635, 0.92586179, 0.92738323, 0.94126096,
       0.91436651, 0.97058885, 0.89484454, 0.93034007, 0.82210603])

In [23]:
from sklearn.metrics import roc_auc_score

In [24]:
roc_auc_score(y_true=df['is_setosa'], y_score=pred)

1.0

In [31]:
accuracy_score(y_true=df['is_setosa'], y_pred=(pred > 0.9) + 0)

KeyError: 'is_setosa'

### 1. 독립변수를 혈압, 혈당, BMI, 인슐린으로 하고 종속변수를 당뇨 여부로 할 경우 분류 정확도는 얼마인가?

diabetes.csv 파일 사용
statsmodels 함수 사용
데이터는 학습:평가 = 8:2로 분리 후 계산
Seed는 123

In [28]:
df = pd.read_csv('../datasets/diabetes.csv')
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [29]:
from sklearn.model_selection import train_test_split

In [30]:
df_train, df_test = train_test_split(df, train_size=0.8, random_state=123)

In [39]:
model = Logit(endog=df_train['Outcome'], exog=df_train.iloc[:, [2, 1, 5, 4]]).fit()

Optimization terminated successfully.
         Current function value: 0.626579
         Iterations 5


In [44]:
pred = model.predict(df_test.iloc[:, [2, 1, 5, 4]])
pred[:4]

  pred[:4]


236    0.462956
395    0.507051
36     0.359735
210    0.314389
dtype: float64

In [46]:
pred_class = (pred > 0.5) + 0
pred_class[:4]

  pred_class[:4]


236    0
395    1
36     0
210    0
dtype: int32

In [47]:
accuracy_score(y_pred=pred_class, y_true=df_test['Outcome'])

0.7012987012987013

### 2. 독립변수를 혈당, BMI, 나이로 하고 종속변수를 당뇨 여부로 할 경우 나이의 승산비는 얼마인가?

diabetes.csv 파일 사용
statsmodels 함수 사용

1. 0.02
2. 1.03
3. 1.05
4. 0.99

In [48]:
df = pd.read_csv('../datasets/diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [54]:
model = Logit(endog=df['Outcome'],
              exog=df.loc[:, ['Glucose', 'BMI', 'Age']]).fit()
model.params

Optimization terminated successfully.
         Current function value: 0.656276
         Iterations 4


Glucose    0.009368
BMI       -0.035639
Age       -0.012898
dtype: float64

In [56]:
round(np.exp(model.params), 2)

Glucose    1.01
BMI        0.96
Age        0.99
dtype: float64

### 3. 독립변수를 혈당, BMI, 나이로 하고 종속변수를 당뇨 여부로 할 경우 모델의 AUC는 얼마인가?

diabetes.csv 파일 사용
statsmodels 함수 사용

1. 0.56
2. 0.55
3. 0.54
4. 0.53

In [57]:
df = pd.read_csv('../datasets/diabetes.csv')
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [58]:
model = Logit(endog=df['Outcome'], exog=df.loc[:, ['Glucose', 'BMI', 'Age']]).fit()
model.params

Optimization terminated successfully.
         Current function value: 0.656276
         Iterations 4


Glucose    0.009368
BMI       -0.035639
Age       -0.012898
dtype: float64

In [60]:
pred = model.predict(df.loc[:, ['Glucose', 'BMI', 'Age']])[:5]
pred

0    0.387961
1    0.365506
2    0.615678
3    0.392087
4    0.336654
dtype: float64

In [64]:
roc_auc_score(y_true=df['Outcome'], y_score=pred)

ValueError: Found input variables with inconsistent numbers of samples: [768, 5]