# 3차시 가설검정: Two Way ANOVA

## 01 이원 분산 분석(Two Way ANOVA) 개요

### 이원 분산 분석의 특징

- 수치형 종속변수 1개와 명목형 독립변수 2개가 있을 때 실시하는 분석
- 두 요인이 서로 영향을 주고받으면서 나타나는 교호작용 효과(interaction effect) 확인 가능

### 가설 - 주요 효과(main effect)

- 귀무가설(H0): 각 요인의 수준(집단) 간 평균이 같음
- 대립가설(H1): 평균이 같지 않은 집단이 한 쌍 이상 존재

### 가설 - 교호작용 효과(interaction effect)

- 귀무가설(H0): 요인 간 교호작용이 없음
- 대립가설(H1): 요인 간 교호작용이 있음

## 02 데이터 소개

### 피마족 인디언 당뇨병 발병 데이터 - diabetes.csv

- 피마(Pima)족 인디언의 당뇨 발병 여부와 관련된 데이터
- 자료는 768명의 21세 이상 여성 데이터로만 구성되어 있음

## 03 주요 함수 및 메서드 소개

### 이원 분산 분석 - ols()

- statsmodels의 ols()로 선형 모형을 적합(fit)한 뒤, anova_lm()에 전달하여 분산 분석표를 생성
- 이원 분산 분석에서는 두 요인의 교호작용 항을 모형식에 포함하기 위해 ':'를 사용 (예: `y ~ A + B + A:B`)

In [1]:
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [2]:
# Load the diamonds practice dataset and preview its first rows.
df = pd.read_csv("실습파일/diamonds.csv")
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# Two-way ANOVA on diamond price: main effects of cut and color plus their
# interaction (the `cut:color` term).
formula = "price ~ cut + color + cut:color"
model = ols(formula=formula, data=df).fit()
# NOTE(review): no `typ` is passed, so anova_lm uses its library default
# (Type I / sequential sums of squares per the statsmodels docs) — confirm
# this is intended if the group sizes are unbalanced.
anova_lm(model)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
cut,4.0,11041750000.0,2760436000.0,181.405097,1.095733e-154
color,6.0,25507040000.0,4251174000.0,279.370558,0.0
cut:color,24.0,1653455000.0,68893960.0,4.527442,1.00078e-12
Residual,53905.0,820270900000.0,15216970.0,,


## Q1 다음 중 종속변수를 registered, 독립변수를 season, holiday로 지정했을 때 분석 결과 해석으로 틀린 것은?

In [4]:
# Load the bike-sharing practice dataset used by Q1 and Q2 and preview it.
Q1 = pd.read_csv("실습파일/bike.csv")
Q1.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [5]:
# Q1, first attempt: season and holiday enter the formula as plain columns.
# NOTE: these columns are integer-coded, so without C(...) each term gets a
# single degree of freedom in the resulting table (a numeric slope rather
# than one effect per category). Compare with the C(...) version that follows.
formula = "registered ~ season + holiday + season:holiday"
model = ols(formula=formula, data=Q1).fit()
anova_lm(model)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
season,1.0,6679598.0,6679598.0,301.051911,1.522103e-66
holiday,1.0,165077.1,165077.1,7.440086,0.006388913
season:holiday,1.0,27856.61,27856.61,1.255508,0.2625279
Residual,10882.0,241444700.0,22187.53,,


In [6]:
# Q1, correct specification: wrapping both factors in C(...) makes the model
# treat them as categorical, so season contributes 3 degrees of freedom.
formula = "registered ~ C(season) + C(holiday) + C(season):C(holiday)"
model = ols(formula=formula, data=Q1).fit()
anova_lm(model)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(season),3.0,10990190.0,3663398.0,168.072567,1.646504e-106
C(holiday),1.0,137039.0,137039.0,6.287195,0.01217579
C(season):C(holiday),3.0,87364.84,29121.61,1.336067,0.2606397
Residual,10878.0,237102600.0,21796.53,,


## Q2 다음 중 종속변수를 registered, 독립변수를 season, weather로 지정했을 때 분석 결과 해석으로 틀린 것은?

In [9]:
# Q2: two-way ANOVA of registered rentals on season and weather, both treated
# as categorical via C(...), including their interaction term.
formula = "registered ~ C(season) + C(weather) + C(season):C(weather)"
model = ols(formula=formula, data=Q1).fit()
# Round the ANOVA table to three decimals for readability.
anova_lm(model).round(decimals=3)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(season),3.0,10990190.0,3663398.174,170.414,0.0
C(weather),3.0,3216611.0,1072203.668,49.877,0.0
C(season):C(weather),9.0,376518.4,41835.373,1.946,0.041
Residual,10873.0,233737600.0,21497.065,,


## Q3 다음 중 종속변수를 BMI, 독립변수를 연령대와 임신여부로 지정했을 때 교호작용 효과의 p-value는?

1) 나이가 70세 이상이거나 BMI가 0인 것은 제외하시오.  
2) 연령대는 10대, 20대 등 10살 단위로 구분하시오. ex. 10->1, 29->2

In [17]:
# Load the diabetes practice dataset for Q3 and display it.
Q3 = pd.read_csv("실습파일/diabetes.csv")
Q3

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [21]:
# Q3, condition 1: drop rows with Age of 70 or more and rows whose BMI is not
# positive (BMI recorded as 0).
valid_rows = (Q3["Age"] < 70) & (Q3["BMI"] > 0)
Q3 = Q3[valid_rows]
Q3

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [24]:
# Derive the two grouping factors used in the Q3 model.
# Q3 was produced by boolean-mask filtering, so assigning columns in place
# (Q3["preg"] = ...) can raise pandas' SettingWithCopyWarning. assign()
# returns a new DataFrame instead and leaves the filtered frame untouched.
Q3 = Q3.assign(
    # Pregnancy indicator: 1 if Pregnancies > 0, otherwise 0.
    preg=(Q3["Pregnancies"] > 0).astype(int),
    # Age decade via floor division, e.g. 29 -> 2, 31 -> 3.
    Age_g=Q3["Age"] // 10,
)
Q3

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,preg,Age_g
0,6,148,72,35,0,33.6,0.627,50,1,1,5
1,1,85,66,29,0,26.6,0.351,31,0,1,3
2,8,183,64,0,0,23.3,0.672,32,1,1,3
3,1,89,66,23,94,28.1,0.167,21,0,1,2
4,0,137,40,35,168,43.1,2.288,33,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0,1,6
764,2,122,70,27,0,36.8,0.340,27,0,1,2
765,5,121,72,23,112,26.2,0.245,30,0,1,3
766,1,126,60,0,0,30.1,0.349,47,1,1,4


In [25]:
# Q3: two-way ANOVA of BMI on pregnancy status and age decade, both treated
# as categorical via C(...), including their interaction term.
formula = "BMI ~ C(preg) + C(Age_g) + C(preg):C(Age_g)"
model = ols(formula=formula, data=Q3).fit()
anova_lm(model).round(decimals=3)
# Interaction p-value is 0.131, which is greater than 0.05, so the null
# hypothesis is not rejected: no statistically significant interaction.

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(preg),1.0,959.85,959.85,21.241,0.0
C(Age_g),4.0,1144.912,286.228,6.334,0.0
C(preg):C(Age_g),4.0,321.231,80.308,1.777,0.131
Residual,744.0,33619.523,45.188,,
