# 12차시 범주형 변수 간의 독립성 검정(Chi-squared test)

## 01 독립성 검정 개요

### 독립성 검정의 특징

- 두 명목형 변수를 대상으로 실시하는 분석
- 독립 관점에서의 해석과 연관 관점에서의 해석이 존재
- 연속형 변수의 경우 명목형 변수로 변환 후 실시

### 가설

- 귀무가설(H0): 두 변수는 서로 독립임 (연관X)
- 대립가설(H1): 두 변수는 서로 독립이 아님 (연관O)

## 02 주요 함수 및 메서드 소개

### scipy - chi2_contingency

- scipy의 독립성 검정을 실시하는 함수
- 입력은 두 명목형 변수의 범주 조합별 빈도를 정리한 교차표(예: pd.crosstab의 결과)
- 출력은 검정통계량, p-value, 자유도, 기대도수 4개의 연산 결과가 튜플로 산출됨
- 자유도가 1인 경우(2x2 교차표) 기본 설정(correction=True)에서는 연속성 수정이 적용됨

In [25]:
import pandas as pd
from scipy.stats import chi2_contingency

In [4]:
# Read the financial_info_10k_persons CSV and preview its last five rows.
df = pd.read_csv(
    "강의자료/실습파일/financial_info_10k_persons.csv"
)
df.tail(5)

Unnamed: 0,ID,is_attrited,Age,Gender,Dependent_cnt,Edu_level,Marital_status,Income,Card,Period_m,Total_rel_cnt,Inactive_last_12m,Contacts_cnt_last_12m,Credit_limit,Total_trans_amt,Total_trans_cnt
9995,9996,1,36,M,2,Graduate,Married,$40K - $60K,Blue,18,3,1,3,7758.0,569,23
9996,9997,0,54,M,4,Graduate,Married,$60K - $80K,Blue,36,4,3,3,6905.0,1370,25
9997,9998,0,46,M,3,Uneducated,Single,$60K - $80K,Blue,36,5,1,2,5489.0,3215,64
9998,9999,0,43,M,3,Graduate,Unknown,$40K - $60K,Blue,36,2,3,3,4878.0,5021,84
9999,10000,0,45,F,3,Unknown,Single,Less than $40K,Blue,36,3,4,0,1438.3,4463,71


In [5]:
# Chi-squared test of independence between gender and marital status,
# computed on their contingency table of observed counts.
chi2_contingency(
    pd.crosstab(index=df["Gender"], columns=df["Marital_status"])
)

(4.093468963560284,
 0.2515464475739655,
 3,
 array([[ 392.8524, 2462.9028, 2067.3924,  392.8524],
        [ 346.1476, 2170.0972, 1821.6076,  346.1476]]))

In [6]:
# Unpack the four results: test statistic, p-value, degrees of freedom,
# and the array of expected frequencies.
stat, p, dof, e_val = chi2_contingency(
    pd.crosstab(index=df["Gender"], columns=df["Marital_status"])
)
# Show only the statistic and the p-value, one per line.
print(stat, p, sep="\n")

4.093468963560284
0.2515464475739655


## Q1 고객의 학력이 고졸인 경우 성별과 이탈여부를 사용하여 독립성 검정을 실시했을 때 p-value는?
1) 연속성 수정을 적용하지 않음

In [7]:
# Q1 works on its own copy of the data; preview the last five rows.
Q1 = pd.read_csv(
    "강의자료/실습파일/financial_info_10k_persons.csv"
)
Q1.tail(5)

Unnamed: 0,ID,is_attrited,Age,Gender,Dependent_cnt,Edu_level,Marital_status,Income,Card,Period_m,Total_rel_cnt,Inactive_last_12m,Contacts_cnt_last_12m,Credit_limit,Total_trans_amt,Total_trans_cnt
9995,9996,1,36,M,2,Graduate,Married,$40K - $60K,Blue,18,3,1,3,7758.0,569,23
9996,9997,0,54,M,4,Graduate,Married,$60K - $80K,Blue,36,4,3,3,6905.0,1370,25
9997,9998,0,46,M,3,Uneducated,Single,$60K - $80K,Blue,36,5,1,2,5489.0,3215,64
9998,9999,0,43,M,3,Graduate,Unknown,$40K - $60K,Blue,36,2,3,3,4878.0,5021,84
9999,10000,0,45,F,3,Unknown,Single,Less than $40K,Blue,36,3,4,0,1438.3,4463,71


In [8]:
# List the distinct education levels to find the exact label to filter on.
pd.unique(Q1["Edu_level"])

array(['High School', 'Uneducated', 'Doctorate', 'Unknown', 'Graduate',
       'Post-Graduate', 'College'], dtype=object)

In [10]:
# Keep only the customers whose education level is "High School".
Q1_HS = Q1.loc[Q1["Edu_level"].eq("High School")]
Q1_HS.tail(5)

Unnamed: 0,ID,is_attrited,Age,Gender,Dependent_cnt,Edu_level,Marital_status,Income,Card,Period_m,Total_rel_cnt,Inactive_last_12m,Contacts_cnt_last_12m,Credit_limit,Total_trans_amt,Total_trans_cnt
9972,9973,0,36,F,3,High School,Single,Unknown,Blue,23,6,3,4,2804.0,2812,77
9981,9982,0,52,F,2,High School,Single,Less than $40K,Blue,44,6,3,3,2971.0,4458,63
9982,9983,0,41,F,2,High School,Single,$40K - $60K,Blue,36,4,5,3,1561.0,4391,67
9987,9988,0,43,F,3,High School,Married,Less than $40K,Blue,28,3,3,2,2590.0,3391,66
9994,9995,0,38,F,1,High School,Married,$40K - $60K,Blue,19,1,1,2,5559.0,5344,82


In [13]:
# p-value (element 1 of the result) for gender vs. attrition among
# high-school customers, with the continuity correction turned off.
chi2_contingency(
    pd.crosstab(index=Q1_HS["Gender"], columns=Q1_HS["is_attrited"]),
    correction=False,
)[1]

0.04605478481460392

## Q2 성별과 카드 등급이 서로 관련이 있는지 독립성 검정을 실시했을 때, 해당 검정 결과의 검정통계량은 얼마인가?

In [16]:
# Q2: reload the data, then test gender against card tier.
# The first element of the displayed result is the test statistic.
Q2 = pd.read_csv(
    "강의자료/실습파일/financial_info_10k_persons.csv"
)
chi2_contingency(
    pd.crosstab(index=Q2["Gender"], columns=Q2["Card"])
)

(66.45702170623164,
 2.4470625495771945e-14,
 3,
 array([[4962.486 ,   59.0076,   10.632 ,  283.8744],
        [4372.514 ,   51.9924,    9.368 ,  250.1256]]))

## Q3 최근 12개월 중 사용 실적이 없는 기간이 3개월 이상인지 여부와 고객 이탈 사이에 관계가 있는지 독립성 검정을 실시한 결과로 옳은 것은?

In [26]:
# Q3: reload the data and derive a 0/1 flag from the inactivity count.
Q3 = pd.read_csv(
    "강의자료/실습파일/financial_info_10k_persons.csv"
)
# 1 when Inactive_last_12m is 3 or more, otherwise 0.
Q3["is_Inactive_last_12m"] = Q3["Inactive_last_12m"].ge(3).astype(int)
Q3.tail(5)

Unnamed: 0,ID,is_attrited,Age,Gender,Dependent_cnt,Edu_level,Marital_status,Income,Card,Period_m,Total_rel_cnt,Inactive_last_12m,Contacts_cnt_last_12m,Credit_limit,Total_trans_amt,Total_trans_cnt,is_Inactive_last_12m
9995,9996,1,36,M,2,Graduate,Married,$40K - $60K,Blue,18,3,1,3,7758.0,569,23,0
9996,9997,0,54,M,4,Graduate,Married,$60K - $80K,Blue,36,4,3,3,6905.0,1370,25,1
9997,9998,0,46,M,3,Uneducated,Single,$60K - $80K,Blue,36,5,1,2,5489.0,3215,64,0
9998,9999,0,43,M,3,Graduate,Unknown,$40K - $60K,Blue,36,2,3,3,4878.0,5021,84,1
9999,10000,0,45,F,3,Unknown,Single,Less than $40K,Blue,36,3,4,0,1438.3,4463,71,1


In [27]:
# Test the inactivity flag against attrition.
# NOTE: both variables are binary, so the table has one degree of freedom
# and the default correction=True applies Yates' continuity correction.
chi2_contingency(
    pd.crosstab(index=Q3["is_Inactive_last_12m"], columns=Q3["is_attrited"])
)

(210.74834292621756,
 9.431916381955241e-48,
 1,
 array([[4606.4112,  869.5888],
        [3805.5888,  718.4112]]))