In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pylab as plt
import scipy as sp
import scipy.stats as stats

In [3]:
"""
pd.crosstab()

범주형 변수로 되어있는 요인(factors)별로 
교차분석(cross tabulations)을 통해
행, 열 요인 기준 별로 빈도를 세어서 도수분포표(frequency table), 
교차표(contingency table)를 생성하게 해주는 method


pd.crosstab(
index= : row(index)그룹,
columns = : 행 그룹,
rownames = : 행(index)이름,
colnames = : 열이름,
margins = : row, col에 margin 추가,
normalize = : 총합으로 각 value를 나누기
)
"""

In [3]:
# making df
df = pd.DataFrame(
    {
        'id': ['id1', 'id1', 'id1', 'id2', 'id2', 'id3'],
        'fac_1': ['a', 'a', 'a', 'b', 'b', 'b'],
        'fac_2': ['d', 'd', 'd', 'c', 'c', 'd']
    }
)
df

Unnamed: 0,fac_1,fac_2,id
0,a,d,id1
1,a,d,id1
2,a,d,id1
3,b,c,id2
4,b,c,id2
5,b,d,id3


In [16]:
df.groupby(["fac_1", "fac_2"]).count().unstack("fac_2", fill_value=0)

Unnamed: 0_level_0,id,id
fac_2,c,d
fac_1,Unnamed: 1_level_2,Unnamed: 2_level_2
a,0,3
b,2,1


In [6]:
# (1) 교차표(contingency table, frequency table) 만들기 
# pd.crosstab(index, columns)
pd.crosstab(df.fac_1, df.fac_2)

fac_2,c,d
fac_1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0,3
b,2,1


In [17]:
pd.crosstab(df.id, df.fac_1)

fac_1,a,b
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id1,3,0
id2,0,2
id3,0,1


In [24]:
df.groupby(["id", "fac_1"]).size().unstack(level=1, fill_value=0)

fac_1,a,b
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id1,3,0
id2,0,2
id3,0,1


In [72]:
pd.crosstab(df.id, df.fac_2)

fac_2,c,d
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id1,0,3
id2,2,0
id3,0,1


In [74]:
pd.pivot_table(df, index="id", columns="fac_2", 
               aggfunc="size", fill_value=0)

fac_2,c,d
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id1,0,3
id2,2,0
id3,0,1


In [48]:
# (2) Multi-index, Multi-level로 교차표 만들기 
# pd.crosstab([id1, id2], [col1, col2])
pd.crosstab(index=df.id, columns=[df.fac_1, df.fac_2])

fac_1,a,b,b
fac_2,d,c,d
id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
id1,3,0,0
id2,0,2,0
id3,0,0,1


In [51]:
pd.pivot_table(df, index="id", columns=["fac_1", "fac_2"],
               aggfunc="size", fill_value=0)

fac_1,a,b,b
fac_2,d,c,d
id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
id1,3,0,0
id2,0,2,0
id3,0,0,1


In [70]:
df.groupby(["id", "fac_1", "fac_2"]).size().unstack(level=[1, 2]).fillna(0)

fac_1,a,b,b
fac_2,d,c,d
id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
id1,3.0,0.0,0.0
id2,0.0,2.0,0.0
id3,0.0,0.0,1.0


In [99]:
pd.crosstab(index=[df.fac_1, df.fac_2], columns=df.id)

Unnamed: 0_level_0,id,id1,id2,id3
fac_1,fac_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,d,3,0,0
b,c,0,2,0
b,d,0,0,1


In [96]:
pd.pivot_table(df, index=["fac_1", "fac_2"], columns="id", 
               aggfunc="size", fill_value=0)

Unnamed: 0_level_0,id,id1,id2,id3
fac_1,fac_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,d,3,0,0
b,c,0,2,0
b,d,0,0,1


In [86]:
df.groupby(["fac_1", "fac_2", "id"]).size().unstack(level=2,
                                                    fill_value=0)

Unnamed: 0_level_0,id,id1,id2,id3
fac_1,fac_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,d,3,0,0
b,c,0,2,0
b,d,0,0,1


In [89]:
# (3) 교차표의 행 이름, 열 이름 부여 : pd.crosstab(rownames=, colnames=])
pd.crosstab(index=df.id, columns=[df.fac_1, df.fac_2], 
            rownames=["ch_id"], colnames=["f1", "f2"])

f1,a,b,b
f2,d,c,d
ch_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
id1,3,0,0
id2,0,2,0
id3,0,0,1


In [102]:
# (4) 교차표의 행 합, 열 합 추가하기 : pd.crosstab(margins=True)
pd.crosstab(df.id, [df.fac_1, df.fac_2], margins=True)

fac_1,a,b,b,All
fac_2,d,c,d,Unnamed: 4_level_1
id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
id1,3,0,0,3
id2,0,2,0,2
id3,0,0,1,1
All,3,2,1,6


In [105]:
pd.crosstab(df.fac_1, df.fac_2, margins=True)

fac_2,c,d,All
fac_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,0,3,3
b,2,1,3
All,2,4,6


In [119]:
# 구성비율로 교차표 만들기 : pd.crosstab(normalize=True)
pd.crosstab(df.id, [df.fac_1, df.fac_2], normalize=True)

fac_1,a,b,b
fac_2,d,c,d
id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
id1,0.5,0.0,0.0
id2,0.0,0.333333,0.0
id3,0.0,0.0,0.166667


In [116]:
pd.crosstab(df.fac_1, df.fac_2, margins=True, normalize=True)

fac_2,c,d,All
fac_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,0.0,0.5,0.5
b,0.333333,0.166667,0.5
All,0.333333,0.666667,1.0
