In [1]:
import itertools
import altair as alt
import pandas as pd
import titanite as ti
import scipy as sp

print(f"Altair {alt.__version__}")
print(f"Pandas {pd.__version__}")
print(f"SciPy {sp.__version__}")
print(f"Titanite {ti.__version__}")

Altair 5.0.1
Pandas 2.0.3
SciPy 1.11.1
Titanite 0.2.0


データを読み込む

In [12]:
f_cfg = "../sandbox/config.toml"
f_csv = "../data/test_data/prepared_data.csv"
d = ti.Data(read_from=f_csv, load_from=f_cfg)
data = d.read()
# data.info()

[32m2023-07-22 15:41:14.033[0m | [1mINFO    [0m | [36mtitanite.preprocess[0m:[36mcategorical_data[0m:[36m123[0m - [1mCategorize[0m
[32m2023-07-22 15:41:14.051[0m | [1mINFO    [0m | [36mtitanite.preprocess[0m:[36mbinned_data[0m:[36m229[0m - [1mBinned[0m


In [30]:
#data.dtypes

In [35]:
data["q01"].cat.categories

Index(['10s', '20s', '30s', '40s', '50s', '60s', '70s', '80s', '90s+',
       'Prefer not to answer'],
      dtype='object')

In [41]:
for h in data.columns:
    t = data[h].dtype
    if t == "category":
        c = data[h].cat.categories
        print(f"{h} = {c}")

q01 = Index(['10s', '20s', '30s', '40s', '50s', '60s', '70s', '80s', '90s+',
       'Prefer not to answer'],
      dtype='object')
q02 = Index(['Male', 'Female', 'Non-binary', 'Prefer to self-identify',
       'Prefer not to answer'],
      dtype='object')
q03 = Index(['Europe / North Europe', 'Europe / West Europe',
       'Europe / Central Europe', 'Europe / East Europe',
       'Europe / South Europe', 'Asia / Japan', 'Asia / Eastern Asia',
       'Asia / South-Eastern Asia', 'Asia / Southern Asia',
       'Asia / Central Asia', 'Asia / Western Asia', 'America / North America',
       'America / Central America', 'America / South America',
       'Oceania / Oceania', 'Africa / Northern Africa',
       'Africa / Western Africa', 'Africa / Middle Africa',
       'Africa / Eastern Africa', 'Africa / Southern Africa',
       'Prefer not to answer / Prefer not to answer'],
      dtype='object')
q04 = Index(['Europe / North Europe', 'Europe / West Europe',
       'Europe / Central Europe'

クロス集計してカイ二乗検定する

In [64]:
h0 = "q02"
h1 = "q03"
v = "count"

# data[h0].dtype
# data[h1].dtype

In [65]:
ctab = pd.crosstab(data[h0], data[h1])
#ctab.columns

クロス集計した結果をしてロングデータに変換する（グラフ作成のための準備）
- ``reset_index``するときに、カラムの型情報が抜けてしまった
- カラム名が同じなので、元データのカテゴリ型を使って、上書きする

In [69]:
melted = ctab.reset_index().melt(
    id_vars=h0,
    var_name=h1,
    #value_vars=h0,
    value_name=v,
    )
#ti.categorical_data(melted, category)
melted[h0] = melted[h0].astype(data[h0].dtype)
melted[h1] = melted[h1].astype(data[h1].dtype)
#melted["q03"] = melted["q03"].astype("category")
melted.info()
#ctab.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   q02     85 non-null     category
 1   q03     85 non-null     category
 2   count   85 non-null     int64   
dtypes: category(2), int64(1)
memory usage: 1.9 KB


In [77]:
from scipy.stats import chi2_contingency

def crosstab(data: pd.DataFrame, header: tuple):
    h0 = header[0]
    h1 = header[1]
    v = "count"

    # クロス集計してカイ二乗検定
    ctab = pd.crosstab(data[h0], data[h1])
    chi2 = chi2_contingency(ctab)

    # ロングデータに変換
    melted = ctab.reset_index().melt(id_vars=h0, var_name=h1, value_name=v)
    # 元データのカテゴリ型情報を付け直す
    melted[h0] = melted[h0].astype(data[h0].dtype)
    melted[h1] = melted[h1].astype(data[h1].dtype)

    # グラフを作成
    base = alt.Chart(melted).encode(
        alt.X(h1),
        alt.Y(h0),
    )

    mark = base.mark_rect().encode(
        alt.Color(v),
    )

    text = base.mark_text().encode(alt.Text(v))

    chart = (mark + text).properties(
        width=800,
        height=800,
    )
    return ctab, chi2, chart


In [78]:
ctab, chi2, chart = crosstab(data, ("q01", "q02"))

In [79]:
ctab.info()

<class 'pandas.core.frame.DataFrame'>
CategoricalIndex: 9 entries, 20s to Prefer not to answer
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   Male                     9 non-null      int64
 1   Female                   9 non-null      int64
 2   Non-binary               9 non-null      int64
 3   Prefer to self-identify  9 non-null      int64
 4   Prefer not to answer     9 non-null      int64
dtypes: int64(5)
memory usage: 749.0 bytes


In [80]:
melted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   q01     45 non-null     category
 1   q02     45 non-null     category
 2   count   45 non-null     int64   
dtypes: category(2), int64(1)
memory usage: 1.1 KB


In [81]:
chart

In [82]:
chi2.statistic
chi2.pvalue
chi2.dof

32

クロス集計に使わないカラム

- 自由記述
- 感情分析の数値（極性と主観度）
- 回答
- 回答日時

In [83]:
ignored = [
        "q15",
        "q15_ja",
        "q15_polarity",
        "q15_subjectivity",
        "q16",
        "q16_ja",
        "q16_polarity",
        "q16_subjectivity",
        "q18",
        "q18_ja",
        "q18_polarity",
        "q18_subjectivity",
        "q20",
        "q20_ja",
        "q20_polarity",
        "q20_subjectivity",
        "q21",
        "q21_ja",
        "q21_polarity",
        "q21_subjectivity",
        "q22",
        "q22_ja",
        "q22_polarity",
        "q22_subjectivity",
        "response",
        "timestamp",
    ]

In [97]:
headers = []
for h in sorted(data.columns):
    if h not in ignored:
        headers.append(h)
# headers = [header for header in sorted(data.columns) if header not in ignored]

matches = list(itertools.combinations(headers, 2))
matches[:10]

[('q01', 'q02'),
 ('q01', 'q03'),
 ('q01', 'q03_regional'),
 ('q01', 'q03_subregional'),
 ('q01', 'q04'),
 ('q01', 'q04_regional'),
 ('q01', 'q04_subregional'),
 ('q01', 'q05'),
 ('q01', 'q06'),
 ('q01', 'q07')]

In [102]:
ctabs = []
chi2s = []
charts = []
for m in matches[:10]:
    ctab, chi2, chart = crosstab(data, m)
    ctabs.append(ctab)
    chi2s.append(chi2)
    charts.append(chart)

In [103]:
len(ctabs)
len(chi2s)
len(charts)

10

In [108]:
i = 4
display(ctabs[i])
display(chi2s[i].pvalue)
charts[i]

q04,Europe / North Europe,Europe / West Europe,Europe / Central Europe,Europe / East Europe,Europe / South Europe,Asia / Japan,Asia / Eastern Asia,Asia / South-Eastern Asia,Asia / Southern Asia,Asia / Central Asia,Asia / Western Asia,America / North America,America / Central America,America / South America,Oceania / Oceania,Africa / Eastern Africa,Africa / Southern Africa,Prefer not to answer / Prefer not to answer
q01,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20s,1,13,25,7,8,13,2,1,6,2,0,10,1,7,5,0,1,1
30s,5,17,15,0,17,8,3,2,3,1,0,7,0,5,0,2,0,2
40s,1,5,0,0,7,11,1,0,1,0,2,8,0,2,0,0,1,0
50s,1,3,2,0,5,8,0,1,0,0,0,5,1,0,1,0,0,0
60s,0,1,0,0,2,9,1,1,0,0,0,3,0,1,1,0,0,1
70s,0,0,1,0,0,1,0,0,0,0,0,2,0,1,0,0,0,0
80s,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
90s+,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0
Prefer not to answer,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


2.0893383885838474e-11