# 新しいクラスターを作って分析する

- ``q01``クラスタ -> カラム名 : ``q01_clustered``
  - 40歳を境に若手／シニアにクラスター化する
  - クラスター1 : ``q01 < 40s``
  - クラスター2 : ``q01 >= 40s``
- ``q13-14``の相関クラスタ -> カラム名 : ``q13q14_clustered``
  - 女性比率が20%周辺は不満（Poor / Very Poor）、30%周辺は満足（Good / Very Good）が多い
  - クラスター1 : ``q13 < 25%`` かつ ``q14 in ["Very Poor", "Poor"]``
  - クラスター2 : ``q13 > 25%`` かつ ``q14 in ["Very Good", "Good"]``

In [None]:
import altair as alt
import pandas as pd
import scipy as sp
import titanite as ti

# Report the versions of the libraries this notebook was run with.
for label, module in (
    ("Altair", alt),
    ("Pandas", pd),
    ("SciPy", sp),
    ("Titanite", ti),
):
    print(f"{label} {module.__version__}")

In [None]:
# Input paths, written relative to the notebook's working directory.
f_cfg = "../sandbox/config.toml"
f_csv = "../data/test_data/prepared_data.csv"

# Build the titanite Data object from the CSV path and the config path.
d = ti.Data(read_from=f_csv, load_from=f_cfg)
# `config` is used further down to look up question titles
# via config.get("questions").
config = d.config()
# config  # uncomment to inspect the configuration
# `data` is the table the cells below add cluster columns to
# (presumably a pandas DataFrame — confirm against ti.Data.read).
data = d.read()
# data  # uncomment to inspect the table

In [None]:
# NOTE(review): "q01_clustered" is only assigned in a later cell of this
# notebook; unless the CSV already provides that column, this cell presumably
# fails on a fresh top-to-bottom run — confirm and consider moving it below
# the cell that creates the column.
c1, c2 = ti.core.group_hbar(data, x="q02", y="count()", color="q01_clustered")
# Combine the two returned objects with `|` (horizontal concatenation when
# they are Altair charts) and show the result as the cell output.
c1 | c2

# ``q01_clustered``

In [None]:
# Label every row by age group: "Cluster1" below "40s", "Cluster2" at or
# above "40s". Rows matching neither comparison keep the default "Others".
# NOTE(review): the threshold is the string "40s", so the split relies on how
# the q01 values are ordered — confirm the dtype of q01.
header = "q01_clustered"
data[header] = "Others"

age = data["q01"]
for label, isT in (
    ("Cluster1", age < "40s"),
    ("Cluster2", age >= "40s"),
):
    data.loc[isT, header] = label

# Cell output: number of rows per cluster label.
data[header].value_counts()

# ``q01q02_clustered``


In [None]:
# Split the rows below "40s" by the q02 answer: "Female" -> "Cluster1",
# "Male" -> "Cluster2". Every other row keeps the default "Others".
# NOTE(review): both clusters use the same `q01 < "40s"` condition — confirm
# that Cluster2 is not meant to cover `q01 >= "40s"`.
header = "q01q02_clustered"
data[header] = "Others"

# The age condition is identical for both clusters, so evaluate it once.
is_q01 = data["q01"] < "40s"
for label, answer in (("Cluster1", "Female"), ("Cluster2", "Male")):
    is_q02 = data["q02"].isin([answer])
    isT = is_q01 & is_q02
    data.loc[isT, header] = label

# Cell output: number of rows per cluster label.
data[header].value_counts()

# ``q13q14_clustered``

In [None]:
# Combine the q13 value with the q14 answer:
#   Cluster1: q13 below 25 and q14 in "Very Poor" / "Poor"
#   Cluster2: q13 above 25 and q14 in "Very Good" / "Good"
# Rows matching neither rule (including q13 exactly 25) stay "Others".
header = "q13q14_clustered"
data[header] = "Others"

ratio = data["q13"]
rating = data["q14"]
rules = (
    ("Cluster1", ratio < 25, ["Very Poor", "Poor"]),
    ("Cluster2", ratio > 25, ["Very Good", "Good"]),
)
for label, is_q13, answers in rules:
    is_q14 = rating.isin(answers)
    isT = is_q13 & is_q14
    data.loc[isT, header] = label

# Cell output: number of rows per cluster label.
data[header].value_counts()

In [None]:
# Point chart of q14 against the binned q13 values: colour shows the
# q13q14 cluster label (nominal), point size shows the row count.
alt.Chart(data).mark_point().encode(
    x="q13_binned",
    y="q14",
    color="q13q14_clustered:N",
    size="count()",
)

In [None]:
def h(data: pd.DataFrame, x: str, color: str, title="No title"):
    """Draw count bars and normalized stacked bars for one column.

    Two panels are placed side by side: on the left, bars of ``count()``
    per ``x`` value coloured by ``color``; on the right, the same bars
    stacked with ``normalize`` and overlaid with the raw counts as text.

    Args:
        data: Table passed to ``alt.Chart``.
        x: Field shown on the x axis of both panels.
        color: Field used for the colour encoding of bars and labels.
        title: Title applied to the panels.

    Returns:
        The horizontally concatenated chart with ``.interactive()`` applied.
    """
    bar_options = dict(tooltip=True, opacity=0.5)

    def normalized_count():
        # Build a fresh channel per layer so no channel object is shared.
        return alt.Y("count()").stack("normalize")

    base = alt.Chart(data).encode(alt.X(x)).properties(title=title, width=300)

    # Left panel: absolute counts.
    counts = base.mark_bar(**bar_options).encode(
        alt.Y("count()"),
        alt.Color(color),
    )

    # Right panel: normalized stack plus the raw count written on each segment.
    shares = base.mark_bar(**bar_options).encode(
        normalized_count(),
        alt.Color(color),
    )
    labels = base.mark_text(dy=10).encode(
        normalized_count(),
        alt.Text("count()"),
        alt.Color(color),
    )

    combined = counts | (shares + labels)
    return combined.interactive()

In [None]:
# Distribution of q02, coloured by the age cluster; no chart title.
h(data, x="q02", color="q01_clustered", title="")

In [None]:
# Show the loaded configuration as this cell's output.
config

In [None]:
# For every categorical column, draw the `h` chart once per cluster column.
# The chart title is looked up in the config's "questions" table, keyed by
# the part of the column name before the first underscore.
questions = config.get("questions")
cluster_columns = ("q01_clustered", "q01q02_clustered", "q13q14_clustered")

for header in d.categorical_headers:
    key = header.partition("_")[0]
    t = questions.get(key, "Could not get title.")
    # ti.core.group_hbar(data, header, "count()", "q01_clustered", title=t)
    # was tried here as an alternative rendering and is left disabled.
    for cluster in cluster_columns:
        h(data, header, cluster, title=t).display()
    # Separator line between questions in the cell output.
    print("=" * 80)