In [1]:
from src.data_loader import repo_root_from_cwd, load_features, load_catalog
from src.clustering import ClusterConfig, fit_clusters, cluster_profiles
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd

REPO_ROOT = repo_root_from_cwd()
X_core = load_features(REPO_ROOT, "X_core")
X_core.shape

(19925, 83)

In [2]:
cfg = ClusterConfig(method="kmeans", k=12, use_cosine=True, random_state=42)
result = fit_clusters(X_core, cfg)
labels = result["labels"]

print("silhouette:", result["silhouette"])
labels.value_counts().head()

silhouette: 0.10106109345896193


cluster
7     2037
4     2028
2     1971
8     1904
10    1887
Name: count, dtype: int64

In [3]:
OUT_LABELS = REPO_ROOT / "data" / "processed" / "clusters_kmeans_k12.parquet"
labels.to_frame("cluster").to_parquet(OUT_LABELS, index=True)
OUT_LABELS

PosixPath('/Users/jonaskorganas/coding_projects/streaming_similarity/data/processed/clusters_kmeans_k12.parquet')

In [4]:
profiles = cluster_profiles(X_core, labels, top_n=12)
out_summary = REPO_ROOT / "data" / "processed" / "cluster_summary.csv"
profiles.to_csv(out_summary, index=False)
out_summary

PosixPath('/Users/jonaskorganas/coding_projects/streaming_similarity/data/processed/cluster_summary.csv')

In [5]:
profiles.head(20)

Unnamed: 0,cluster,feature,mean
0,0,Type_TV Show,1.0
1,0,Year:2010_2020,0.817579
2,0,Genre:Other,0.61194
3,0,TVSeasons:2_3,0.44942
4,0,Country:United States,0.395522
5,0,TVSeasons:3_5,0.343284
6,0,NCast:6_10,0.295191
7,0,NCast:3_6,0.21393
8,0,NCast:10_20,0.192371
9,0,TVSeasons:5_10,0.178275


In [6]:
catalog = load_catalog(REPO_ROOT)
catalog["cluster"] = labels
catalog[["title", "platform", "type", "release_year", "cluster"]].sample(10)

Unnamed: 0,title,platform,type,release_year,cluster
6660,ZETA,Amazon Prime,Movie,2016,
7263,Wilder Napalm,Amazon Prime,Movie,1993,
13284,Blippi's School Supply Scavenger Hunt,Amazon Prime,Movie,2021,
7755,The Romance of Tiger and Rose,Amazon Prime,TV Show,2020,
12082,Breathtaking Scenes from Around the World,Amazon Prime,Movie,2012,
8197,TURN: Washington's Spies,Netflix,TV Show,2017,
11426,The Big Bad Wolf,Disney+,Movie,1934,
41,Chhota Bheem & Krishna: Mayanagari,Netflix,Movie,2011,
10643,Dan Cummins: Don't Wake The Bear,Amazon Prime,Movie,2017,
1897,All Together Now,Netflix,Movie,2020,


In [None]:
REPO_ROOT = repo_root_from_cwd()
catalog = load_catalog(REPO_ROOT)
labels = pd.read_parquet(REPO_ROOT / "data" / "processed" / "clusters_kmeans_k12.parquet")  # index = title_id

# # attach cluster labels by index alignment
# catalog = catalog.join(labels, how="left")

# Make sure labels is indexed by title_id too
if "title_id" in labels.columns:
    labels = labels.set_index("title_id")
catalog["cluster"] = labels["cluster"]  # aligns by index


catalog[["title", "platform", "type", "release_year", "cluster"]].sample(10, random_state=42)

Unnamed: 0,title,platform,type,release_year,cluster
17211,Garden Store Part 2: Deserter,Amazon Prime,Movie,2017,
9895,Isa Pa with Feelings,Netflix,Movie,2019,
15340,Fort Defiance,Amazon Prime,Movie,1951,
4441,My Friend Pinto,Netflix,Movie,2011,
5762,FETCH! With Ruff Ruffman,Amazon Prime,TV Show,2006,
17001,GoShogun: The Time Etrainger,Amazon Prime,Movie,1985,
15312,Kodachrome,Netflix,Movie,2018,
16066,Buddha,Netflix,TV Show,2013,
8165,Once In A Lifetime Sessions with Noel Gallagher,Netflix,Movie,2018,
17268,Air Force One,Netflix,Movie,1997,


In [8]:
summary = pd.read_csv(REPO_ROOT / "data" / "processed" / "cluster_summary.csv")
summary.query("cluster == 0").head(12)

Unnamed: 0,cluster,feature,mean
0,0,Type_TV Show,1.0
1,0,Year:2010_2020,0.817579
2,0,Genre:Other,0.61194
3,0,TVSeasons:2_3,0.44942
4,0,Country:United States,0.395522
5,0,TVSeasons:3_5,0.343284
6,0,NCast:6_10,0.295191
7,0,NCast:3_6,0.21393
8,0,NCast:10_20,0.192371
9,0,TVSeasons:5_10,0.178275


In [9]:
summary.groupby("cluster").head(5)

Unnamed: 0,cluster,feature,mean
0,0,Type_TV Show,1.0
1,0,Year:2010_2020,0.817579
2,0,Genre:Other,0.61194
3,0,TVSeasons:2_3,0.44942
4,0,Country:United States,0.395522
12,1,Type_Movie,1.0
13,1,NCast:6_10,1.0
14,1,Genre:Drama,0.445055
15,1,MovieMins:80_100,0.443223
16,1,Year:2010_2020,0.432234


In [10]:
fig_dir = REPO_ROOT / "outputs" / "figures"
fig_dir.mkdir(parents=True, exist_ok=True)

preview = summary.query("cluster == 0").head(12)

fig, ax = plt.subplots(figsize=(10, 4))
ax.axis("off")
tbl = ax.table(
    cellText=preview.values,
    colLabels=preview.columns,
    loc="center"
)

tbl.auto_set_font_size(False)
tbl.set_fontsize(10)
tbl.scale(1, 1.4)

out_path = fig_dir / "cluster0_top_features.png"
plt.savefig(out_path, dpi=200, bbox_inches="tight")
plt.close(fig)
out_path

PosixPath('/Users/jonaskorganas/coding_projects/streaming_similarity/outputs/figures/cluster0_top_features.png')

In [16]:
catalog = load_catalog(REPO_ROOT).reset_index()  # ensure title_id column
labels = pd.read_parquet(REPO_ROOT / "data" / "processed" / "clusters_kmeans_k12.parquet")

labels = labels.reset_index()  # bring title_id into column

catalog = catalog.merge(
    labels[["title_id", "cluster"]],
    on="title_id",
    how="left"
)

out_tables = REPO_ROOT / "outputs" / "tables"
out_tables.mkdir(parents=True, exist_ok=True)

examples = (
    catalog.reset_index()
          .loc[:, ["title_id", "title", "platform", "type", "release_year", "cluster"]]
          .dropna(subset=["cluster"])
          .astype({"cluster": int})
          .groupby("cluster", group_keys=False)
          .head(10)
)

examples.to_csv(out_tables / "cluster_examples.csv", index=False)
examples.head(15)

Unnamed: 0,title_id,title,platform,type,release_year,cluster
0,0004e66cc4db,The Long Goodbye: The Kara Tippetts Story,Netflix,Movie,2019,11
1,000634383d63,The Fog (1980),Amazon Prime,Movie,1980,2
2,0007715f4696,Mr. Robot,Amazon Prime,TV Show,2019,0
3,0010ab469850,Tick Tock,Amazon Prime,Movie,2000,10
4,001278d3255e,DuckTales The Movie: Treasure of the Lost Lamp,Disney+,Movie,1990,6
5,001280e10677,Color Crew All About Colors,Amazon Prime,TV Show,2018,0
6,00199170abee,Chatô: The King of Brazil,Netflix,Movie,2015,7
7,0019918dd2b6,Makoki: A Deadly Love,Amazon Prime,Movie,2019,3
8,001af1ed17ac,Alias Grace,Netflix,TV Show,2017,4
9,001d099c0ac0,American Guns: A History of US Firearms,Amazon Prime,TV Show,2017,4


In [17]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

FIG_DIR = REPO_ROOT / "outputs" / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)

def save_table_png(df, out_path, title=None):
    # size scales with number of rows (avoids tiny unreadable tables)
    fig, ax = plt.subplots(figsize=(12, 0.6 + 0.35 * len(df)))
    ax.axis("off")
    if title:
        ax.set_title(title, pad=10)
    tbl = ax.table(cellText=df.values, colLabels=df.columns, loc="center")
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(10)
    tbl.scale(1, 1.3)
    fig.tight_layout()
    fig.savefig(out_path, dpi=200, bbox_inches="tight")
    plt.close(fig)

# 1) cluster_summary head
summary = pd.read_csv(REPO_ROOT / "data" / "processed" / "cluster_summary.csv")
save_table_png(
    summary.head(20),
    FIG_DIR / "cluster_summary_head.png",
    title="Cluster summary (head)"
)

# 2) cluster_examples head (uses the examples dataframe you already created)
save_table_png(
    examples.head(20),
    FIG_DIR / "cluster_examples_head.png",
    title="Cluster examples (head)"
)

FIG_DIR

PosixPath('/Users/jonaskorganas/coding_projects/streaming_similarity/outputs/figures')