In [1]:
from src.data_loader import repo_root_from_cwd, load_features, load_catalog
from src.clustering import ClusterConfig, fit_clusters, cluster_profiles

REPO_ROOT = repo_root_from_cwd()
X_core = load_features(REPO_ROOT, "X_core")

X_core.shape

(19925, 83)

In [2]:
cfg = ClusterConfig(method="kmeans", k=12, use_cosine=True, random_state=42)
result = fit_clusters(X_core, cfg)
labels = result["labels"]

print("silhouette:", result["silhouette"])
labels.value_counts().head()

silhouette: 0.0915506643090883


cluster
6     2279
8     2028
11    1914
7     1901
9     1728
Name: count, dtype: int64

In [3]:
OUT_LABELS = REPO_ROOT / "data" / "processed" / "clusters_kmeans_k12.parquet"
labels.to_frame("cluster").to_parquet(OUT_LABELS, index=True)
OUT_LABELS

PosixPath('/Users/jonaskorganas/coding_projects/streaming_similarity/data/processed/clusters_kmeans_k12.parquet')

In [4]:
profiles = cluster_profiles(X_core, labels, top_n=12)
out_summary = REPO_ROOT / "data" / "processed" / "cluster_summary.csv"
profiles.to_csv(out_summary, index=False)
out_summary

PosixPath('/Users/jonaskorganas/coding_projects/streaming_similarity/data/processed/cluster_summary.csv')

In [5]:
profiles.head(20)

Unnamed: 0,cluster,feature,mean
0,0,Type_Movie,1.0
1,0,MovieMins:60_80,0.942478
2,0,Year:2010_2020,0.576696
3,0,NCast:1_3,0.471239
4,0,Genre:Other,0.340708
5,0,Year:2020_2030,0.226401
6,0,Genre:Comedy,0.207227
7,0,Country:United States,0.192478
8,0,NCast:3_6,0.185103
9,0,Genre:Special Interest,0.153392


In [6]:
catalog = load_catalog(REPO_ROOT)
catalog["cluster"] = labels

catalog[["title", "platform", "type", "release_year", "cluster"]].sample(10)

Unnamed: 0,title,platform,type,release_year,cluster
15707,My Little Pony Equestria Girls: Forgotten Frie...,Netflix,Movie,2018,
724,The Bumble Nums - Season 4,Amazon Prime,Movie,2020,
19738,Tree Man,Netflix,Movie,2015,
14272,Seven and a half dates,Netflix,Movie,2018,
7108,Manmadhan,Amazon Prime,Movie,2004,
6807,Debug,Amazon Prime,Movie,2015,
19740,Tremors 2: Aftershocks,Netflix,Movie,1995,
19080,Scream 3,Netflix,Movie,2000,
11161,Jaws 3,Netflix,Movie,1983,
8725,Act 6 - Series 1,Amazon Prime,TV Show,2021,
