In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import ast

DATA = Path("data")
REPORTS = Path("reports"); REPORTS.mkdir(parents=True, exist_ok=True)

# ---- Load sources
review   = pd.read_csv(DATA / "shows_for_review.csv")                 # id, name, genres, original_summary, ai_summary
clustered = pd.read_parquet(DATA / "shows_with_cluster_labels.parquet")# id, name, u1, u2, cluster, cluster_label
classified = pd.read_parquet(DATA / "shows_classified.parquet")        # id, name, pred_category, pred_score, ...

# ---- Ensure genres is list-like
def to_list_maybe(x):
    if isinstance(x, list): return x
    if isinstance(x, str):
        s = x.strip()
        if s.startswith("[") and s.endswith("]"):
            try: return list(ast.literal_eval(s))
            except Exception: pass
        return [t.strip() for t in s.split(",")] if s else []
    return []

review["genres"] = review["genres"].apply(to_list_maybe)

# ---- Merge into a single master table
cols_keep = ["id","name","genres","original_summary","ai_summary"]
master = review[cols_keep].merge(
    clustered[["id","cluster","cluster_label","u1","u2"]],
    on="id", how="left"
).merge(
    classified[["id","pred_category","pred_score"]],
    on="id", how="left"
)

# ---- Nice per-show view & export
per_show = master.copy()
per_show["genres_joined"] = per_show["genres"].apply(lambda g: ", ".join(g) if isinstance(g, list) else "")
per_show_view = per_show[[
    "id","name","genres_joined","pred_category","pred_score","cluster","cluster_label"
]].sort_values(["pred_category","cluster","name"])

out_csv = DATA / "shows_comparison_per_show.csv"
per_show_view.to_csv(out_csv, index=False, encoding="utf-8")
print(f"‚úÖ Per-show comparison saved ‚Üí {out_csv}")

# ---- Helper: crosstab utilities
def cross_share(df, rows, cols):
    ct = pd.crosstab(df[rows], df[cols])
    share = ct.div(ct.sum(axis=1).replace(0, np.nan), axis=0).fillna(0)
    return ct, share

# We need a row per (show, genre) for genre comparisons
exploded = master.explode("genres")

# ---- 1) Category ‚Üî Genres
cat_gen_ct, cat_gen_share = cross_share(exploded, "pred_category", "genres")
cat_gen_ct.to_csv(DATA / "ct_category_by_genre_counts.csv")
cat_gen_share.to_csv(DATA / "ct_category_by_genre_share.csv")
print("üìä Saved: category √ó genre (counts + row-normalized share)")

# ---- 2) Category ‚Üî Cluster
cat_cluster_ct, cat_cluster_share = cross_share(master, "pred_category", "cluster_label")
cat_cluster_ct.to_csv(DATA / "ct_category_by_cluster_counts.csv")
cat_cluster_share.to_csv(DATA / "ct_category_by_cluster_share.csv")
print("üìä Saved: category √ó cluster (counts + row-normalized share)")

# ---- 3) Genres ‚Üî Cluster
genre_cluster_ct, genre_cluster_share = cross_share(exploded, "genres", "cluster_label")
genre_cluster_ct.to_csv(DATA / "ct_genre_by_cluster_counts.csv")
genre_cluster_share.to_csv(DATA / "ct_genre_by_cluster_share.csv")
print("üìä Saved: genre √ó cluster (counts + row-normalized share)")

# ---- On-screen previews (top 10 rows)
print("\n=== Category √ó Genre (share) ‚Äî preview ===")
print(cat_gen_share.head(10).round(3).to_string())

print("\n=== Category √ó Cluster (share) ‚Äî preview ===")
print(cat_cluster_share.round(3).to_string())

print("\n=== Genre √ó Cluster (share) ‚Äî preview ===")
print(genre_cluster_share.head(20).round(3).to_string())

# ---- Optional: top examples per category/genre/cluster for quick reading
def top_examples(frame, key_col, k=5):
    rows = []
    for key, sub in frame.groupby(key_col):
        ex = sub.sort_values("pred_score", ascending=False).head(k)["name"].tolist() if "pred_score" in sub else sub.head(k)["name"].tolist()
        rows.append({"key": key, "examples": ", ".join(ex)})
    return pd.DataFrame(rows).sort_values("key")

top_cat_examples = top_examples(master, "pred_category", k=5)
top_cat_examples.to_csv(DATA / "examples_per_category.csv", index=False, encoding="utf-8")
print("üóÇÔ∏è Saved: examples_per_category.csv")

top_cluster_examples = top_examples(master, "cluster_label", k=5)
top_cluster_examples.to_csv(DATA / "examples_per_cluster.csv", index=False, encoding="utf-8")
print("üóÇÔ∏è Saved: examples_per_cluster.csv")

top_genre_examples = top_examples(exploded, "genres", k=5)
top_genre_examples.to_csv(DATA / "examples_per_genre.csv", index=False, encoding="utf-8")
print("üóÇÔ∏è Saved: examples_per_genre.csv")


‚úÖ Per-show comparison saved ‚Üí data\shows_comparison_per_show.csv
üìä Saved: category √ó genre (counts + row-normalized share)
üìä Saved: category √ó cluster (counts + row-normalized share)
üìä Saved: genre √ó cluster (counts + row-normalized share)

=== Category √ó Genre (share) ‚Äî preview ===
genres         Action  Adventure  Anime  Comedy  Crime  Drama  Espionage  Family  Fantasy  History  Horror  Legal  Medical  Music  Mystery  Romance  Science-Fiction  Supernatural  Thriller    War  Western
pred_category                                                                                                                                                                                               
Comedy          0.020      0.000  0.000   0.480  0.040  0.180      0.000   0.140    0.000    0.000   0.000  0.000    0.060  0.000    0.020    0.040            0.000         0.000     0.020  0.000    0.000
Crime           0.111      0.015  0.007   0.044  0.274  0.296      0.007   0.007  

In [3]:
# ==========================================================
# üìä VISUALIZATIONS
# ==========================================================
import matplotlib.pyplot as plt
import seaborn as sns  # optional, for nice heatmaps; pip install seaborn

REPORTS.mkdir(parents=True, exist_ok=True)

# ---- Helper: top-n stack bar or heatmap
def save_heatmap(df, title, fname):
    plt.figure(figsize=(10, 6))
    sns.heatmap(df, cmap="YlGnBu", linewidths=0.5)
    plt.title(title)
    plt.tight_layout()
    out = REPORTS / fname
    plt.savefig(out, dpi=200)
    plt.close()
    print(f"‚úÖ Saved heatmap: {out}")

def save_bar(data, title, xlabel, ylabel, fname):
    plt.figure(figsize=(9, 5))
    data.plot(kind="bar")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()
    out = REPORTS / fname
    plt.savefig(out, dpi=200)
    plt.close()
    print(f"‚úÖ Saved bar chart: {out}")

# ---- 1Ô∏è‚É£ Category ‚Üî Genre
cat_gen_share = pd.read_csv(DATA / "ct_category_by_genre_share.csv", index_col=0)

# Top 10 genres per category (flat table)
cat_gen_melt = (
    cat_gen_share
    .reset_index()
    .melt(id_vars="pred_category", var_name="genre", value_name="share")
    .sort_values(["pred_category", "share"], ascending=[True, False])
)
top_genres = cat_gen_melt.groupby("pred_category").head(5)

plt.figure(figsize=(10, 6))
sns.barplot(data=top_genres, x="share", y="genre", hue="pred_category", dodge=False)
plt.title("Top Genres per Predicted Category")
plt.xlabel("Share within category")
plt.ylabel("Genre")
plt.tight_layout()
out = REPORTS / "category_by_genre_top.png"
plt.savefig(out, dpi=200)
plt.close()
print(f"‚úÖ Saved: {out}")

# ---- 2Ô∏è‚É£ Category ‚Üî Cluster (heatmap)
cat_cluster_share = pd.read_csv(DATA / "ct_category_by_cluster_share.csv", index_col=0)
save_heatmap(cat_cluster_share, "Category vs Cluster (row-normalized)", "category_by_cluster_heatmap.png")

# ---- 3Ô∏è‚É£ Genre ‚Üî Cluster (heatmap)
genre_cluster_share = pd.read_csv(DATA / "ct_genre_by_cluster_share.csv", index_col=0)
# optional: trim to top N genres to keep it readable
topN = 20
top_genres = genre_cluster_share.sum(axis=1).sort_values(ascending=False).head(topN).index
save_heatmap(genre_cluster_share.loc[top_genres], f"Top {topN} Genres vs Cluster (row-normalized)", "genre_by_cluster_heatmap.png")

# ---- 4Ô∏è‚É£ Predicted category counts (bar chart)
shows = pd.read_parquet(DATA / "shows_classified.parquet")
counts = shows["pred_category"].value_counts().sort_values(ascending=False)
save_bar(counts, "Number of Shows per Predicted Category", "Category", "Count", "category_counts.png")

# ---- 5Ô∏è‚É£ Cluster counts (bar chart)
cluster_counts = shows["cluster_label"].value_counts().sort_values(ascending=False)
save_bar(cluster_counts, "Number of Shows per Cluster (semantic labeling)", "Cluster label", "Count", "cluster_counts.png")

print("\nüé® All charts exported to:", REPORTS.resolve())


‚úÖ Saved: reports\category_by_genre_top.png
‚úÖ Saved heatmap: reports\category_by_cluster_heatmap.png
‚úÖ Saved heatmap: reports\genre_by_cluster_heatmap.png
‚úÖ Saved bar chart: reports\category_counts.png
‚úÖ Saved bar chart: reports\cluster_counts.png

üé® All charts exported to: C:\Users\brethm01\tv-nlp\src\reports


In [5]:
# ==========================================================
# üìè Robust evaluation for clustering & classification
# ==========================================================
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    normalized_mutual_info_score,
)
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
from pathlib import Path
import ast

DATA = Path("data")
shows = pd.read_parquet(DATA / "shows_classified.parquet")
print(f"‚úÖ Loaded {len(shows)} shows for evaluation")

def to_list_maybe(x):
    if isinstance(x, list): return x
    if isinstance(x, str):
        s = x.strip()
        if s.startswith("[") and s.endswith("]"):
            try: return list(ast.literal_eval(s))
            except Exception: pass
        return [t.strip() for t in s.split(",")] if s else []
    return []

# ---------- Clean subsets for each metric ----------
# A) Internal cluster metrics need UMAP coords + cluster labels
have_coords = shows[["u1","u2"]].notna().all(axis=1) if set(["u1","u2"]).issubset(shows.columns) else pd.Series(False, index=shows.index)
have_cluster = shows["cluster"].notna() if "cluster" in shows.columns else pd.Series(False, index=shows.index)
A = shows[have_coords & have_cluster].copy()

print(f"üßπ Rows with valid UMAP+cluster for internal metrics: {len(A)}")
if len(A) > 0:
    # ensure numeric
    X = A[["u1","u2"]].astype(float).values
    y_cluster = A["cluster"].astype(str).values
    n_clusters = len(pd.unique(y_cluster))
    print(f"   Distinct clusters in A: {n_clusters}")
else:
    X = np.empty((0,2)); y_cluster = np.array([]); n_clusters = 0

sil = ch = db = None
if len(A) >= 10 and n_clusters >= 2 and all((A.groupby("cluster").size() >= 2)):  # need enough points
    sil = float(silhouette_score(X, y_cluster))
    ch  = float(calinski_harabasz_score(X, y_cluster))
    db  = float(davies_bouldin_score(X, y_cluster))
    print("\nüß© Cluster internal quality:")
    print(f"   ‚Ä¢ Silhouette Score:        {sil:.3f}  (higher better)")
    print(f"   ‚Ä¢ Calinski-Harabasz Score: {ch:.1f}   (higher better)")
    print(f"   ‚Ä¢ Davies-Bouldin Score:    {db:.3f}  (lower better)")
else:
    print("\n‚ö†Ô∏è Skipping internal cluster metrics (need ‚â•10 rows with coords and ‚â•2 clusters, with ‚â•2 pts each).")

# B) Agreement: clusters ‚Üî predicted categories
B = shows.copy()
if "pred_category" not in B.columns:
    raise ValueError("Missing 'pred_category'. Run classification first.")
B["pred_category"] = B["pred_category"].fillna("None")
B = B[B["cluster"].notna()] if "cluster" in B.columns else B.iloc[0:0]
ari = nmi = None
if len(B) >= 2 and B["cluster"].nunique() >= 2 and B["pred_category"].nunique() >= 2:
    le_cluster = LabelEncoder().fit_transform(B["cluster"].astype(str))
    le_category = LabelEncoder().fit_transform(B["pred_category"].astype(str))
    ari = float(adjusted_rand_score(le_cluster, le_category))
    nmi = float(normalized_mutual_info_score(le_cluster, le_category))
    print("\nüéØ Alignment: Clusters ‚Üî Predicted Categories")
    print(f"   ‚Ä¢ ARI: {ari:.3f}   ‚Ä¢ NMI: {nmi:.3f}")
else:
    print("\n‚ö†Ô∏è Skipping clusters‚Üîcategories alignment (need ‚â•2 distinct labels and enough rows).")

# C) Agreement: genres ‚Üî categories / clusters (use primary genre if any)
C = shows.copy()
C["genres"] = C["genres"].apply(to_list_maybe)
C["main_genre"] = C["genres"].apply(lambda g: g[0] if isinstance(g, list) and len(g) else None)
mask = C["main_genre"].notna()

ari_gc = nmi_gc = ari_gcl = nmi_gcl = None
if mask.sum() >= 10:
    # genres ‚Üî categories
    sub = C.loc[mask].copy()
    if sub["pred_category"].notna().sum() >= 2 and sub["pred_category"].nunique() >= 2 and sub["main_genre"].nunique() >= 2:
        le_genre = LabelEncoder().fit_transform(sub["main_genre"].astype(str))
        le_cat   = LabelEncoder().fit_transform(sub["pred_category"].fillna("None").astype(str))
        ari_gc = float(adjusted_rand_score(le_genre, le_cat))
        nmi_gc = float(normalized_mutual_info_score(le_genre, le_cat))
        print("\nüé¨ Alignment: Genres ‚Üî Predicted Categories")
        print(f"   ‚Ä¢ ARI: {ari_gc:.3f}   ‚Ä¢ NMI: {nmi_gc:.3f}")
    else:
        print("\n‚ö†Ô∏è Skipping genres‚Üîcategories (not enough distinct labels).")

    # genres ‚Üî clusters
    if "cluster" in sub.columns and sub["cluster"].notna().sum() >= 2 and sub["cluster"].nunique() >= 2:
        le_cluster_g = LabelEncoder().fit_transform(sub["cluster"].astype(str))
        ari_gcl = float(adjusted_rand_score(le_genre, le_cluster_g))
        nmi_gcl = float(normalized_mutual_info_score(le_genre, le_cluster_g))
        print("\nüìö Alignment: Genres ‚Üî Clusters")
        print(f"   ‚Ä¢ ARI: {ari_gcl:.3f}   ‚Ä¢ NMI: {nmi_gcl:.3f}")
    else:
        print("\n‚ö†Ô∏è Skipping genres‚Üîclusters (not enough distinct cluster labels).")
else:
    print("\n‚ö†Ô∏è Skipping all genre-based metrics (need ‚â•10 rows with non-empty genres).")

# ---------- Save summary (allow None values) ----------
metrics_df = pd.DataFrame([{
    "silhouette": sil,
    "calinski_harabasz": ch,
    "davies_bouldin": db,
    "ari_cluster_vs_category": ari,
    "nmi_cluster_vs_category": nmi,
    "ari_genre_vs_category": ari_gc,
    "nmi_genre_vs_category": nmi_gc,
    "ari_genre_vs_cluster": ari_gcl,
    "nmi_genre_vs_cluster": nmi_gcl,
    "rows_internal_metrics": len(A),
    "rows_alignment_metrics": len(B),
    "rows_with_genres": int(mask.sum())
}])
out = DATA / "evaluation_metrics.csv"
metrics_df.to_csv(out, index=False)
print(f"\nüíæ Saved metrics to: {out.resolve()}")


‚úÖ Loaded 485 shows for evaluation
üßπ Rows with valid UMAP+cluster for internal metrics: 200
   Distinct clusters in A: 5

üß© Cluster internal quality:
   ‚Ä¢ Silhouette Score:        0.418  (higher better)
   ‚Ä¢ Calinski-Harabasz Score: 221.5   (higher better)
   ‚Ä¢ Davies-Bouldin Score:    0.839  (lower better)

üéØ Alignment: Clusters ‚Üî Predicted Categories
   ‚Ä¢ ARI: 0.192   ‚Ä¢ NMI: 0.349

‚ö†Ô∏è Skipping all genre-based metrics (need ‚â•10 rows with non-empty genres).

üíæ Saved metrics to: C:\Users\brethm01\tv-nlp\src\data\evaluation_metrics.csv


Predicted Category Confidence Evaluation

In [6]:
# ==========================================================
# üéØ Evaluate prediction confidence (pred_score) per category
# ==========================================================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

DATA = Path("data")
REPORTS = Path("reports"); REPORTS.mkdir(parents=True, exist_ok=True)

# ---- Load your classified data
shows = pd.read_parquet(DATA / "shows_classified.parquet")
print(f"‚úÖ Loaded {len(shows)} shows")

# ---- Drop missing scores
shows = shows.dropna(subset=["pred_category", "pred_score"])

# ---- Aggregate: average confidence per category
confidence_stats = (
    shows.groupby("pred_category")["pred_score"]
    .agg(["count", "mean", "median", "std", "min", "max"])
    .sort_values("mean", ascending=False)
    .reset_index()
)

# ---- Save to CSV
out_csv = DATA / "category_confidence_summary.csv"
confidence_stats.to_csv(out_csv, index=False, encoding="utf-8")
print(f"üíæ Saved: {out_csv.resolve()}")

# ---- Show top categories by average confidence
print("\nüìà Average confidence per category:")
print(confidence_stats.round(3).to_string(index=False))

# ---- Visualization 1: Average confidence (bar chart)
plt.figure(figsize=(8, 5))
sns.barplot(data=confidence_stats, x="pred_category", y="mean", palette="crest")
plt.title("Average Confidence (pred_score) per Predicted Category")
plt.ylabel("Mean pred_score (0‚Äì1)")
plt.xlabel("Predicted Category")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.savefig(REPORTS / "avg_confidence_per_category.png", dpi=200)
plt.close()
print(f"‚úÖ Saved: {REPORTS / 'avg_confidence_per_category.png'}")

# ---- Visualization 2: Boxplot of score distribution
plt.figure(figsize=(8, 5))
sns.boxplot(data=shows, x="pred_category", y="pred_score", palette="crest")
plt.title("Distribution of Confidence Scores per Category")
plt.ylabel("pred_score")
plt.xlabel("Predicted Category")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.savefig(REPORTS / "confidence_distribution_boxplot.png", dpi=200)
plt.close()
print(f"‚úÖ Saved: {REPORTS / 'confidence_distribution_boxplot.png'}")

# ---- (Optional) Quick insi


‚úÖ Loaded 485 shows
üíæ Saved: C:\Users\brethm01\tv-nlp\src\data\category_confidence_summary.csv

üìà Average confidence per category:
pred_category  count  mean  median   std   min   max
         News      1 0.444   0.444   NaN 0.444 0.444
      Reality      7 0.353   0.361 0.042 0.284 0.418
       Comedy     27 0.351   0.355 0.083 0.192 0.529
        Crime     49 0.346   0.320 0.099 0.174 0.616
         Kids      2 0.330   0.330 0.025 0.312 0.347
       Sci-Fi     22 0.302   0.302 0.058 0.208 0.410
        Drama     68 0.301   0.301 0.055 0.173 0.439
      Fantasy     20 0.296   0.291 0.077 0.187 0.467
  Documentary      3 0.288   0.276 0.062 0.232 0.355
       Sports      1 0.271   0.271   NaN 0.271 0.271



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=confidence_stats, x="pred_category", y="mean", palette="crest")


‚úÖ Saved: reports\avg_confidence_per_category.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=shows, x="pred_category", y="pred_score", palette="crest")


‚úÖ Saved: reports\confidence_distribution_boxplot.png
