In [12]:
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

In [13]:
# One-time setup (uncomment and run once in this kernel's environment):
# %pip install sentence-transformers torch

In [14]:
# Input dataset with summaries
INP = Path("data/shows_merged.parquet")

# Output directory and file paths
VEC = Path("vectors/summaries.npy")                  # embeddings array
INDEX_PATH = Path("vectors/summaries_index.parquet") # show ID/name index
VEC.parent.mkdir(parents=True, exist_ok=True)

# Model choice
MODEL_NAME = "all-MiniLM-L6-v2"  # or another Sentence Transformer model


In [15]:
def main(batch_size=256):
    print(f"\nCurrent working directory: {Path.cwd()}")
    print(f"Reading input file: {INP.resolve()}")

    # --- Load only needed columns ---
    df = pd.read_parquet(INP, columns=["ai_summary", "id", "name"])

    # --- Drop empty summaries ---
    df = df[df["ai_summary"].notna() & (df["ai_summary"].str.strip() != "")].reset_index(drop=True)
    texts = df["ai_summary"].tolist()

    print(f"Rows with ai_summary: {len(texts)}")
    if len(texts) == 0:
        print("No summaries found to embed. Check that your merge and summarization steps ran correctly.")
        return

    # --- Save an index file for later (so rows align with embeddings) ---
    df[["id", "name"]].to_parquet(INDEX_PATH, index=False)
    print(f"Saved index: {INDEX_PATH.resolve()}")

    # --- Load model and compute embeddings ---
    model = SentenceTransformer(MODEL_NAME)
    dim = model.get_sentence_embedding_dimension()
    embeddings = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Embedding", unit="show"):
        batch = texts[start : start + batch_size]
        embs = model.encode(
            batch,
            batch_size=batch_size,
            convert_to_numpy=True,
            normalize_embeddings=True
        )
        embeddings.append(embs)

    X = np.vstack(embeddings).astype(np.float32)

    # --- Save embeddings ---
    np.save(VEC, X)
    print(f"Saved embeddings to: {VEC.resolve()}")
    print(f"Shape: {X.shape}  (num_shows x dim={dim})")

if __name__ == "__main__":
    main()


Current working directory: c:\Users\brethm01\tv-nlp\src
Reading input file: C:\Users\brethm01\tv-nlp\src\data\shows_merged.parquet
Rows with ai_summary: 200
Saved index: C:\Users\brethm01\tv-nlp\src\vectors\summaries_index.parquet


Embedding: 100%|██████████| 1/1 [00:03<00:00,  3.45s/show]

Saved embeddings to: C:\Users\brethm01\tv-nlp\src\vectors\summaries.npy
Shape: (200, 384)  (num_shows x dim=384)





In [16]:
import os
print(os.path.exists("vectors"))
print(os.listdir("vectors") if os.path.exists("vectors") else "No folder")

True
['summaries.npy', 'summaries_index.parquet']


In [17]:
# Checks 

import numpy as np

X = np.load("vectors/summaries.npy")
print(type(X))
print(X.shape)

<class 'numpy.ndarray'>
(200, 384)


In [18]:
print(X[0][:10])  # first 10 numbers of the first vector

[-0.00150473  0.11482527 -0.00861445  0.03905157  0.05578645 -0.00723386
  0.04783944 -0.03702857  0.00209945 -0.02428848]


In [19]:
# attach them back to the shows 


df = pd.read_parquet("data/shows_merged.parquet")
df = df[df["ai_summary"].notna() & (df["ai_summary"].str.strip() != "")].reset_index(drop=True)

print(df[["name", "ai_summary"]].head(3))
print(X[:3])

                 name                                         ai_summary
0      Under the Dome  **Under the Dome** follows the residents of Ch...
1  Person of Interest  In "Person of Interest," a reclusive billionai...
2              Bitten  In *Bitten*, follow Elena Michaels, the world’...
[[-0.00150473  0.11482527 -0.00861445 ...  0.05430867 -0.0103502
   0.03073471]
 [-0.05603698 -0.02702111 -0.1008964  ... -0.0392012  -0.04705472
  -0.03598273]
 [-0.00344613  0.01225147 -0.0466712  ... -0.07568082  0.03041628
  -0.10293239]]
