In [9]:
import os
import json
import time
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm
from openai import OpenAI

In [10]:
# load_dotenv() runs before the key lookup below; presumably it populates
# os.environ from a local .env file — confirm where OPENAI_API_KEY is defined.
load_dotenv()
# Shared API client used by make_summary(). Indexing os.environ raises KeyError
# when OPENAI_API_KEY is unset, so a missing key fails here, not mid-run.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [None]:
# All files read or written by this notebook live under one data directory.
DATA_DIR = Path("data")
INP = DATA_DIR / "raw_tvmaze.jsonl"              # raw input, read line by line
OUT = DATA_DIR / "shows.parquet"                 # final output of main()
PROGRESS = DATA_DIR / "shows_progress.parquet"   # checkpoint used to resume main()

def read_jsonl(path):
    """Lazily yield one parsed JSON value per non-blank line of a JSON Lines file.

    Args:
        path: ``pathlib.Path`` or ``str`` path to a UTF-8 encoded file.

    Yields:
        The result of ``json.loads`` for each line that contains
        non-whitespace characters, in file order.

    Raises:
        OSError: if the file cannot be opened (raised on first iteration).
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # The context manager closes the handle when the generator is exhausted,
    # closed early, or garbage-collected, instead of leaking it.
    with Path(path).open("r", encoding="utf-8") as handle:
        for line in handle:
            # Blank lines (e.g. a trailing newline at EOF) are not records.
            if not line.strip():
                continue
            yield json.loads(line)

# System prompt: fixes the copywriter persona and the requested length.
# The trailing space is part of the prompt text and is kept as-is.
SYSTEM = (
    "You are a TV guide copywriter. "
    "Write a 2–3 sentence plot summary for a TV programme "
    "so a new viewer understands the premise. "
)

# Per-show user prompt; filled via str.format(name=..., genres=..., blurb=...).
USER_TEMPLATE = "\n".join([
    "Programme name: {name}",
    "Known genres: {genres}",
    "Existing blurb (may be noisy): {blurb}",
    "",
    "Write the summary.",
])

def make_summary(name, genres, blurb, *, model="gpt-4o-mini", max_attempts=3,
                 backoff=1.5, max_output_tokens=140):
    """Ask the model for a short plot summary of one programme.

    Args:
        name: programme title inserted into USER_TEMPLATE.
        genres: iterable of genre labels, or None/empty; joined with ", ".
        blurb: existing description text (may be noisy); falsy values are
            sent as "N/A".
        model: model name passed to the API (default is the fast/cheap one).
        max_attempts: total number of API attempts; values below 1 are
            treated as 1.
        backoff: base delay in seconds; the wait after failed attempt k
            (1-based) is ``backoff * k``.
        max_output_tokens: output token cap passed to the API.

    Returns:
        The response's ``output_text`` with surrounding whitespace stripped.

    Raises:
        Whatever the final API attempt raised, once all attempts are used up.
        Errors while building the prompt are raised immediately.
    """
    # Build the prompt outside the retry loop: a formatting error is
    # deterministic, so retrying it (with sleeps) would only waste time.
    prompt = USER_TEMPLATE.format(
        name=name,
        genres=", ".join(str(genre) for genre in (genres or [])),
        blurb=blurb or "N/A",
    )
    messages = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": prompt},
    ]

    attempts = max(1, int(max_attempts))
    for attempt in range(1, attempts + 1):
        try:
            resp = client.responses.create(
                model=model,
                input=messages,
                max_output_tokens=max_output_tokens,
            )
            return resp.output_text.strip()
        except Exception:
            # Out of attempts: let the caller see the real error.
            if attempt == attempts:
                raise
            # Linear backoff before the next try (1.5s, 3.0s with defaults).
            time.sleep(backoff * attempt)

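# NOTE: the commented-out main() below is the original non-resumable version; it is
# superseded by the checkpointing main() defined later in this notebook.
#def main():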
#    rows = []
#    for rec in tqdm(read_jsonl(INP), total=None, desc="Summarising"):
#        summary = make_summary(rec["name"], rec.get("genres"), rec.get("summary_html"))
#        rows.append({
#            **rec,
#            "summary": summary
#        })
#    df = pd.DataFrame(rows)
#    df.to_parquet(OUT, index=False)
#    print(f"Wrote {OUT} with {len(df)} rows")

#if __name__ == "__main__":
#    main()




In [None]:
def main(save_every=10, max_shows=None):
    """
    Generate summaries for the records in INP, checkpointing as it goes.

    Rows already stored in PROGRESS with a non-null summary are reused and
    their ids are skipped. Rows stored with a missing summary (an earlier
    failed call) are dropped on load so they are attempted again.

    Args:
        save_every: write PROGRESS after this many newly processed shows.
        max_shows: if not None, read at most this many records from INP
            (for testing). Records skipped because they are already done
            still count towards this limit.

    Side effects:
        Always writes PROGRESS on exit (normal completion, Ctrl-C, or an
        error). Writes OUT only when the loop ran to completion.
    """
    import itertools

    rows = []
    done_ids = set()
    if PROGRESS.exists():
        existing = pd.read_parquet(PROGRESS)
        if "summary" in existing.columns:
            # A null summary marks a failed attempt, not finished work.
            existing = existing[existing["summary"].notna()]
        if not existing.empty:
            rows = existing.to_dict("records")
            if "id" in existing.columns:
                done_ids = set(existing["id"].dropna().tolist())
            print(f"Resuming from progress: {len(rows)} shows already summarized")

    completed = False
    count_since_save = 0
    failed = 0

    try:
        source_iter = read_jsonl(INP)
        if max_shows is not None:
            source_iter = itertools.islice(source_iter, max_shows)

        for rec in tqdm(source_iter, desc="Summarising", unit="show"):
            rec_id = rec.get("id")
            # Records without an id cannot be de-duplicated, so never skip them.
            if rec_id is not None and rec_id in done_ids:
                continue

            try:
                summary = make_summary(rec["name"], rec.get("genres"), rec.get("summary_html"))
            except Exception as exc:
                # Best effort: keep the row so the run continues, but report
                # the failure instead of hiding it. The null summary makes
                # the next resume retry this show.
                failed += 1
                summary = None
                print(f"Summary failed for id={rec_id!r}: {exc!r}")

            rows.append({**rec, "summary": summary})
            if rec_id is not None:
                done_ids.add(rec_id)
            count_since_save += 1

            if count_since_save >= save_every:
                pd.DataFrame(rows).to_parquet(PROGRESS, index=False)
                print(f"Saved progress at {len(rows)} shows")
                count_since_save = 0

        completed = True

    except KeyboardInterrupt:
        print("\nInterrupted by user. Saving progress...")

    finally:
        # Build the frame once and reuse it for both files.
        frame = pd.DataFrame(rows)
        frame.to_parquet(PROGRESS, index=False)
        print(f"Progress saved: {len(rows)} shows")
        if failed:
            print(f"{failed} shows have no summary; re-run to retry them.")

        if completed:
            frame.to_parquet(OUT, index=False)
            print(f"Done — wrote {OUT} with {len(rows)} rows")
        else:
            print("Re-run this script later to resume from the saved progress.")

if __name__ == "__main__":
    main(save_every=10, max_shows=200)


Resuming from progress: 20 shows already summarized


Summarising: 30show [00:24,  1.80s/show]

Saved progress at 30 shows


Summarising: 40show [00:58,  3.27s/show]

Saved progress at 40 shows


Summarising: 50show [01:24,  2.75s/show]

Saved progress at 50 shows


Summarising: 60show [01:47,  2.21s/show]

Saved progress at 60 shows


Summarising: 70show [02:11,  2.28s/show]

Saved progress at 70 shows


Summarising: 80show [02:39,  2.96s/show]

Saved progress at 80 shows


Summarising: 90show [03:03,  2.47s/show]

Saved progress at 90 shows


Summarising: 100show [03:26,  2.50s/show]

Saved progress at 100 shows


Summarising: 110show [03:48,  2.15s/show]

Saved progress at 110 shows


Summarising: 120show [04:11,  2.20s/show]

Saved progress at 120 shows


Summarising: 130show [04:34,  2.13s/show]

Saved progress at 130 shows


Summarising: 140show [05:00,  2.81s/show]

Saved progress at 140 shows


Summarising: 150show [05:24,  2.46s/show]

Saved progress at 150 shows


Summarising: 160show [05:49,  2.34s/show]

Saved progress at 160 shows


Summarising: 170show [06:16,  2.47s/show]

Saved progress at 170 shows


Summarising: 180show [06:43,  2.43s/show]

Saved progress at 180 shows


Summarising: 190show [07:05,  2.27s/show]

Saved progress at 190 shows


Summarising: 200show [07:31,  2.26s/show]

Saved progress at 200 shows
Progress saved: 200 shows
✅ Done — wrote data\shows.parquet with 200 rows





In [13]:
 

# Sanity check: load the finished output and preview its first rows.
shows_path = "data/shows.parquet"
df = pd.read_parquet(shows_path)
df.head()

Unnamed: 0,id,name,genres,summary_html,language,status,officialSite,premiered,rating,network,webChannel,summary
0,1,Under the Dome,"[Drama, Science-Fiction, Thriller]",<p><b>Under the Dome</b> is the story of a sma...,English,Ended,http://www.cbs.com/shows/under-the-dome/,2013-06-24,6.5,CBS,,**Under the Dome** follows the residents of Ch...
1,2,Person of Interest,"[Action, Crime, Science-Fiction]",<p>You are being watched. The government has a...,English,Ended,http://www.cbs.com/shows/person_of_interest/,2011-09-22,8.8,CBS,,"In ""Person of Interest,"" a reclusive billionai..."
2,3,Bitten,"[Drama, Horror, Romance]",<p>Based on the critically acclaimed series of...,English,Ended,http://bitten.space.ca/,2014-01-11,7.4,CTV Sci-Fi Channel,,"In *Bitten*, follow Elena Michaels, the world’..."
3,4,Arrow,"[Drama, Action, Science-Fiction]","<p>After a violent shipwreck, billionaire play...",English,Ended,http://www.cwtv.com/shows/arrow,2012-10-10,7.4,The CW,,"In ""Arrow,"" billionaire playboy Oliver Queen r..."
4,5,True Detective,"[Drama, Crime, Thriller]",<p>Touch darkness and darkness touches you bac...,English,Running,https://www.max.com/shows/true-detective/9a4a3...,2014-01-12,8.1,HBO,,"In **True Detective**, each season unveils a g..."


In [17]:
import pandas as pd
import json
from bs4 import BeautifulSoup

# Load raw data. Blank lines (e.g. a trailing newline) are skipped so they
# do not crash json.loads.
with open("data/raw_tvmaze.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]
df_raw = pd.DataFrame(data)


def _html_to_text(html):
    """Return the visible text of an HTML fragment; non-string input gives ''."""
    # A record lacking the key shows up as NaN in the column; NaN is truthy,
    # so `html or ""` would pass a float to the parser. Check the type instead.
    if not isinstance(html, str):
        return ""
    return BeautifulSoup(html, "html.parser").get_text().strip()


# Clean HTML
df_raw["summary_text"] = df_raw["summary_html"].apply(_html_to_text)

# Load AI summaries
df_ai = pd.read_parquet("data/shows.parquet")

# Name the columns before merging, so the result does not depend on merge
# suffixes. Keep one summary per id so the left join cannot multiply rows.
ai_summaries = (
    df_ai[["id", "summary"]]
    .rename(columns={"summary": "ai_summary"})
    .drop_duplicates(subset="id", keep="last")
)

# Merge on ID; the left join keeps every raw show, with a missing ai_summary
# for shows that have not been summarised.
merged = pd.merge(
    df_raw.rename(columns={"summary_text": "original_summary"}),
    ai_summaries,
    on="id",
    how="left",
)

# Save merged dataset
merged.to_parquet("data/shows_merged.parquet", index=False)
print(f"Merged file saved: data/shows_merged.parquet with {len(merged)} shows")

# Quick check
merged[["name", "genres", "original_summary", "ai_summary"]].head()

Merged file saved: data/shows_merged.parquet with 485 shows


Unnamed: 0,name,genres,original_summary,ai_summary
0,Under the Dome,"[Drama, Science-Fiction, Thriller]",Under the Dome is the story of a small town th...,**Under the Dome** follows the residents of Ch...
1,Person of Interest,"[Action, Crime, Science-Fiction]",You are being watched. The government has a se...,"In ""Person of Interest,"" a reclusive billionai..."
2,Bitten,"[Drama, Horror, Romance]",Based on the critically acclaimed series of no...,"In *Bitten*, follow Elena Michaels, the world’..."
3,Arrow,"[Drama, Action, Science-Fiction]","After a violent shipwreck, billionaire playboy...","In ""Arrow,"" billionaire playboy Oliver Queen r..."
4,True Detective,"[Drama, Crime, Thriller]",Touch darkness and darkness touches you back. ...,"In **True Detective**, each season unveils a g..."


In [16]:
# Write the side-by-side comparison columns to CSV so the original and AI
# summaries can be reviewed by hand.
review_columns = ["id", "name", "genres", "original_summary", "ai_summary"]
review_frame = merged[review_columns]
review_frame.to_csv("data/shows_for_review.csv", index=False, encoding="utf-8")