In [1]:
import httpx
import json
import time
import sys
import argparse     
from pathlib import Path

In [2]:
import httpx, json, time, sys
from pathlib import Path

# Destination file for the fetched show records (written as JSON Lines).
OUT = Path("data/raw_tvmaze.jsonl")
# Create the containing directory up front; a no-op when it already exists.
OUT.parent.mkdir(exist_ok=True, parents=True)

In [3]:
def fetch_pages(max_pages=5, sleep=0.5):
    """Download TVMaze show-index pages and write them to ``OUT`` as JSON Lines.

    Requests ``https://api.tvmaze.com/shows?page=N`` for ``N`` in
    ``range(max_pages)`` and writes one JSON object per show, keeping a fixed
    subset of fields. ``OUT`` is opened in ``"w"`` mode, so an existing file
    is overwritten on every call.

    Args:
        max_pages: Upper bound on the number of pages requested, starting at
            page 0.
        sleep: Seconds to pause after each fetched page.

    Raises:
        httpx.HTTPStatusError: On any error response other than a 404.
    """
    # Context-manage the client so its connection pool is released even when a
    # request or the file write raises.
    with httpx.Client(timeout=30) as client, OUT.open("w", encoding="utf-8") as f:
        for page in range(max_pages):
            r = client.get(f"https://api.tvmaze.com/shows?page={page}")
            # Per the TVMaze API docs, a 404 on the show index marks the end
            # of the list, so stop cleanly instead of raising.
            if r.status_code == 404:
                break
            r.raise_for_status()
            for show in r.json():
                rec = {
                    "id": show.get("id"),
                    "name": show.get("name"),
                    "genres": show.get("genres", []),
                    "summary_html": show.get("summary"),
                    "language": show.get("language"),
                    "status": show.get("status"),
                    "officialSite": show.get("officialSite"),
                    "premiered": show.get("premiered"),
                    # These nested objects can be null in the payload; `or {}`
                    # lets the inner .get() fall through to None.
                    "rating": (show.get("rating") or {}).get("average"),
                    "network": (show.get("network") or {}).get("name"),
                    "webChannel": (show.get("webChannel") or {}).get("name"),
                }
                # ensure_ascii=False keeps non-ASCII text readable in the file.
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
            time.sleep(sleep)

if __name__ == "__main__":
    # Entry point: read the fetch settings from the command line and run.
    parser = argparse.ArgumentParser(description="Fetch TVMaze shows")
    parser.add_argument(
        "-p",
        "--pages",
        type=int,
        default=2,
        help="Number of pages to fetch",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=0.5,
        help="Delay between pages (seconds)",
    )
    # parse_known_args() returns unrecognised arguments separately instead of
    # rejecting them; they are deliberately ignored here.
    args, _unknown = parser.parse_known_args()
    fetch_pages(sleep=args.sleep, max_pages=args.pages)

In [None]:
import pandas as pd
import json
from bs4 import BeautifulSoup

# Read the JSONL dump into a DataFrame: one JSON object per non-blank line.
# Blank lines (e.g. a trailing newline left by manual editing) are skipped so
# json.loads() is never handed an empty string.
with open("data/raw_tvmaze.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)

# Strip the HTML markup from each summary; `s or ""` covers shows whose
# summary is missing (None).
df["summary_text"] = df["summary_html"].apply(
    lambda s: BeautifulSoup(s or "", "html.parser").get_text().strip()
)

# Only the last expression of a cell is rendered, so the former mid-cell
# column preview was evaluated and discarded; it has been removed.
df.head(5)

Unnamed: 0,id,name,genres,summary_html,language,status,officialSite,premiered,rating,network,webChannel,summary_text
0,1,Under the Dome,"[Drama, Science-Fiction, Thriller]",<p><b>Under the Dome</b> is the story of a sma...,English,Ended,http://www.cbs.com/shows/under-the-dome/,2013-06-24,6.5,CBS,,Under the Dome is the story of a small town th...
1,2,Person of Interest,"[Action, Crime, Science-Fiction]",<p>You are being watched. The government has a...,English,Ended,http://www.cbs.com/shows/person_of_interest/,2011-09-22,8.8,CBS,,You are being watched. The government has a se...
2,3,Bitten,"[Drama, Horror, Romance]",<p>Based on the critically acclaimed series of...,English,Ended,http://bitten.space.ca/,2014-01-11,7.4,CTV Sci-Fi Channel,,Based on the critically acclaimed series of no...
3,4,Arrow,"[Drama, Action, Science-Fiction]","<p>After a violent shipwreck, billionaire play...",English,Ended,http://www.cwtv.com/shows/arrow,2012-10-10,7.4,The CW,,"After a violent shipwreck, billionaire playboy..."
4,5,True Detective,"[Drama, Crime, Thriller]",<p>Touch darkness and darkness touches you bac...,English,Running,https://www.max.com/shows/true-detective/9a4a3...,2014-01-12,8.1,HBO,,Touch darkness and darkness touches you back. ...


In [None]:

# Number of rows in the frame (first element of the shape tuple).
df.shape[0]

485

In [None]:
# Shows per network: value_counts() orders by descending count, so the first
# ten entries are the ten most frequent networks.
network_counts = df["network"].value_counts()
network_counts.iloc[:10]

network
NBC                         43
ABC                         39
CBS                         36
FOX                         33
HBO                         22
Syfy                        19
Paramount+ with Showtime    17
BBC One                     16
The CW                      16
TNT                         14
Name: count, dtype: int64