In [1]:
import pandas as pd
import os
# Raw viewership table; later cells read its "ProgramName" and "Genre" columns.
data = pd.read_csv(filepath_or_buffer="./viewership_tv_aug.csv")

In [3]:
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List
import os 

class ProgramMetadata(BaseModel):
    # Schema the LLM is asked to fill in for a single TV show.
    # Fields declared with `...` are required; the list-valued fields fall back
    # to an empty list when omitted.

    # --- identity -----------------------------------------------------------
    program_name: str = Field(
        ...,
        description="Exact TV show title",
    )
    base_genre: str = Field(
        ...,
        description="Main genre like Comedy, Drama, Action, Romance",
    )

    # --- free-form descriptors ----------------------------------------------
    subgenres: List[str] = Field(
        description="More specific subgenres for the show",
        default_factory=list,
    )
    themes: List[str] = Field(
        description="High-level themes that the show explores",
        default_factory=list,
    )
    tone: List[str] = Field(
        description="Descriptors of the tone, e.g. gritty, light-hearted, satirical",
        default_factory=list,
    )
    pacing: str = Field(
        ...,
        description="Short description of pacing such as 'slow-burn serialized' or 'fast-paced episodic'",
    )
    target_audience: str = Field(
        ...,
        description="Intended audience such as 'family', 'young adults', 'mature'",
    )

    # --- content ratings, each constrained to the inclusive range 1..5 -------
    violence_level: int = Field(
        ...,
        description="Violence intensity from 1 (none) to 5 (very high)",
        ge=1,
        le=5,
    )
    sexual_content_level: int = Field(
        ...,
        description="Sexual content from 1 (none) to 5 (very explicit)",
        ge=1,
        le=5,
    )
    language_intensity: int = Field(
        ...,
        description="Strength of language from 1 (clean) to 5 (very strong language)",
        ge=1,
        le=5,
    )

    # --- scheduling ---------------------------------------------------------
    suitable_slots: List[str] = Field(
        description="Recommended weekly time ranges like 'Friday 21:00-22:00'",
        default_factory=list,
    )


# Base chat model (swap the model name for e.g. "gpt-4.1-mini" if desired).
llm = ChatOpenAI(model="gpt-4.1", temperature=0.2)

# Wrap the model so its replies are returned as ProgramMetadata instances
# rather than free-form text.
structured_llm = llm.with_structured_output(ProgramMetadata)


In [4]:
from langchain_core.prompts import ChatPromptTemplate

# System message: fixes the model's role and the schema it has to populate.
SYSTEM_MESSAGE = (
    "You are a TV content metadata expert. "
    "Given a TV show title and its base genre, "
    "you must fill the ProgramMetadata schema accurately."
)

# User message template; {title} and {genre} are substituted on each call.
USER_MESSAGE = (
    "Title: {title}\n"
    "Base genre: {genre}\n\n"
    "Analyze the show and fill all fields in the schema."
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_MESSAGE),
        ("user", USER_MESSAGE),
    ]
)

# Pipe the rendered prompt into the structured-output model.
chain = prompt | structured_llm

In [5]:
def enrich_program(title: str, genre: str) -> ProgramMetadata:
    """Generate structured metadata for a single show.

    Args:
        title: Show title, substituted into the prompt's ``{title}`` slot.
        genre: Base genre, substituted into the prompt's ``{genre}`` slot.

    Returns:
        Whatever ``chain.invoke`` produces for these inputs (the chain is built
        on a model configured for ``ProgramMetadata`` structured output).
    """
    prompt_inputs = {"title": title, "genre": genre}
    return chain.invoke(prompt_inputs)


In [6]:
# Hand-picked shows used as the test set, as (title, base genre) pairs.
programs = [
    # Comedy
    ("Seinfeld", "Comedy"),
    ("Arrested Development", "Comedy"),
    ("Brooklyn Nine-Nine", "Comedy"),
    # Drama
    ("The Crown", "Drama"),
    ("The Wire", "Drama"),
    ("Mad Men", "Drama"),
    # Action
    ("The Mandalorian", "Action"),
    ("The Boys", "Action"),
    ("Prison Break", "Action"),
    # Romance
    ("Outlander", "Romance"),
]

# One LLM call per show, in list order. Errors are not caught here, so a
# failing call stops the cell with the results gathered so far still in the list.
test_metadata_objects: list[ProgramMetadata] = []
for title, genre in programs:
    test_metadata_objects.append(enrich_program(title, genre))

In [None]:
import pandas as pd

# Convert each metadata object to a plain dict, then build one row per show.
test_records = [program_meta.dict() for program_meta in test_metadata_objects]
test_data_meta_df = pd.DataFrame(test_records)
test_data_meta_df

In [16]:
# Enrich every distinct (ProgramName, Genre) pair found in the viewership data.
train_metadata_objects: list[ProgramMetadata] = []

# (title, genre, error message) for every program whose LLM call failed.
# Kept so failures can be inspected or retried instead of existing only in the
# printed log while silently going missing from the exported training set.
failed_programs: list[tuple[str, str, str]] = []

unique_programs = data[["ProgramName", "Genre"]].drop_duplicates()

# Plain tuples are enough here: the row index is never used.
for title, genre in unique_programs.itertuples(index=False, name=None):
    print(f"Generating metadata for: {title} ({genre}) ...")

    try:
        meta = enrich_program(title, genre)   # Structured LLM call
    except Exception as e:
        # Best-effort batch: one failing title must not abort the whole run,
        # but the failure is recorded rather than just printed.
        print(f"Error for {title}: {e}")
        failed_programs.append((title, genre, str(e)))
    else:
        train_metadata_objects.append(meta)

if failed_programs:
    print(f"{len(failed_programs)} program(s) failed; see `failed_programs`.")

Generating metadata for: Stranger Things (Drama) ...
Generating metadata for: Vikings (Action) ...
Generating metadata for: Squid Game (Action) ...
Generating metadata for: House (Romance) ...
Generating metadata for: The Office (Comedy) ...
Generating metadata for: Bridgerton (Romance) ...
Generating metadata for: Good Morning America (Morning Show) ...
Generating metadata for: Breaking Bad (Drama) ...
Generating metadata for: The Sopranos (Drama) ...
Generating metadata for: Suits (Drama) ...
Generating metadata for: The News (News) ...
Generating metadata for: The Walking Dead (Drama) ...
Generating metadata for: Football Match (Sports) ...
Generating metadata for: The Big Bang Theory (Comedy) ...
Generating metadata for: How I Met Your Mother (Comedy) ...
Generating metadata for: Grey's Anatomy (Romance) ...
Generating metadata for: Peaky Blinders (Action) ...
Generating metadata for: Friends (Comedy) ...
Generating metadata for: Game Of Thrones (Action) ...
Generating metadata for

In [20]:
# Flatten the collected metadata objects into one row per program.
train_data_meta_df = pd.DataFrame([m.dict() for m in train_metadata_objects])

# `to_csv` emits CSV text, so the file is named ".csv". The earlier ".xlsx"
# name produced a file that is not a real Excel workbook.
# NOTE(review): anything that still reads "Train Programs.xlsx" must be updated.
train_data_meta_df.to_csv("Train Programs.csv", index=False)

In [21]:
# `to_csv` emits CSV text, so the file is named ".csv". The earlier ".xlsx"
# name produced a file that is not a real Excel workbook.
# NOTE(review): anything that still reads "Test Programs.xlsx" must be updated.
test_data_meta_df.to_csv("Test Programs.csv", index=False)