<a href="https://colab.research.google.com/github/Kensheretohelp/were-here-to-help-podcast-analytics/blob/main/TRIALMissingEpisodes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install playwright
!playwright install chromium

╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Please install them with the following command:      ║
║                                                      ║
║     playwright install-deps                          ║
║                                                      ║
║ Alternatively, use apt:                              ║
║     apt-get install libatk1.0-0\                     ║
║         libatk-bridge2.0-0\                          ║
║         libatspi2.0-0\                               ║
║         libxcomposite1                               ║
║                                                      ║
║ <3 Playwright Team                                   ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:269:9)
    at async Registry._validateHostRequirements (/usr/local

In [4]:
import os
import re
import json
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from openai import OpenAI

# ---------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------

# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# saved inside the notebook; the literal is only a placeholder fallback.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or "xxxxxxxxxxxx")

# Headers sent with every HTTP request issued by safe_get().
HEADERS = {
    "User-Agent": "Mozilla/5.0 (WH2H Missing Episodes Scraper)",
    "Accept": "text/html",
}

# Seconds to pause after each episode in the main scraping loop.
REQUEST_SLEEP = 2


# ---------------------------------------------------------
# SAFE HTTP GET
# ---------------------------------------------------------
def safe_get(url, retries=5):
    """Fetch *url* and return the response body as text, or None on failure.

    The request is attempted up to *retries* times with a growing pause
    (2s, 3s, 4s, ...) between attempts. Any non-200 status counts as a failed
    attempt. Failures are printed rather than raised so one bad URL does not
    abort a long scraping run.

    Args:
        url: Absolute URL to download. Anything that is not a non-empty
            string (for example a NaN coming from an empty CSV cell) is
            rejected immediately instead of being retried.
        retries: Maximum number of attempts.

    Returns:
        The response text on HTTP 200, otherwise None.
    """
    if not isinstance(url, str) or not url.strip():
        print(f"  !! Invalid URL: {url!r}")
        return None

    for attempt in range(retries):
        try:
            r = requests.get(url, headers=HEADERS, timeout=25)
        except requests.RequestException as e:
            print(f"  !! Request error: {e}")
        else:
            if r.status_code == 200:
                return r.text
            print(f"  !! HTTP {r.status_code} at {url}")

        # Back off before the next attempt only; sleeping after the final
        # failure would just delay the caller.
        if attempt < retries - 1:
            time.sleep(2 + attempt)

    print(f"  !! Failed after retries: {url}")
    return None


# ---------------------------------------------------------
# TRANSCRIPT EXTRACTOR (JSON → HTML FALLBACK)
# ---------------------------------------------------------
def _collapse_whitespace(text):
    """Collapse every whitespace run in *text* to one space and trim the ends."""
    return re.sub(r"\s+", " ", text).strip()


def _find_transcription_url(html):
    """Return the transcript URL stored in window.__INITIAL_STATE__, or None."""
    m = re.search(r"window\.__INITIAL_STATE__\s*=\s*(\{.*?\})\s*;<", html, re.DOTALL)
    if not m:
        return None

    try:
        state = json.loads(m.group(1))
    except ValueError:
        # The non-greedy capture can cut the object short; treat as "no state".
        return None

    # Primary path first, then the secondary one.
    candidate_paths = (
        ("pageData", "episode", "transcriptionUrl"),
        ("episode", "transcriptionUrl"),
    )
    for path in candidate_paths:
        node = state
        for key in path:
            node = node.get(key) if isinstance(node, dict) else None
        # Only a non-empty string is usable as a URL further down.
        if isinstance(node, str) and node:
            return node
    return None


def _transcript_from_json(raw):
    """Join the segment texts of a transcript JSON document; None if unusable."""
    try:
        payload = json.loads(raw)
    except ValueError:
        return None
    if not isinstance(payload, dict):
        return None

    nested = payload.get("transcription")
    segments = (
        nested.get("segments") if isinstance(nested, dict) else None
    ) or payload.get("segments")
    if not isinstance(segments, list):
        return None

    # Skip malformed segments instead of discarding the whole transcript.
    parts = [
        seg["text"].strip()
        for seg in segments
        if isinstance(seg, dict) and isinstance(seg.get("text"), str)
    ]
    return _collapse_whitespace(" ".join(parts))


def extract_transcript(html):
    """Extract the episode transcript text from a transcript page.

    Two strategies are tried in order:

    1. Read ``transcriptionUrl`` from the ``window.__INITIAL_STATE__`` JSON
       embedded in the page, download it with ``safe_get`` and join the
       ``text`` of every segment.
    2. Fall back to joining the text of all ``span.pod_text`` elements in
       the HTML.

    Args:
        html: Page HTML as a string; None or an empty string yields None.

    Returns:
        The whitespace-normalised transcript when it is longer than 50
        characters, otherwise None.
    """
    if not html:
        return None

    # ---- JSON transcript inside window.__INITIAL_STATE__
    transcription_url = _find_transcription_url(html)
    if transcription_url:
        # Site-relative paths are resolved against the podscripts.co host.
        if transcription_url.startswith("/"):
            transcription_url = "https://podscripts.co" + transcription_url

        raw = safe_get(transcription_url)
        if raw:
            full = _transcript_from_json(raw)
            if full and len(full) > 50:
                return full

    # ---- HTML fallback
    soup = BeautifulSoup(html, "html.parser")
    spans = soup.select("span.pod_text")
    if spans:
        txt = _collapse_whitespace(
            " ".join(s.get_text(" ", strip=True) for s in spans)
        )
        if len(txt) > 50:
            return txt

    return None


# ---------------------------------------------------------
# EXTRACT CALL SEGMENT (same as master script)
# ---------------------------------------------------------
def extract_call_segment(full_text, caller_obj, window=1200):
    """Return the slice of the transcript surrounding a caller's first mention.

    The anchor position is chosen in this order:

    1. the caller's name as a whole word (case-insensitive),
    2. the caller's name anywhere, even inside a longer word,
    3. the first occurrence of the phrase "calling from".

    Matching runs on the original text rather than a lower-cased copy, so the
    anchor index always refers to ``full_text`` itself.

    Args:
        full_text: Full transcript text.
        caller_obj: Mapping with a ``caller_name`` entry. Anything that is
            not a dict, or whose name is not a non-blank string, yields None.
        window: Number of characters kept on each side of the anchor.

    Returns:
        The whitespace-normalised segment, or None when no anchor is found.
    """
    if not full_text or not isinstance(caller_obj, dict):
        return None

    name = caller_obj.get("caller_name")
    if not isinstance(name, str) or not name.strip():
        return None

    escaped = re.escape(name.strip())
    match = (
        # Whole-word match first, so "Al" does not anchor on "also".
        re.search(rf"(?<!\w){escaped}(?!\w)", full_text, re.IGNORECASE)
        or re.search(escaped, full_text, re.IGNORECASE)
        or re.search(r"calling from", full_text, re.IGNORECASE)
    )
    if not match:
        return None

    i = match.start()
    start = max(0, i - window)
    end = min(len(full_text), i + window)

    seg = full_text[start:end].strip()
    return re.sub(r"\s+", " ", seg)


# ---------------------------------------------------------
# GPT CALLER EXTRACTION (JSON-COMPLIANT)
# ---------------------------------------------------------
def extract_callers_from_transcript(ep_num, transcript):
    """Ask the chat model to list the phone callers found in a transcript.

    The whole transcript is embedded in one prompt and the model is asked for
    a JSON object of the form ``{"callers": [...]}``. The request is attempted
    up to three times; errors are printed and never raised.

    Args:
        ep_num: Episode number, only used to label the transcript in the prompt.
        transcript: Full transcript text; None or an empty string yields ``[]``
            without calling the API.

    Returns:
        A list of caller dicts (keys requested in the prompt: ``caller_slot``,
        ``caller_name``, ``caller_location``, ``reason_short``). Entries that
        are not dicts are dropped so callers can safely use ``.get`` on every
        item. Returns ``[]`` when nothing usable comes back.
    """
    if not transcript:
        return []

    prompt = f"""
You are analyzing a FULL transcript from the podcast "We're Here to Help".

The output must be valid JSON.

Your job: extract ONLY real phone callers.

Ignore:
- hosts (Jake, Gareth)
- guests
- characters/bits
- cold opens

Return ONLY this JSON structure:

{{
  "callers": [
    {{
      "caller_slot": 1,
      "caller_name": "FirstName",
      "caller_location": "City, State/Country or null",
      "reason_short": "short description"
    }}
  ]
}}

Rules:
- Use first names only.
- caller_slot must increment (1,2,3…)
- Skip unclear/non-callers.
- If no callers: return {{"callers": []}}

Transcript for episode {ep_num}:
\"\"\"{transcript}\"\"\"
"""

    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            resp = client.chat.completions.create(
                model="gpt-4o-mini",
                response_format={"type": "json_object"},
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
            )
            data = json.loads(resp.choices[0].message.content)
        except Exception as e:
            # Deliberately broad: any API, network or parsing failure is
            # retried so a single episode cannot abort the whole run.
            print(f"  !! GPT error (attempt {attempt+1}): {e}")
            # No point in waiting once there is no further attempt.
            if attempt < max_attempts - 1:
                time.sleep(2 + attempt)
            continue

        callers = data.get("callers", []) if isinstance(data, dict) else []
        if not isinstance(callers, list):
            return []
        # Keep only well-formed entries; stray strings or numbers would break
        # the `.get(...)` lookups done on each caller downstream.
        return [c for c in callers if isinstance(c, dict)]

    return []


# ---------------------------------------------------------
# MAIN — SCRAPE MISSING EPISODES
# ---------------------------------------------------------
def scrape_missing_episodes(
    rss_csv="episodes_master_clean.csv",
    links_csv="episode_links_manual.csv",
    output_csv="missing_episodes_scraped.csv"
):
    """Scrape transcripts for the linked episodes and write one row per caller.

    Both CSVs are reduced to rows with a numeric ``episode_number`` and
    inner-joined on it. For each joined episode the page at ``correct_url``
    is downloaded, its transcript extracted, and the callers identified.
    An episode without callers still produces one row whose caller fields
    are all None.

    Args:
        rss_csv: CSV with episode metadata; ``episode_number`` and ``title``
            are read from the merged result.
        links_csv: CSV with ``episode_number`` and ``correct_url``.
        output_csv: Path the collected rows are written to (no index column).

    Returns:
        The DataFrame that was written to *output_csv*.
    """

    def load_with_int_episode(path):
        # Rows whose episode_number is not numeric are dropped before the cast.
        frame = pd.read_csv(path)
        frame["episode_number"] = pd.to_numeric(frame["episode_number"], errors="coerce")
        frame = frame.dropna(subset=["episode_number"]).copy()
        frame["episode_number"] = frame["episode_number"].astype(int)
        return frame

    caller_fields = ("caller_slot", "caller_name", "caller_location", "reason_short")

    print("\nLoading episode metadata…")
    rss = load_with_int_episode(rss_csv)
    links = load_with_int_episode(links_csv)

    print("Merging RSS metadata with manual Podscripts links…")
    episodes = rss.merge(links, on="episode_number", how="inner")
    print(f"{len(episodes)} episodes ready to scrape.\n")

    records = []

    for _, episode in episodes.iterrows():
        ep = episode["episode_number"]
        title = episode["title"]
        url = episode["correct_url"]

        print(f"\n=== Episode {ep}: {title} ===")
        print("URL:", url)

        transcript = extract_transcript(safe_get(url))

        if not transcript:
            print("  !! No transcript extracted.")
        else:
            print(f"  Transcript length: {len(transcript)} chars")

        callers = extract_callers_from_transcript(ep, transcript)
        print(f"  → Callers found: {len(callers)}")

        # Columns shared by every row of this episode; kept first so the
        # output column order stays the same.
        shared = {
            "episode_num": ep,
            "episode_title_raw": title,
            "episode_url": url,
        }

        if callers:
            for caller in callers:
                segment = extract_call_segment(transcript, caller)
                record = dict(shared)
                for field in caller_fields:
                    record[field] = caller.get(field)
                record["call_segment"] = segment
                records.append(record)
        else:
            # Placeholder row so the episode still appears in the output.
            records.append({
                **shared,
                **dict.fromkeys(caller_fields),
                "call_segment": None,
            })

        time.sleep(REQUEST_SLEEP)

    df = pd.DataFrame(records)
    df.to_csv(output_csv, index=False)

    print("\nSaved →", output_csv)
    return df


# ---------------------------------------------------------
# RUN
# ---------------------------------------------------------
# Run the scrape with the default CSV paths.
# NOTE(review): as the last expression of the cell, the returned DataFrame is
# presumably what the notebook renders below — confirm before wrapping this
# call in a `__main__` guard or assigning it to a variable.
scrape_missing_episodes()



Loading episode metadata…
Merging RSS metadata with manual Podscripts links…
50 episodes ready to scrape.


=== Episode 215: 215: BEST ADVICE Vol 2: Make It Your Own (with Cat Reitman) ===
URL: https://podscripts.co/podcasts/were-here-to-help/215-best-advice-vol-2-make-it-your-own-with-cat-reitman?scroll_to_words=215&search_type=basic
  Transcript length: 48776 chars
  → Callers found: 3

=== Episode 214: 214: Too Wholesome for Doggy & Ethiopian Jazz ===
URL: https://podscripts.co/podcasts/were-here-to-help/214-too-wholesome-for-doggy-ethiopian-jazz?scroll_to_words=214&search_type=basic
  Transcript length: 62136 chars
  → Callers found: 3

=== Episode 213: 213: Trash Hole Shark & Wrestling Gators (with Andy Roddick) ===
URL: https://podscripts.co/podcasts/were-here-to-help/213-trash-hole-shark-wrestling-gators-with-andy-roddick?scroll_to_words=213&search_type=basic
  Transcript length: 59574 chars
  → Callers found: 2

=== Episode 212: 212: Sh*t Show & The Fancy Dorris Football Leagu

Unnamed: 0,episode_num,episode_title_raw,episode_url,caller_slot,caller_name,caller_location,reason_short,call_segment
0,215,215: BEST ADVICE Vol 2: Make It Your Own (with...,https://podscripts.co/podcasts/were-here-to-he...,1.0,Whitney,,Struggled with the consistency of oatmeal and ...,"Kings, Steve and Eric. The best moment from th..."
1,215,215: BEST ADVICE Vol 2: Make It Your Own (with...,https://podscripts.co/podcasts/were-here-to-he...,2.0,Katie,"Raleigh, North Carolina",Used advice from the show to make flossing fun...,ause it was so precarious. Make it your own. T...
2,215,215: BEST ADVICE Vol 2: Make It Your Own (with...,https://podscripts.co/podcasts/were-here-to-he...,3.0,Sophie,,Had two job offers and received advice to ask ...,morning. And I remember we went to a bar close...
3,214,214: Too Wholesome for Doggy & Ethiopian Jazz,https://podscripts.co/podcasts/were-here-to-he...,1.0,Jordan,Florida,Friend wants a Disney tattoo over a tramp stamp,"ot to tell her. And then the caller was like, ..."
4,214,214: Too Wholesome for Doggy & Ethiopian Jazz,https://podscripts.co/podcasts/were-here-to-he...,2.0,Ashley,San Diego,Trying to avoid a timeshare sales presentation,hem know that we sent you after checkout. Lisa...
...,...,...,...,...,...,...,...,...
135,168,"168: I Feel Barfy & An I Love You, Man Situation",https://podscripts.co/podcasts/were-here-to-he...,4.0,Stone,"Stanton, Virginia",Discussing perceptions of having a foot fetish,d free way to support the podcast. Take you tw...
136,167,167: RE-RELEASE: Fan Faves: Chronicles of Conor,https://podscripts.co/podcasts/were-here-to-he...,1.0,Connor,"Florida, USA",Living with senior roommates and dealing with ...,me markets for a limited time only. Calling al...
137,167,167: RE-RELEASE: Fan Faves: Chronicles of Conor,https://podscripts.co/podcasts/were-here-to-he...,2.0,Carly,"New York, USA",Father refuses to use air conditioning in hot ...,"n you're hot You could say like hey, man. Do y..."
138,166,"166: No Shoes, No Shirt, Teeth In & Unwrapping...",https://podscripts.co/podcasts/were-here-to-he...,1.0,Tammy,California,Employee stopped wearing dentures,"ah, blah, blah, blah. Oh, cool, okay. Well, yo..."
