In [3]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv

# Load environment
load_dotenv()
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")

if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise RuntimeError("Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY")

# Supabase REST API config
REST_URL = SUPABASE_URL.rstrip("/") + "/rest/v1/podcast_profiles"
HEADERS = {
    "apikey": SUPABASE_SERVICE_ROLE_KEY,
    "Authorization": f"Bearer {SUPABASE_SERVICE_ROLE_KEY}",
}

# Fetch all podcast_profiles from Supabase
rows = []
page_size = 1000
start = 0

print("Fetching podcast_profiles from Supabase...")
while True:
    resp = requests.get(
        REST_URL,
        headers=HEADERS,
        params={"limit": page_size, "offset": start},
        timeout=60,
    )
    if resp.status_code not in (200, 206):
        raise RuntimeError(f"Failed to fetch podcast_profiles: HTTP {resp.status_code} - {resp.text}")
    
    batch = resp.json()
    if not batch:
        break
    
    print(f"Fetched {len(batch)} rows (total: {len(rows) + len(batch)})")
    rows.extend(batch)
    
    if len(batch) < page_size:
        break
    
    start += page_size

print(f"\nTotal rows fetched: {len(rows)}")

# Convert to pandas DataFrame
df = pd.DataFrame(rows)
print(f"\nDataFrame shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
df.head()



Fetching podcast_profiles from Supabase...
Fetched 1000 rows (total: 1000)
Fetched 1000 rows (total: 2000)
Fetched 400 rows (total: 2400)

Total rows fetched: 2400

DataFrame shape: (2400, 12)

Columns: ['id', 'rss_feed_url', 'podcast_name', 'supplier_id', 'supplier_name', 'network_id', 'network_name', 'genre', 'status_code', 'created_at', 'updated_at', 'RSS_request_status_code']

First few rows:


Unnamed: 0,id,rss_feed_url,podcast_name,supplier_id,supplier_name,network_id,network_name,genre,status_code,created_at,updated_at,RSS_request_status_code
0,878a543f-5496-45e4-ae18-08da752e0cf7,https://feed.pod.space/45minuteravalversjovide...,45 minuter AV,b3222a28-d195-433c-d8f1-08da752edf7e,Podspace,,,Mat/Dryck,200,2025-10-28T22:22:29.23343+00:00,2025-10-29T01:19:38.344178+00:00,200
1,e513be13-9f80-4c3c-4ecd-08dc8570d9ab,https://feed.pod.space/mianochsarasskrivarlya,Mian och Saras skrivarlya,b3222a28-d195-433c-d8f1-08da752edf7e,Podspace,,,Utbildning,200,2025-10-28T22:40:43.644234+00:00,2025-10-29T01:19:41.390469+00:00,200
2,5762bee5-4e79-4bfa-8837-08dc89033779,https://feeds.acast.com/public/shows/665daacf4...,2 Under 2,4547b65d-ccf7-469c-d8ec-08da752edf7e,Acast,,,Barn/Familj,200,2025-10-28T22:22:24.275026+00:00,2025-10-29T01:19:45.515727+00:00,200
3,b97ebc42-948e-4988-ab11-08da752e0cf7,https://feeds.acast.com/public/shows/c13451b2-...,50 nyanser av pengar,4547b65d-ccf7-469c-d8ec-08da752edf7e,Acast,,,Privatekonomi,200,2025-10-28T22:22:30.092405+00:00,2025-10-29T01:19:48.933804+00:00,200
4,8e68958e-186a-43cb-ad21-08da752e0cf7,https://api.sr.se/api/rss/pod/33855,Mia Blomgren,0d60154d-4998-47b5-d8eb-08da752edf7e,Sveriges Radio,30efa239-0cd5-48b6-378b-08da752edfd0,Sveriges Radio,Nyheter/Politik,200,2025-10-28T22:40:43.265919+00:00,2025-10-29T01:19:58.792341+00:00,200


In [4]:
from urllib.parse import urlparse

# Extract base URLs from RSS feed URLs
def extract_base_url(rss_url):
    """Extract base URL (scheme + netloc) from RSS feed URL"""
    if pd.isna(rss_url) or not rss_url:
        return None
    try:
        parsed = urlparse(rss_url)
        return f"{parsed.scheme}://{parsed.netloc}"
    except:
        return None

df['base_url'] = df['rss_feed_url'].apply(extract_base_url)

# Show unique base URLs and their counts
print("Unique Base URLs and Count:")
print("=" * 80)
base_url_counts = df['base_url'].value_counts()
print(base_url_counts)

print(f"\n\nTotal unique base URLs: {len(base_url_counts)}")
print(f"Null/Missing RSS URLs: {df['rss_feed_url'].isna().sum()}")

# Show the distribution
print("\n\nBase URL Distribution:")
print("=" * 80)
for base_url, count in base_url_counts.head(20).items():
    percentage = (count / len(df)) * 100
    print(f"{base_url:50} {count:5} ({percentage:5.2f}%)")



Unique Base URLs and Count:
base_url
https://feeds.acast.com                   1136
https://feed.pod.space                     521
https://api.sr.se                          272
https://rss.podplaystudio.com              150
https://rss.acast.com                      109
https://podcast.stream.schibsted.media      94
https://access.acast.com                    70
https://feed.khz.se                         20
https://cdn.radioplay.se                    12
http://www.ilikeradio.se                     8
https://www.ilikeradio.se                    8
Name: count, dtype: int64


Total unique base URLs: 11
Null/Missing RSS URLs: 0


Base URL Distribution:
https://feeds.acast.com                             1136 (47.33%)
https://feed.pod.space                               521 (21.71%)
https://api.sr.se                                    272 (11.33%)
https://rss.podplaystudio.com                        150 ( 6.25%)
https://rss.acast.com                                109 ( 4.54%)
https://pod