In [11]:
import requests
import pandas as pd
import time
import os
from urllib.request import urlretrieve

# --- CONFIG ---
# The API key is a credential: prefer supplying it through the TMDB_API_KEY
# environment variable so it does not have to live in the notebook.  The
# literal below is kept only as a fallback so the notebook still runs as before.
# NOTE(review): a key that has been committed/shared should be rotated.
API_KEY = os.environ.get('TMDB_API_KEY', 'd8ae884deab9fe57f68e1f96fbd8aad3')
BASE_URL = 'https://api.themoviedb.org/3'  # Root of the TMDB v3 REST API
POSTER_BASE = 'https://image.tmdb.org/t/p/w500'  # Prefix prepended to each poster_path
PAGES = 20  # Number of pages to fetch
SAVE_DIR = 'tmdb_data'  # Directory to save all files

# Create the output tree if it doesn't exist.  os.makedirs creates intermediate
# directories, so making the posters folder also creates SAVE_DIR itself.
os.makedirs(os.path.join(SAVE_DIR, 'posters'), exist_ok=True)

def get_genre_mapping(timeout=10):
    """Fetch TV genre mapping from TMDB.

    Args:
        timeout: Seconds to wait for the API before giving up.  Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        dict mapping each genre ``id`` to its ``name``, built from the
        ``genres`` list of the ``/genre/tv/list`` response.

    Raises:
        requests.RequestException: on connection problems, timeouts, or an
            HTTP error status (via ``raise_for_status``).
    """
    url = f"{BASE_URL}/genre/tv/list"
    params = {'api_key': API_KEY}
    res = requests.get(url, params=params, timeout=timeout)
    res.raise_for_status()
    return {g['id']: g['name'] for g in res.json()['genres']}

def download_image(url, save_path):
    """Download an image from ``url`` and save it at ``save_path``.

    The data is written to ``save_path + '.part'`` first and only moved to the
    final name once the transfer has completed, so an interrupted download
    neither leaves a truncated image behind nor destroys a file that was
    already stored at ``save_path``.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path.

    Returns:
        True if the image was downloaded and moved into place, False if
        anything went wrong (the error is printed rather than raised).
    """
    partial_path = f"{save_path}.part"
    try:
        urlretrieve(url, partial_path)
        # Atomic on the same filesystem; overwrites an existing destination.
        os.replace(partial_path, save_path)
        return True
    except Exception as e:
        print(f"Failed to download {url}: {str(e)}")
        # Best-effort cleanup of whatever was written before the failure.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return False

def get_tv_shows(pages=1, timeout=10):
    """Fetch TV shows from TMDB's ``/discover/tv`` endpoint and download posters.

    Pages are requested in order starting at 1, sorted by descending
    popularity.  A page that fails (network error, bad status, unexpected
    payload) is reported and skipped; the remaining pages are still tried.

    Args:
        pages: Maximum number of result pages to request.
        timeout: Seconds to wait for each API response before treating the
            page as failed.

    Returns:
        A list with one dict per show containing ``id``, ``title``,
        ``overview``, ``popularity``, ``vote_average``, ``first_air_date``,
        ``poster_file`` (path relative to ``SAVE_DIR``, or None when the show
        has no poster or its download failed) and ``genres`` (list of genre ids).
    """
    shows = []
    url = f"{BASE_URL}/discover/tv"

    for page in range(1, pages + 1):
        print(f"Fetching page {page}/{pages}")
        params = {
            'api_key': API_KEY,
            'language': 'en-US',
            'sort_by': 'popularity.desc',
            'page': page,
        }

        last_page = None
        try:
            res = requests.get(url, params=params, timeout=timeout)
            res.raise_for_status()
            data = res.json()
            last_page = data.get('total_pages')

            for show in data['results']:
                poster_path = show.get('poster_path')
                poster_file = None
                if poster_path:
                    candidate = os.path.join('posters', f"{show['id']}.jpg")
                    # Only record the path when the file really exists on disk.
                    if download_image(f"{POSTER_BASE}{poster_path}",
                                      os.path.join(SAVE_DIR, candidate)):
                        poster_file = candidate

                shows.append({
                    'id': show['id'],
                    'title': show['name'],
                    'overview': show.get('overview'),
                    'popularity': show.get('popularity'),
                    'vote_average': show.get('vote_average'),
                    'first_air_date': show.get('first_air_date'),
                    'poster_file': poster_file,  # Local path to the downloaded image
                    'genres': show.get('genre_ids', []),
                })
        except Exception as e:
            print(f"Error on page {page}: {str(e)}")

        # Stop once the API reports there is nothing further to fetch instead
        # of requesting pages that do not exist.
        if isinstance(last_page, int) and page >= last_page:
            break

        # Respect rate limits -- also after a failed page, so errors do not
        # turn the loop into a burst of back-to-back requests.
        time.sleep(0.3)

    return shows

def main():
    """Collect TV show metadata and posters from TMDB and write them to SAVE_DIR.

    Steps: fetch the genre id -> name mapping, fetch ``PAGES`` pages of shows,
    replace each show's genre ids with a comma-separated list of names, save
    the table as ``tv_shows_metadata.csv`` and print a short sample.

    Any exception is caught and reported as a one-line message so the cell
    finishes instead of raising.
    """
    try:
        print("Starting TMDB TV show data collection...")

        # Get genre mapping
        genre_map = get_genre_mapping()

        # Get TV shows
        shows = get_tv_shows(PAGES)

        # With no rows the DataFrame has no columns, and the 'genres' lookup
        # below would fail with an unhelpful KeyError -- report it clearly.
        if not shows:
            print("\nNo shows were fetched; nothing to save.")
            return

        # Create DataFrame
        df = pd.DataFrame(shows)

        # Map genre IDs to names
        df['genres'] = df['genres'].apply(
            lambda ids: ', '.join(genre_map.get(i, 'Unknown') for i in ids)
        )

        # Save metadata to CSV
        csv_path = os.path.join(SAVE_DIR, 'tv_shows_metadata.csv')
        df.to_csv(csv_path, index=False)
        print(f"\nSuccessfully saved {len(df)} shows to:")
        print(f"- Metadata: {csv_path}")
        print(f"- Posters: {os.path.join(SAVE_DIR, 'posters')}")

        # Display sample
        print("\nSample data:")
        print(df[['id', 'title', 'genres', 'vote_average']].head(3).to_string(index=False))

    except Exception as e:
        print(f"\nScript failed: {str(e)}")

if __name__ == "__main__":
    main()

Starting TMDB TV show data collection...
Fetching page 1/20
Fetching page 2/20
Fetching page 3/20
Fetching page 4/20
Fetching page 5/20
Fetching page 6/20
Fetching page 7/20
Fetching page 8/20
Fetching page 9/20
Fetching page 10/20
Fetching page 11/20
Fetching page 12/20
Fetching page 13/20
Fetching page 14/20
Fetching page 15/20
Fetching page 16/20
Fetching page 17/20
Fetching page 18/20
Fetching page 19/20
Fetching page 20/20

Successfully saved 400 shows to:
- Metadata: tmdb_data/tv_shows_metadata.csv
- Posters: tmdb_data/posters

Sample data:
   id                                   title       genres  vote_average
 2261 The Tonight Show Starring Johnny Carson         Talk         7.410
22980 Watch What Happens Live with Andy Cohen Talk, Comedy         5.000
59941  The Tonight Show Starring Jimmy Fallon Comedy, Talk         5.865


In [17]:
import requests
import pandas as pd
import time
import os
from urllib.request import urlretrieve

# --- CONFIG ---
# The API key is a credential: prefer supplying it through the TMDB_API_KEY
# environment variable so it does not have to live in the notebook.  The
# literal below is kept only as a fallback so the notebook still runs as before.
# NOTE(review): a key that has been committed/shared should be rotated.
API_KEY = os.environ.get('TMDB_API_KEY', 'd8ae884deab9fe57f68e1f96fbd8aad3')
BASE_URL = 'https://api.themoviedb.org/3'  # Root of the TMDB v3 REST API
POSTER_BASE = 'https://image.tmdb.org/t/p/w500'  # Prefix prepended to each poster_path
PAGES = 50  # Pages to fetch (each has ~20 shows)
START_DATE = '2015-01-01'  # Earliest first-air date to include
END_DATE = '2025-04-02'  # Latest first-air date to include
SAVE_DIR = 'tmdb_data'  # Output directory

# Create directories.  os.makedirs creates intermediate directories, so making
# the posters folder also creates SAVE_DIR itself.
os.makedirs(os.path.join(SAVE_DIR, 'posters'), exist_ok=True)

def get_genre_mapping(timeout=10):
    """Fetch genre ID to name mapping.

    Args:
        timeout: Seconds to wait for the API before giving up.  Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        dict mapping each genre ``id`` to its ``name``, built from the
        ``genres`` list of the ``/genre/tv/list`` response.

    Raises:
        requests.RequestException: on connection problems, timeouts, or an
            HTTP error status (via ``raise_for_status``).
    """
    url = f"{BASE_URL}/genre/tv/list"
    params = {'api_key': API_KEY}
    res = requests.get(url, params=params, timeout=timeout)
    res.raise_for_status()
    return {g['id']: g['name'] for g in res.json()['genres']}

def download_image(url, save_path):
    """Download image from URL.

    The data is written to ``save_path + '.part'`` first and only moved to the
    final name once the transfer has completed, so an interrupted download
    neither leaves a truncated image behind nor destroys a file that was
    already stored at ``save_path``.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path.

    Returns:
        True if the image was downloaded and moved into place, False if
        anything went wrong (the error is printed rather than raised).
    """
    partial_path = f"{save_path}.part"
    try:
        urlretrieve(url, partial_path)
        # Atomic on the same filesystem; overwrites an existing destination.
        os.replace(partial_path, save_path)
        return True
    except Exception as e:
        print(f"Failed to download {url}: {str(e)}")
        # Best-effort cleanup of whatever was written before the failure.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return False

def get_tv_shows(pages=1, start_date='2000-01-01', end_date='2020-12-31', timeout=10):
    """Fetch TV shows using first air date filtering, downloading their posters.

    Pages of ``/discover/tv`` are requested in order starting at 1, sorted by
    descending popularity.  A page that fails (network error, bad status,
    unexpected payload) is reported and skipped; the remaining pages are still
    tried.

    Args:
        pages: Maximum number of result pages to request.
        start_date: Sent as ``first_air_date.gte``; a date string such as
            ``'2015-01-01'``.
        end_date: Sent as ``first_air_date.lte``; same format as ``start_date``.
        timeout: Seconds to wait for each API response before treating the
            page as failed.

    Returns:
        A list with one dict per show containing ``id``, ``title``,
        ``overview``, ``popularity``, ``vote_average``, ``first_air_date``,
        ``poster_file`` (path relative to ``SAVE_DIR``, or None when the show
        has no poster or its download failed) and ``genres`` (list of genre ids).
    """
    shows = []
    url = f"{BASE_URL}/discover/tv"

    for page in range(1, pages + 1):
        print(f"Fetching page {page}/{pages}")
        params = {
            'api_key': API_KEY,
            'language': 'en-US',
            'sort_by': 'popularity.desc',
            'page': page,
            'first_air_date.gte': start_date,
            'first_air_date.lte': end_date,
        }

        last_page = None
        try:
            res = requests.get(url, params=params, timeout=timeout)
            res.raise_for_status()
            data = res.json()
            last_page = data.get('total_pages')

            for show in data['results']:
                poster_path = show.get('poster_path')
                poster_file = None
                if poster_path:
                    candidate = os.path.join('posters', f"{show['id']}.jpg")
                    # Only record the path when the file really exists on disk.
                    if download_image(f"{POSTER_BASE}{poster_path}",
                                      os.path.join(SAVE_DIR, candidate)):
                        poster_file = candidate

                shows.append({
                    'id': show['id'],
                    'title': show['name'],
                    'overview': show.get('overview'),
                    'popularity': show.get('popularity'),
                    'vote_average': show.get('vote_average'),
                    'first_air_date': show.get('first_air_date'),
                    'poster_file': poster_file,
                    'genres': show.get('genre_ids', []),
                })
        except Exception as e:
            print(f"Error on page {page}: {str(e)}")

        # A narrow date window may have fewer pages than requested; stop once
        # the API reports there is nothing further to fetch.
        if isinstance(last_page, int) and page >= last_page:
            break

        # Respect TMDb rate limits -- also after a failed page, so errors do
        # not turn the loop into a burst of back-to-back requests.
        time.sleep(0.3)

    return shows

def main():
    """Collect date-filtered TV show metadata and posters and write them to SAVE_DIR.

    Steps: fetch the genre id -> name mapping, fetch up to ``PAGES`` pages of
    shows first aired between ``START_DATE`` and ``END_DATE``, replace each
    show's genre ids with a comma-separated list of names, save the table as
    ``tv_shows_metadata.csv`` and print a short sample.

    Any exception is caught and reported as a one-line message so the cell
    finishes instead of raising.
    """
    try:
        print(f"Starting data collection from {START_DATE} to {END_DATE}...")

        # Genre mapping
        genre_map = get_genre_mapping()

        # Fetch TV shows
        shows = get_tv_shows(pages=PAGES, start_date=START_DATE, end_date=END_DATE)

        # With no rows the DataFrame has no columns, and the 'genres' lookup
        # below would fail with an unhelpful KeyError -- report it clearly.
        if not shows:
            print("\nNo shows were fetched; nothing to save.")
            return

        # Create DataFrame
        df = pd.DataFrame(shows)

        # Map genres to names
        df['genres'] = df['genres'].apply(
            lambda ids: ', '.join(genre_map.get(i, 'Unknown') for i in ids)
        )

        # Save CSV
        csv_path = os.path.join(SAVE_DIR, 'tv_shows_metadata.csv')
        df.to_csv(csv_path, index=False)

        print(f"\n✅ Saved {len(df)} shows to:")
        print(f"- Metadata: {csv_path}")
        print(f"- Posters: {os.path.join(SAVE_DIR, 'posters')}")

        print("\n🎬 Sample data:")
        print(df[['id', 'title', 'genres', 'vote_average']].head(5).to_string(index=False))

    except Exception as e:
        print(f"❌ Script failed: {str(e)}")

if __name__ == "__main__":
    main()

Starting data collection from 2015-01-01 to 2025-04-02...
Fetching page 1/50
Fetching page 2/50
Fetching page 3/50
Fetching page 4/50
Fetching page 5/50
Fetching page 6/50
Fetching page 7/50
Fetching page 8/50
Fetching page 9/50
Fetching page 10/50
Fetching page 11/50
Fetching page 12/50
Fetching page 13/50
Fetching page 14/50
Fetching page 15/50
Fetching page 16/50
Fetching page 17/50
Fetching page 18/50
Fetching page 19/50
Fetching page 20/50
Fetching page 21/50
Fetching page 22/50
Fetching page 23/50
Fetching page 24/50
Fetching page 25/50
Fetching page 26/50
Fetching page 27/50
Fetching page 28/50
Fetching page 29/50
Fetching page 30/50
Fetching page 31/50
Fetching page 32/50
Fetching page 33/50
Fetching page 34/50
Fetching page 35/50
Fetching page 36/50
Fetching page 37/50
Fetching page 38/50
Fetching page 39/50
Fetching page 40/50
Fetching page 41/50
Fetching page 42/50
Fetching page 43/50
Fetching page 44/50
Fetching page 45/50
Fetching page 46/50
Fetching page 47/50
Fetching pa