In [10]:
import requests
import pandas as pd
import os
from dotenv import load_dotenv
from tqdm import tqdm
import time
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
)

import matplotlib.pyplot as plt
import seaborn as sns

load_dotenv()
API_KEY = os.getenv("TMDB_API_KEY")

BASE_URL = "https://api.themoviedb.org/3"

In [11]:
def get_movies_from_page(page):
    """Fetch one page of movie results."""
    url = f"{BASE_URL}/discover/movie"
    params = {
        "api_key": API_KEY,
        "sort_by": "vote_count.desc",
        "page": page,
        "include_adult": False
    }
    return requests.get(url, params=params).json()

def get_movie_details(movie_id):
    """Fetch metadata for a movie."""
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {"api_key": API_KEY}
    return requests.get(url, params=params).json()

def get_movie_credits(movie_id):
    """Fetch cast/crew info."""
    url = f"{BASE_URL}/movie/{movie_id}/credits"
    params = {"api_key": API_KEY}
    return requests.get(url, params=params).json()

In [12]:
movies = []
PAGES_TO_SCRAPE = 10   # change to 15–20 if you want ~400–600 movies

for page in tqdm(range(1, PAGES_TO_SCRAPE + 1)):
    page_data = get_movies_from_page(page)
    if "results" not in page_data:
        continue
    
    for movie in page_data["results"]:
        movie_id = movie["id"]

        # Fetch additional metadata
        details = get_movie_details(movie_id)
        credits = get_movie_credits(movie_id)
        time.sleep(0.2)  # prevent rate limits

        cast_count = len(credits.get("cast", []))

        movie_record = {
            "id": movie_id,
            "title": details.get("title"),
            "release_year": int(details.get("release_date", "0000")[:4]) if details.get("release_date") else None,
            "budget": details.get("budget"),
            "revenue": details.get("revenue"),
            "runtime": details.get("runtime"),
            "popularity": details.get("popularity"),
            "vote_average": details.get("vote_average"),
            "vote_count": details.get("vote_count"),
            "genre_ids": [g["id"] for g in details.get("genres", [])],
            "production_companies": len(details.get("production_companies", [])),
            "cast_size": cast_count
        }

        movies.append(movie_record)

df = pd.DataFrame(movies)
df.head()

100%|███████████████████████████████████████████| 10/10 [00:01<00:00,  8.52it/s]


In [4]:
# Drop basic missing data
df = df.dropna(subset=["vote_average", "vote_count", "release_year"])

# Feature: Movie age
df["movie_age"] = 2025 - df["release_year"]

# Clean genres → one-hot encode
df["genre_ids"] = df["genre_ids"].apply(lambda x: x if isinstance(x, list) else [])
all_genres = sorted({g for glist in df["genre_ids"] for g in glist})

for g in all_genres:
    df[f"genre_{g}"] = df["genre_ids"].apply(lambda lst: 1 if g in lst else 0)

# Supervised label: timeless movie
df["timeless"] = df.apply(
    lambda row: 1 if (row["vote_average"] > 8.0 and row["vote_count"] > 20000) else 0,
    axis=1
)

df.head()


KeyError: ['vote_average', 'vote_count', 'release_year']

In [17]:
import requests
import os
from dotenv import load_dotenv

load_dotenv()
API_KEY = os.getenv("TMDB_API_KEY")

url = f"https://api.themoviedb.org/3/discover/movie?api_key={API_KEY}&sort_by=vote_count.desc"

response = requests.get(url)
print(response.status_code)
print(response.json())


401
{'status_code': 7, 'status_message': 'Invalid API key: You must be granted a valid key.', 'success': False}


In [18]:
print("API KEY:", API_KEY)


API KEY: None
