# 1. Data Collection


## Data Collection


In [2]:
#Imports
import requests
import time
import pandas as pd
import os
from dotenv import load_dotenv


Helper functions and required variables

In [3]:
# Required variables and helper functions

# load_dotenv() populates the process environment from a .env file if one is
# found, so the key can live outside the notebook.
load_dotenv()
API_KEY = os.getenv("TMDB_API_KEY")
if not API_KEY:
    # Fail fast with a clear message instead of an opaque HTTP error raised by
    # the first API call further down.
    raise RuntimeError(
        "TMDB_API_KEY is not set; define it in the environment or in a .env file."
    )

# Root URL that every TMDb endpoint path below is appended to.
BASE_URL = "https://api.themoviedb.org/3"
# Directory the raw JSON export is written to (relative path).
RAW_DIR = "../data/raw/"

def get_genre_mapping(timeout=10):
    """Fetch the TMDb movie genre list and return it as an ``{id: name}`` dict.

    Args:
        timeout: Seconds to wait for the API before giving up. ``requests``
            applies no timeout by default, so without this a stalled
            connection would block the notebook indefinitely.

    Returns:
        dict: Genre id mapped to genre name, built from the ``genres`` array
        of the ``/genre/movie/list`` response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = f"{BASE_URL}/genre/movie/list"
    response = requests.get(url, params={"api_key": API_KEY}, timeout=timeout)
    response.raise_for_status()
    return {g["id"]: g["name"] for g in response.json()["genres"]}

# def get_top_movies_for_year(year):
#     all_movies = []
#     for page in range(1, 6):  # 100 movies: 5 pages * 20 movies
#         url = f"{BASE_URL}/discover/movie"
#         params = {
#             "api_key": API_KEY,
#             "sort_by": "popularity.desc",
#             "primary_release_year": year,
#             "page": page
#         }
#         response = requests.get(url, params=params)
#         response.raise_for_status()
#         all_movies.extend(response.json()["results"])
#         time.sleep(0.3)  # Avoid hitting rate limits
#     return all_movies
def get_most_rated_movies_for_year(year, pages=5, top_n=75, timeout=10):
    """Return the most-voted movies released in ``year`` from TMDb discover.

    Requests ``pages`` result pages sorted by ``vote_count`` descending,
    concatenates them, and keeps the ``top_n`` entries with the highest
    ``vote_count``. The defaults reproduce the original behaviour: 5 pages,
    trimmed to 75 movies.

    Args:
        year: Value sent as ``primary_release_year``.
        pages: Number of result pages to request, starting at page 1.
        top_n: Maximum number of movies to return.
        timeout: Seconds to wait for each request; ``requests`` has no default
            timeout, so omitting it could hang the notebook on a stalled
            connection.

    Returns:
        list[dict]: Raw movie dicts from the API's ``results`` arrays, sorted
        by ``vote_count`` descending, at most ``top_n`` long.

    Raises:
        requests.HTTPError: If any page request returns an error status.
        requests.Timeout: If a request exceeds ``timeout`` seconds.
    """
    url = f"{BASE_URL}/discover/movie"  # same endpoint for every page
    all_movies = []
    for page in range(1, pages + 1):
        params = {
            "api_key": API_KEY,
            "sort_by": "vote_count.desc",  # Sort by number of ratings
            "primary_release_year": year,
            "page": page,
        }
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        all_movies.extend(response.json()["results"])
        time.sleep(0.3)  # brief pause between requests to avoid rate limits

    # Re-sort the combined pages and keep only the top_n by vote_count.
    all_movies.sort(key=lambda movie: movie["vote_count"], reverse=True)
    return all_movies[:top_n]


# Fetch the genre id -> name lookup once; it is reused for every movie below.
genre_map = get_genre_mapping()


Call the TMDb API to collect the most-rated movies for each year from 2020 through 2024.

In [4]:
# Calling the API

# Collect data for years 2020–2024
movie_records = []
for year in range(2020, 2025):
    print(f"Collecting {year}...")
    movies = get_most_rated_movies_for_year(year)
    for movie in movies:
        # Read genre_ids once with a default so the raw ids and the resolved
        # names come from the same list and a missing key cannot raise KeyError.
        genre_ids = movie.get("genre_ids", [])
        genre_names = [genre_map.get(gid, "Unknown") for gid in genre_ids]
        movie_records.append({
            "movie_id": movie["id"],
            "title": movie["title"],
            "year": year,
            "popularity": movie["popularity"],
            "vote_average": movie["vote_average"],
            "vote_count": movie["vote_count"],
            "genre_ids": genre_ids,
            "genres": genre_names
        })

# Convert to DataFrame
df = pd.DataFrame(movie_records)

# Make sure the target directory exists, and build the path with os.path.join
# so RAW_DIR's trailing slash does not produce a doubled separator.
os.makedirs(RAW_DIR, exist_ok=True)
output_path = os.path.join(RAW_DIR, "movies_2020_2024.json")
df.to_json(output_path, orient="records", indent=2)
print(f"collected {len(df)} movie data")
print("✅ Data collection complete.")


Collecting 2020...
Collecting 2021...
Collecting 2022...
Collecting 2023...
Collecting 2024...
collected 375 movie data
✅ Data collection complete.
