# In [1]:
import concurrent.futures
import os
import threading
import time

import pandas as pd
import requests

class MovieDataFetcher:
    """Fetch movie details and credits from the themoviedb.org v3 API into a CSV.

    One instance may be shared by several worker threads: ``self.lock``
    guards both the CSV appends and the ``processed_count`` counter.

    Args:
        api_key: API key sent as the ``api_key`` query parameter.
        output_path: CSV file that accepted movies are appended to.
        timeout: Seconds to wait for each HTTP request before giving up.
    """

    def __init__(self, api_key, output_path='../movie_details.csv', timeout=10):
        self.api_key = api_key
        self.output_path = output_path
        self.timeout = timeout
        self.lock = threading.Lock()
        self.processed_count = 0
        # Set up front so _update_progress can time the very first batch
        # instead of failing with AttributeError.
        self.batch_start_time = time.time()

    def get_movie_details(self, movie_id):
        """Return a record for ``movie_id`` or ``None`` if it is unusable.

        Two requests are made: the movie itself and its credits. The result
        is ``None`` when either request fails or when the title, release
        date, director list or cast list is missing/empty.

        Returns:
            dict with keys "Movie ID", "Movie Name", "Release Date",
            "Directors" (list of names) and "Cast" (list of names), or None.
        """
        url = f"https://api.themoviedb.org/3/movie/{movie_id}"
        params = {"api_key": self.api_key, "language": "en-US"}

        movie_data = self._make_request(url, params)
        if movie_data is None:
            return None

        credits_data = self._make_request(f"{url}/credits", {"api_key": self.api_key})
        if credits_data is None:
            return None

        movie_name = movie_data.get("title")
        release_date = movie_data.get("release_date")

        # Tolerate absent/null sections and entries without a name or job so
        # that one odd payload yields "no record" rather than a KeyError.
        crew = credits_data.get("crew") or []
        cast_members = credits_data.get("cast") or []
        directors = [
            member["name"]
            for member in crew
            if member.get("job") == "Director" and member.get("name")
        ]
        cast = [member["name"] for member in cast_members if member.get("name")]

        # Every field is required; an empty string or empty list rejects the movie.
        if not all([movie_name, release_date, directors, cast]):
            return None

        return {
            "Movie ID": movie_id,
            "Movie Name": movie_name,
            "Release Date": release_date,
            "Directors": directors,
            "Cast": cast,
        }

    def _make_request(self, url, params):
        """GET ``url`` and return the decoded JSON body, or ``None`` on failure.

        Failure covers network errors and timeouts, any status code other
        than 200, and a body that is not valid JSON.
        """
        try:
            response = requests.get(url, params=params, timeout=self.timeout)
        except requests.RequestException:
            return None
        if response.status_code != 200:
            return None
        try:
            return response.json()
        except ValueError:
            # The JSON decode errors raised by requests derive from ValueError.
            return None

    def process_movie_id(self, movie_id):
        """Fetch one movie and, if a record was produced, append it and count it."""
        details = self.get_movie_details(movie_id)
        if details:
            self._write_to_file(details)
            with self.lock:
                self.processed_count += 1

    def _write_to_file(self, movie_details):
        """Append a single record to the output CSV under the instance lock."""
        with self.lock:
            with open(self.output_path, 'ab') as file:
                # In append mode the position is the current file size, so the
                # header row is written only when the file is still empty.
                pd.DataFrame([movie_details]).to_csv(
                    file, header=file.tell() == 0, index=False, mode='a'
                )

    def _update_progress(self, total_count):
        """Increment the counter and print timing every 1000 items and at the end.

        Note that this method increments ``processed_count`` itself, so it
        must not be combined with the increment in ``process_movie_id`` for
        the same item.
        """
        with self.lock:
            self.processed_count += 1
            if self.processed_count % 1000 == 0 or self.processed_count == total_count:
                time_taken = time.time() - self.batch_start_time
                print(f"Processed {self.processed_count}/{total_count} movies in {time_taken:.2f} seconds")
                self.batch_start_time = time.time()


# SECURITY NOTE: an API key committed to source should be treated as leaked
# and rotated. Prefer supplying it via the TMDB_API_KEY environment variable;
# the literal is kept only as a fallback so existing runs behave the same.
api_key = os.environ.get("TMDB_API_KEY") or "6c72b652e1a5b56a3ef1c5989cf1d128"
fetcher = MovieDataFetcher(api_key)

# Input: a CSV with an 'id' column holding the movie ids to fetch.
df_ids = pd.read_csv('cleaned_movie_ids.csv')

failed_count = 0
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(fetcher.process_movie_id, movie_id)
        for movie_id in df_ids['id']
    ]
    # Inspect every future so that an exception raised inside a worker is
    # counted and reported instead of being silently discarded; a failing
    # movie still does not abort the run.
    for future in concurrent.futures.as_completed(futures):
        if future.exception() is not None:
            failed_count += 1

print(f"Total movies recorded to dataset: {fetcher.processed_count}")
if failed_count:
    print(f"Movies skipped because a worker raised an exception: {failed_count}")

# Output: Total movies recorded to dataset: 554884
