In [1]:
import getpass
import os

import pandas as pd
import requests

## **STEP 1: FETCHING MOVIE IDS**

In [None]:
def fetch_movie_ids(api_key, total_pages=500, timeout=30):
    """Collect movie ids from the TMDB ``/discover/movie`` endpoint.

    Pages ``1..total_pages`` are requested in order. Fetching stops at the
    first page that cannot be retrieved (network error, non-200 status or
    undecodable body); the ids gathered up to that point are still returned.

    Args:
        api_key: Bearer token sent in the ``Authorization`` header.
        total_pages: Number of result pages to request, starting at page 1.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        list: The ``id`` of every result, in the order the API returned them.
    """
    base_url = "https://api.themoviedb.org/3/discover/movie"
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    movie_ids = []
    for page in range(1, total_pages + 1):
        try:
            # Without a timeout a single stalled connection blocks forever.
            response = requests.get(
                base_url, headers=headers, params={"page": page}, timeout=timeout
            )
            if response.status_code != 200:
                print(f"Failed to fetch data for page {page}: {response.status_code}")
                break
            data = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Keep what was already collected instead of losing it to a traceback.
            print(f"Failed to fetch data for page {page}: {exc}")
            break

        # A 200 response without a usable 'results' list contributes nothing
        # rather than raising KeyError/TypeError.
        movie_ids.extend(
            result["id"] for result in (data.get("results") or []) if "id" in result
        )

        # Progress report every tenth page.
        if page % 10 == 0:
            print(f"Fetched page {page}/{total_pages}")
    return movie_ids


# Read the TMDB read-access token from the TMDB_API_KEY environment variable,
# falling back to an interactive prompt, so the credential is never stored in
# the notebook or its saved outputs.
# NOTE: the token that used to be pasted here must be treated as leaked and rotated.
api_key = os.environ.get("TMDB_API_KEY") or getpass.getpass("TMDB API read access token: ")

# Per the original author's note, 500 pages contain all of the movie data.
movie_ids = fetch_movie_ids(api_key, total_pages=500)

print(f"\nTotal movie IDs fetched: {len(movie_ids)}")


Fetched page 10/500
Fetched page 20/500
Fetched page 30/500
Fetched page 40/500
Fetched page 50/500
Fetched page 60/500
Fetched page 70/500
Fetched page 80/500
Fetched page 90/500
Fetched page 100/500
Fetched page 110/500
Fetched page 120/500
Fetched page 130/500
Fetched page 140/500
Fetched page 150/500
Fetched page 160/500
Fetched page 170/500
Fetched page 180/500
Fetched page 190/500
Fetched page 200/500


## **STEP 2: CREATING DATAFRAME FUNCTIONS**

### **1. Main Dataframe**

In [10]:
# Build (or extend) the main movie-details DataFrame
def create_movie_dataframe(movie_id, movie_data_list, existing_df=None):
    """Flatten movie-detail dicts into a DataFrame, one row per dict.

    Args:
        movie_id: Value written to the ``movie_id`` column of every row
            produced by this call.
        movie_data_list: Iterable of dicts holding movie details. Scalar
            fields that are absent become ``None``.
        existing_df: Optional DataFrame. When given, the new rows are appended
            below it and the index is renumbered from 0.

    Returns:
        pandas.DataFrame: The new rows, preceded by ``existing_df`` if supplied.
    """

    def join_names(entries, key='name'):
        """Return the ``key`` value of each entry as one comma-separated string."""
        # `entries or []` covers a field that is present but null; entries
        # lacking `key` are skipped instead of raising KeyError.
        values = (entry.get(key) for entry in entries or [])
        return ', '.join(value for value in values if value is not None)

    formatted_movie_data = [
        {
            'movie_id': movie_id,
            'title': movie.get('title'),
            'release_date': movie.get('release_date'),
            'budget': movie.get('budget'),
            'revenue': movie.get('revenue'),
            'runtime': movie.get('runtime'),
            'genres': join_names(movie.get('genres')),
            'homepage': movie.get('homepage'),
            'imdb_id': movie.get('imdb_id'),
            'original_language': movie.get('original_language'),
            'original_title': movie.get('original_title'),
            'overview': movie.get('overview'),
            'popularity': movie.get('popularity'),
            'production_companies': join_names(movie.get('production_companies')),
            'production_countries': join_names(movie.get('production_countries')),
            # Spoken languages are reported by their English name.
            'spoken_languages': join_names(movie.get('spoken_languages'), key='english_name'),
            'status': movie.get('status'),
            'tagline': movie.get('tagline'),
            'vote_average': movie.get('vote_average'),
            'vote_count': movie.get('vote_count'),
        }
        for movie in movie_data_list
    ]

    df = pd.DataFrame(formatted_movie_data)

    # If an existing DataFrame is provided, append the new data to it
    if existing_df is not None:
        df = pd.concat([existing_df, df], ignore_index=True)
    return df

### **2. Credit Dataframe**

In [11]:
# Build (or extend) the credits DataFrame
def create_credits_dataframe(credits_data, existing_df=None):
    """Wrap one credits payload in a single-row DataFrame.

    The row has the columns ``movie_id`` (taken from ``credits_data['id']``),
    ``cast`` and ``crew``; the latter two are stored exactly as received.
    When ``existing_df`` is supplied the row is appended to it and the index
    is renumbered.
    """
    column_sources = (('movie_id', 'id'), ('cast', 'cast'), ('crew', 'crew'))
    row = {column: credits_data[source] for column, source in column_sources}
    credits_frame = pd.DataFrame([row])

    if existing_df is None:
        return credits_frame

    # Stack the new row underneath the rows collected so far.
    return pd.concat([existing_df, credits_frame], ignore_index=True)

### **3. Keyword Dataframe**

In [12]:
# Build (or extend) the keywords DataFrame
def create_keywords_dataframe(movie_id, movie_title, keywords_data, existing_df=None):
    """Wrap one keywords payload in a single-row DataFrame.

    The row has the columns ``movie_id``, ``movie_title`` and ``keywords``,
    where ``keywords`` is the list of ``name`` values found under
    ``keywords_data['keywords']``. When ``existing_df`` is supplied the row is
    appended to it and the index is renumbered.
    """
    names = []
    for entry in keywords_data['keywords']:
        names.append(entry['name'])

    record = dict(movie_id=movie_id, movie_title=movie_title, keywords=names)
    keywords_frame = pd.DataFrame([record])

    if existing_df is None:
        return keywords_frame

    # Stack the new row underneath the rows collected so far.
    return pd.concat([existing_df, keywords_frame], ignore_index=True)

## **STEP 3: ITERATING OVER THE MOVIE ID LIST CREATED ABOVE**

1. Ref for movie details: https://developer.themoviedb.org/reference/movie-details
2. Ref for movie credits: https://developer.themoviedb.org/reference/movie-credits
3. Ref for movie keywords: https://developer.themoviedb.org/reference/movie-keywords

In [13]:
# Start with no accumulated data; the collection loop fills these in.
movie_df = credits_df = keywords_df = None

In [None]:
# To process the ids in batches, loop over a slice such as movie_ids[3000:4000].
# `api_key` and `movie_ids` come from the STEP 1 cell; the token is deliberately
# not repeated here so it is not stored a second time in the notebook.

REQUEST_TIMEOUT = 30  # seconds to wait for each HTTP request


def _fetch_json(session, url, failure_label, movie_id):
    """Return the decoded JSON body of ``url``, or None when the request fails.

    Any failure (network error, non-200 status, undecodable body) is reported
    with ``failure_label`` and ``movie_id`` and does not interrupt the caller.
    """
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"Failed to fetch {failure_label} {movie_id}: {response.status_code}")
            return None
        return response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"Failed to fetch {failure_label} {movie_id}: {exc}")
        return None


# Collect one small frame per movie and concatenate once at the end; calling
# pd.concat on the growing frame for every movie is quadratic in the row count.
# Frames left over from an earlier run of this cell are kept as the first element.
movie_frames = [] if movie_df is None else [movie_df]
credits_frames = [] if credits_df is None else [credits_df]
keywords_frames = [] if keywords_df is None else [keywords_df]

try:
    # One session reuses the underlying connection and carries the headers
    # for every request.
    with requests.Session() as session:
        session.headers.update({
            "accept": "application/json",
            "Authorization": f"Bearer {api_key}",
        })

        for movie_id in movie_ids:  # movie_ids is the list created above
            movie_url = f"https://api.themoviedb.org/3/movie/{movie_id}"

            # Movie details
            movie_data = _fetch_json(session, movie_url, "movie data for ID", movie_id)
            if movie_data is not None:
                movie_frames.append(create_movie_dataframe(movie_id, [movie_data]))

            # Movie credits
            credits_data = _fetch_json(
                session, f"{movie_url}/credits", "credits data for movie ID", movie_id
            )
            if credits_data is not None:
                credits_frames.append(create_credits_dataframe(credits_data))

            # Movie keywords
            keywords_data = _fetch_json(
                session, f"{movie_url}/keywords", "keywords data for movie ID", movie_id
            )
            if keywords_data is not None:
                # Take the title only from THIS movie's details; if that request
                # failed, record None rather than the previous movie's title.
                movie_title = movie_data.get('title') if movie_data is not None else None
                keywords_frames.append(
                    create_keywords_dataframe(movie_id, movie_title, keywords_data)
                )
finally:
    # Runs on interruption or an unexpected error too, so a partial crawl is kept.
    if movie_frames:
        movie_df = pd.concat(movie_frames, ignore_index=True)
    if credits_frames:
        credits_df = pd.concat(credits_frames, ignore_index=True)
    if keywords_frames:
        keywords_df = pd.concat(keywords_frames, ignore_index=True)


print("Movie DataFrame Sucessfully Created")
print("\nCredits DataFrame Sucessfully Created")
print("\nKeywords DataFrame Sucessfully Created")

Movie DataFrame Sucessfully Created

Credits DataFrame Sucessfully Created

Keywords DataFrame Sucessfully Created


In [57]:
# Preview the first five collected movies.
movie_df.head(n=5)

Unnamed: 0,movie_id,title,release_date,budget,revenue,runtime,genres,homepage,imdb_id,original_language,original_title,overview,popularity,production_companies,production_countries,spoken_languages,status,tagline,vote_average,vote_count
0,823464,Godzilla x Kong: The New Empire,2024-03-27,150000000,558503759,115,"Science Fiction, Action, Adventure",https://www.godzillaxkongmovie.com,tt14539740,en,Godzilla x Kong: The New Empire,"Following their explosive showdown, Godzilla a...",3853.79,Legendary Pictures,United States of America,English,Released,Rise together or fall alone.,7.275,2215
1,929590,Civil War,2024-04-10,50000000,113069206,109,"War, Action, Drama",https://a24films.com/films/civil-war,tt17279496,en,Civil War,"In the near future, a group of war journalists...",1977.095,"DNA Films, IPR.VC, A24","Finland, United Kingdom, United States of America",English,Released,Welcome to the frontline.,7.268,947
2,653346,Kingdom of the Planet of the Apes,2024-05-08,160000000,298000000,145,"Science Fiction, Adventure, Action",https://www.20thcenturystudios.com/movies/king...,tt11389872,en,Kingdom of the Planet of the Apes,Several generations in the future following Ca...,2592.421,"20th Century Studios, Oddball Entertainment, J...",United States of America,English,Released,No one can stop the reign.,7.178,607
3,746036,The Fall Guy,2024-04-24,125000000,145744920,126,"Action, Comedy",https://www.thefallguymovie.com,tt1684562,en,The Fall Guy,"Fresh off an almost career-ending accident, st...",2319.708,"87North Productions, Entertainment 360, Univer...",United States of America,"English, French",Released,Fall hard.,7.338,775
4,786892,Furiosa: A Mad Max Saga,2024-05-22,150000000,25550000,149,"Action, Adventure, Science Fiction",https://www.furiosaamadmaxsaga.com,tt12037194,en,Furiosa: A Mad Max Saga,"As the world fell, young Furiosa is snatched f...",1464.114,"Warner Bros. Pictures, Kennedy Miller Mitchell...","Australia, United States of America",English,Released,Fury is born.,7.744,352


In [None]:
# Write each collected DataFrame to its own CSV file, without the index column.
for frame, csv_name in (
    (movie_df, 'movie_dataframe.csv'),
    (credits_df, 'credits_dataframe.csv'),
    (keywords_df, 'keywords_dataframe.csv'),
):
    frame.to_csv(csv_name, index=False)