In [1]:
import asyncio
import json
import os
import tracemalloc

import aiohttp
import nest_asyncio
import pandas as pd
import requests

In [2]:
#Get top rated movies from API
def top_rated_movies(api_key):
    """Download TMDB's top-rated movie list, page by page, into a DataFrame.

    Parameters
    ----------
    api_key : str
        Token sent in the ``Authorization`` header as a Bearer token.

    Returns
    -------
    pandas.DataFrame or None
        One row per movie with the columns listed in ``fields`` below, or
        ``None`` if any HTTP request fails (the error is printed).
    """
    base_url = "https://api.themoviedb.org/3/movie/top_rated"
    language = "en-US"

    # Fields copied from every entry of the response's "results" array.
    fields = [
        'id', 'genre_ids', 'original_language', 'original_title', 'title',
        'release_date', 'adult', 'popularity', 'vote_average', 'vote_count',
    ]

    # NOTE(review): TMDB list endpoints are understood to reject page numbers
    # above 500; capping avoids a late HTTP error that would discard every row
    # already collected. Confirm against the current API documentation.
    max_page = 500
    # Without a timeout a stalled connection would block the cell forever.
    timeout_seconds = 30

    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    top_rated_list = []

    try:
        # A single session reuses the connection across all page requests.
        with requests.Session() as session:
            session.headers.update(headers)

            page = 1
            total_pages = 1  # replaced by the real value once page 1 arrives
            while page <= total_pages:
                response = session.get(
                    base_url,
                    params={"language": language, "page": page},
                    timeout=timeout_seconds,
                )
                # Checked on every request (including the first) so an auth or
                # server error is reported here instead of surfacing as a
                # KeyError on a missing "total_pages".
                response.raise_for_status()
                data = response.json()

                if page == 1:
                    # Page 1 is parsed below as well, so it is fetched only once.
                    total_pages = min(data.get("total_pages", 1), max_page)

                for movie in data.get("results", []):
                    # .get() leaves a missing field as None rather than raising.
                    top_rated_list.append({field: movie.get(field) for field in fields})

                page += 1

    except requests.exceptions.RequestException as e:
        print(f"Error in API request: {e}")
        return None

    # Explicit columns keep the schema stable even when no rows were returned.
    return pd.DataFrame(top_rated_list, columns=fields)

In [3]:
# Read the TMDB bearer token from the environment rather than pasting it into
# the notebook; falls back to the original empty string when the variable is
# not set.
api_key = os.environ.get("TMDB_API_KEY", "")
df = top_rated_movies(api_key)

df

Unnamed: 0,id,genre_ids,original_language,original_title,title,release_date,adult,popularity,vote_average,vote_count
0,278,"[18, 80]",en,The Shawshank Redemption,The Shawshank Redemption,1994-09-23,False,141.057,8.711,25455
1,238,"[18, 80]",en,The Godfather,The Godfather,1972-03-14,False,118.977,8.708,19375
2,240,"[18, 80]",en,The Godfather Part II,The Godfather Part II,1974-12-20,False,84.422,8.591,11689
3,424,"[18, 36, 10752]",en,Schindler's List,Schindler's List,1993-12-15,False,71.107,8.572,15061
4,389,[18],en,12 Angry Men,12 Angry Men,1957-04-10,False,48.275,8.547,7988
...,...,...,...,...,...,...,...,...,...,...
9129,13805,"[35, 878]",en,Disaster Movie,Disaster Movie,2008-08-29,False,27.506,3.223,976
9130,5491,"[28, 12, 878]",en,Battlefield Earth,Battlefield Earth,2000-05-12,False,19.555,3.218,781
9131,11059,"[27, 28, 53]",en,House of the Dead,House of the Dead,2003-04-11,False,12.983,3.128,366
9132,14164,"[28, 12, 14, 878, 53]",en,Dragonball Evolution,Dragonball Evolution,2009-03-12,False,18.847,2.901,1950


In [4]:
# Plain Python list of the 'id' column, in df's row order; the async fetchers
# in the following cells iterate over it.
# NOTE: top_rated_movies returns None when a request fails, in which case this
# line raises TypeError.
movie_ids = df['id'].tolist()

In [5]:
#get movie details from API

# nest_asyncio patches asyncio to allow nested event loops, so the
# asyncio.run() call in run_async_code can be used from inside the notebook.
nest_asyncio.apply()

#tracemalloc.start()

async def fetch_movie_info(session, api_key, movie_id, max_retries=5, retry_delay=60):
    """Fetch budget, revenue and runtime for one movie from the TMDB details endpoint.

    Parameters
    ----------
    session : aiohttp.ClientSession
        Open session used to issue the GET request.
    api_key : str
        Token sent in the ``Authorization`` header as a Bearer token.
    movie_id : int
        Id interpolated into the request URL.
    max_retries : int, optional
        Number of additional attempts made after an HTTP 429 before giving up.
    retry_delay : float, optional
        Seconds to wait before retrying when the 429 response carries no
        numeric ``Retry-After`` header.

    Returns
    -------
    dict or None
        ``{'movie_id', 'movie_budget', 'movie_revenue', 'movie_runtime'}`` on
        HTTP 200; ``None`` on any other status or when the rate limit persists
        after ``max_retries`` retries.
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=en-US"

    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        async with session.get(url, headers=headers) as response:
            if response.status == 200:
                movie_data = await response.json()
                return {
                    'movie_id': movie_data.get('id'),
                    'movie_budget': movie_data.get('budget'),
                    'movie_revenue': movie_data.get('revenue'),
                    'movie_runtime': movie_data.get('runtime')
                }

            if response.status != 429:
                print(f"Error for movie ID {movie_id}: {response.status}")
                try:
                    error_content = await response.text()
                    print(f"Error content: {error_content}")
                except Exception as e:
                    print(f"Failed to retrieve error content: {e}")
                return None

            # HTTP 429: prefer the server-provided wait time when it is a
            # plain number of seconds, otherwise use the configured default.
            try:
                delay = max(float(response.headers.get("Retry-After")), 0.0)
            except (TypeError, ValueError):
                delay = retry_delay

        # The response context is closed before sleeping, so the wait does not
        # keep the response open; a loop (not recursion) bounds the retries.
        if attempt + 1 < attempts:
            print(f"Rate limit exceeded. Retrying after a delay for movie ID {movie_id}")
            await asyncio.sleep(delay)

    print(f"Rate limit still exceeded after {max_retries} retries for movie ID {movie_id}")
    return None

async def get_movie_info_async(movie_ids, api_key):
    """Fetch details for every movie id concurrently and return them as a DataFrame.

    Parameters
    ----------
    movie_ids : iterable of int
        Ids passed one by one to ``fetch_movie_info``.
    api_key : str
        Token forwarded to ``fetch_movie_info``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully fetched movie with the columns ``movie_id``,
        ``movie_budget``, ``movie_revenue`` and ``movie_runtime``.
    """
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_movie_info(session, api_key, movie_id) for movie_id in movie_ids]
        movie_details_list = await asyncio.gather(*tasks)

    # fetch_movie_info returns None for failed lookups. A list that mixes
    # dicts and None cannot be turned into a well-formed frame, so failures
    # are dropped before construction.
    fetched = [details for details in movie_details_list if details is not None]

    # Explicit columns keep the schema stable even if every lookup failed.
    columns = ['movie_id', 'movie_budget', 'movie_revenue', 'movie_runtime']
    return pd.DataFrame(fetched, columns=columns)

def run_async_code(api_key, movie_ids, output_path=r'C:\Users\PESH\Desktop\DE Data Sets\TMDB\movie_details.csv'):
    """Run the movie-details download to completion and save the result as CSV.

    Parameters
    ----------
    api_key : str
        Token forwarded to ``get_movie_info_async``.
    movie_ids : iterable of int
        Movie ids to look up.
    output_path : str or path-like or None, optional
        Destination CSV file. Defaults to the original hard-coded location so
        existing calls behave as before; pass another path to write elsewhere,
        or ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``get_movie_info_async``.
    """
    result_df = asyncio.run(get_movie_info_async(movie_ids, api_key))
    if output_path is not None:
        result_df.to_csv(output_path, index=False)
    return result_df

# call function
# api_key and movie_ids are already bound by earlier cells, so they are passed
# straight through; the returned frame is the cell's displayed output.
run_async_code(api_key, movie_ids)

Unnamed: 0,movie_id,movie_budget,movie_revenue,movie_runtime
0,278,25000000,28341469,142
1,238,6000000,245066411,175
2,240,13000000,102600000,202
3,424,22000000,321365567,195
4,389,350000,1000000,97
...,...,...,...,...
9129,13805,25000000,14109284,87
9130,5491,44000000,29725663,117
9131,11059,12000000,13818181,90
9132,14164,30000000,58228460,85


In [6]:
#get genre info from API

# nest_asyncio patches asyncio to allow nested event loops, so the
# asyncio.run() call in run_async_code can be used from inside the notebook.
nest_asyncio.apply()

async def fetch_genre_info(session, api_key, movie_id, genre_data_list, max_retries=5, retry_delay=60):
    """Fetch one movie's genres and append one row per genre to ``genre_data_list``.

    Parameters
    ----------
    session : aiohttp.ClientSession
        Open session used to issue the GET request.
    api_key : str
        Token sent in the ``Authorization`` header as a Bearer token.
    movie_id : int
        Id interpolated into the request URL and stored in every row.
    genre_data_list : list
        Shared accumulator; receives dicts with the keys ``movie_id``,
        ``genre_id`` and ``genre_name``.
    max_retries : int, optional
        Number of additional attempts made after an HTTP 429 before giving up.
    retry_delay : float, optional
        Seconds to wait before retrying when the 429 response carries no
        numeric ``Retry-After`` header.

    Returns
    -------
    dict or None
        A copy of the last row appended for this movie, or ``None`` when the
        movie has no genres or the request did not succeed.
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=en-US"

    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        async with session.get(url, headers=headers) as response:
            if response.status == 200:
                data = await response.json()

                # 'genres' can be missing or null; treat both as "no genres"
                # instead of failing when iterating over None.
                rows = [
                    {'movie_id': movie_id, 'genre_id': genre['id'], 'genre_name': genre['name']}
                    for genre in (data.get('genres') or [])
                ]
                genre_data_list.extend(rows)

                # Same return contract as before: the last genre row, or None.
                return dict(rows[-1]) if rows else None

            if response.status != 429:
                print(f"Error for movie ID {movie_id}: {response.status}")
                try:
                    error_content = await response.text()
                    print(f"Error content: {error_content}")
                except Exception as e:
                    print(f"Failed to retrieve error content: {e}")
                return None

            # HTTP 429: retry like fetch_movie_info does, preferring the
            # server-provided wait time when it is a plain number of seconds.
            try:
                delay = max(float(response.headers.get("Retry-After")), 0.0)
            except (TypeError, ValueError):
                delay = retry_delay

        # Sleep only after the response context has been closed.
        if attempt + 1 < attempts:
            print(f"Rate limit exceeded. Retrying after a delay for movie ID {movie_id}")
            await asyncio.sleep(delay)

    print(f"Rate limit still exceeded after {max_retries} retries for movie ID {movie_id}")
    return None


async def get_genre_info_async(movie_ids, api_key):
    """Collect genre rows for all movie ids concurrently and return them as a DataFrame.

    Every ``fetch_genre_info`` coroutine appends its rows to one shared list;
    the values the coroutines return are not used here.
    """
    collected_rows = []

    async with aiohttp.ClientSession() as http:
        pending = []
        for mid in movie_ids:
            pending.append(fetch_genre_info(http, api_key, mid, collected_rows))
        await asyncio.gather(*pending)

    genre_frame = pd.DataFrame(collected_rows)
    return genre_frame

def run_async_code(api_key, movie_ids, output_path=r'C:\Users\PESH\Desktop\DE Data Sets\TMDB\genre_details.csv'):
    """Run the genre download to completion and save the result as CSV.

    Parameters
    ----------
    api_key : str
        Token forwarded to ``get_genre_info_async``.
    movie_ids : iterable of int
        Movie ids to look up.
    output_path : str or path-like or None, optional
        Destination CSV file. Defaults to the original hard-coded location so
        existing calls behave as before; pass another path to write elsewhere,
        or ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``get_genre_info_async``.
    """
    result_df = asyncio.run(get_genre_info_async(movie_ids, api_key))
    if output_path is not None:
        result_df.to_csv(output_path, index=False)
    return result_df

# call function
# Reuse the api_key and movie_ids bound by earlier cells. Re-assigning
# api_key = "" here would discard whatever token was configured earlier in the
# notebook, and movie_ids = movie_ids was a no-op.
run_async_code(api_key, movie_ids)

Unnamed: 0,movie_id,genre_id,genre_name
0,497,14,Fantasy
1,497,18,Drama
2,497,80,Crime
3,14537,28,Action
4,14537,18,Drama
...,...,...,...
23928,482981,18,Drama
23929,1635,28,Action
23930,1635,53,Thriller
23931,1635,878,Science Fiction


In [7]:
#get cast_info from API

# nest_asyncio patches asyncio to allow nested event loops, so the
# asyncio.run() call in run_async_code can be used from inside the notebook.
nest_asyncio.apply()

async def fetch_cast_info(session, api_key, movie_id, cast_data_list, max_retries=5, retry_delay=60):
    """Fetch one movie's credits and append its acting/directing cast rows to ``cast_data_list``.

    Parameters
    ----------
    session : aiohttp.ClientSession
        Open session used to issue the GET request.
    api_key : str
        Token sent in the ``Authorization`` header as a Bearer token.
    movie_id : int
        Id interpolated into the request URL and stored in every row.
    cast_data_list : list
        Shared accumulator; receives one dict per kept cast entry.
    max_retries : int, optional
        Number of additional attempts made after an HTTP 429 before giving up.
    retry_delay : float, optional
        Seconds to wait before retrying when the 429 response carries no
        numeric ``Retry-After`` header.

    Returns
    -------
    list or None
        The same ``cast_data_list`` object when the response contained a
        ``cast`` array; ``None`` when it did not or the request failed.
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}/credits?language=en-US"

    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Only cast entries whose 'known_for_department' is one of these are kept.
    wanted_departments = ('Acting', 'Directing')

    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        async with session.get(url, headers=headers) as response:
            if response.status == 200:
                data = await response.json()

                casts_list = data.get('cast')
                if casts_list is None:
                    print(f"No cast information found for movie ID {movie_id}")
                    return None

                for cast in casts_list:
                    # .get() so an entry lacking a field is skipped or stored
                    # as None instead of aborting the whole run with KeyError.
                    department = cast.get('known_for_department')
                    if department not in wanted_departments:
                        continue

                    cast_data_list.append({
                        'movie_id': movie_id,
                        'cast_id': cast.get('id'),
                        'cast_name': cast.get('name'),
                        'cast_gender': cast.get('gender'),
                        'cast_role': department,
                        # 'character_id' is taken from the entry's 'cast_id' field.
                        'character_id': cast.get('cast_id'),
                        'character_name': cast.get('character'),
                        'cast_order': cast.get('order')
                    })

                return cast_data_list

            if response.status != 429:
                print(f"Error for movie ID {movie_id}: {response.status}")
                try:
                    error_content = await response.text()
                    print(f"Error content: {error_content}")
                except Exception as e:
                    print(f"Failed to retrieve error content: {e}")
                return None

            # HTTP 429: retry like fetch_movie_info does, preferring the
            # server-provided wait time when it is a plain number of seconds.
            try:
                delay = max(float(response.headers.get("Retry-After")), 0.0)
            except (TypeError, ValueError):
                delay = retry_delay

        # Sleep only after the response context has been closed.
        if attempt + 1 < attempts:
            print(f"Rate limit exceeded. Retrying after a delay for movie ID {movie_id}")
            await asyncio.sleep(delay)

    print(f"Rate limit still exceeded after {max_retries} retries for movie ID {movie_id}")
    return None


async def get_cast_info_async(movie_ids, api_key):
    """Collect cast rows for all movie ids, one request at a time, into a DataFrame.

    Each ``fetch_cast_info`` call is awaited before the next one starts and
    appends its rows to a single shared list; its return value is not used.
    """
    collected_rows = []

    async with aiohttp.ClientSession() as http:
        for mid in movie_ids:
            await fetch_cast_info(http, api_key, mid, collected_rows)

    cast_frame = pd.DataFrame(collected_rows)
    return cast_frame

def run_async_code(api_key, movie_ids, output_path=r'C:\Users\PESH\Desktop\DE Data Sets\TMDB\cast_details.csv'):
    """Run the cast download to completion and save the result as CSV.

    Parameters
    ----------
    api_key : str
        Token forwarded to ``get_cast_info_async``.
    movie_ids : iterable of int
        Movie ids to look up.
    output_path : str or path-like or None, optional
        Destination CSV file. Defaults to the original hard-coded location so
        existing calls behave as before; pass another path to write elsewhere,
        or ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``get_cast_info_async``.
    """
    result_df = asyncio.run(get_cast_info_async(movie_ids, api_key))
    if output_path is not None:
        result_df.to_csv(output_path, index=False)
    return result_df

# call function
# Reuse the api_key and movie_ids bound by earlier cells. Re-assigning
# api_key = "" here would discard whatever token was configured earlier in the
# notebook, and movie_ids = movie_ids was a no-op.
run_async_code(api_key, movie_ids)


Unnamed: 0,movie_id,cast_id,cast_name,cast_gender,cast_role,character_id,character_name,cast_order
0,278,504,Tim Robbins,2,Acting,3,Andy Dufresne,0
1,278,192,Morgan Freeman,2,Acting,4,Ellis Boyd 'Red' Redding,1
2,278,4029,Bob Gunton,2,Acting,5,Warden Norton,2
3,278,6573,William Sadler,2,Acting,7,Heywood,3
4,278,6574,Clancy Brown,2,Acting,8,Captain Byron T. Hadley,4
...,...,...,...,...,...,...,...,...
324316,40016,4350232,Patrick Donahue,0,Acting,51,NCT Software Salesperson,39
324317,40016,63173,Steve McMoy,0,Acting,52,Robert Perkins,40
324318,40016,4350238,Jaime Soria,0,Acting,53,Venture Capitalist,41
324319,40016,4350239,Jerry Madison,0,Acting,54,Venture Capitalist,42


In [8]:
#get production data from API

# nest_asyncio patches asyncio to allow nested event loops, so the
# asyncio.run() call in run_async_code can be used from inside the notebook.
nest_asyncio.apply()

async def fetch_production_info(session, api_key, movie_id, production_company_list, max_retries=5, retry_delay=60):
    """Fetch one movie's production companies and append one row per company.

    Parameters
    ----------
    session : aiohttp.ClientSession
        Open session used to issue the GET request.
    api_key : str
        Token sent in the ``Authorization`` header as a Bearer token.
    movie_id : int
        Id interpolated into the request URL and stored in every row.
    production_company_list : list
        Shared accumulator; receives dicts with the keys ``movie_id``,
        ``production_id``, ``production_name`` and ``origin_country``.
    max_retries : int, optional
        Number of additional attempts made after an HTTP 429 before giving up.
    retry_delay : float, optional
        Seconds to wait before retrying when the 429 response carries no
        numeric ``Retry-After`` header.

    Returns
    -------
    dict or None
        A copy of the last row appended for this movie, or ``None`` when the
        movie has no production companies or the request did not succeed.
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=en-US"

    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        async with session.get(url, headers=headers) as response:
            if response.status == 200:
                data = await response.json()

                # 'production_companies' can be missing or null; treat both as
                # "no companies" instead of failing when iterating over None.
                rows = [
                    {
                        'movie_id': movie_id,
                        'production_id': company['id'],
                        'production_name': company['name'],
                        'origin_country': company['origin_country'],
                    }
                    for company in (data.get('production_companies') or [])
                ]
                production_company_list.extend(rows)

                # Same return contract as before: the last company row, or None.
                return dict(rows[-1]) if rows else None

            if response.status != 429:
                print(f"Error for movie ID {movie_id}: {response.status}")
                # Print the error body as the sibling fetchers do.
                try:
                    error_content = await response.text()
                    print(f"Error content: {error_content}")
                except Exception as e:
                    print(f"Failed to retrieve error content: {e}")
                return None

            # HTTP 429: retry like fetch_movie_info does, preferring the
            # server-provided wait time when it is a plain number of seconds.
            try:
                delay = max(float(response.headers.get("Retry-After")), 0.0)
            except (TypeError, ValueError):
                delay = retry_delay

        # Sleep only after the response context has been closed.
        if attempt + 1 < attempts:
            print(f"Rate limit exceeded. Retrying after a delay for movie ID {movie_id}")
            await asyncio.sleep(delay)

    print(f"Rate limit still exceeded after {max_retries} retries for movie ID {movie_id}")
    return None


async def get_production_info_async(movie_ids, api_key):
    """Collect production-company rows for all movie ids concurrently into a DataFrame.

    Every ``fetch_production_info`` coroutine appends its rows to one shared
    list; the values the coroutines return are not used here.
    """
    collected_rows = []

    async with aiohttp.ClientSession() as http:
        pending = []
        for mid in movie_ids:
            pending.append(fetch_production_info(http, api_key, mid, collected_rows))
        await asyncio.gather(*pending)

    production_frame = pd.DataFrame(collected_rows)
    return production_frame

def run_async_code(api_key, movie_ids, output_path=r'C:\Users\PESH\Desktop\DE Data Sets\TMDB\production_details.csv'):
    """Run the production-company download to completion and save the result as CSV.

    Parameters
    ----------
    api_key : str
        Token forwarded to ``get_production_info_async``.
    movie_ids : iterable of int
        Movie ids to look up.
    output_path : str or path-like or None, optional
        Destination CSV file. Defaults to the original hard-coded location so
        existing calls behave as before; pass another path to write elsewhere,
        or ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``get_production_info_async``.
    """
    result_df = asyncio.run(get_production_info_async(movie_ids, api_key))
    if output_path is not None:
        result_df.to_csv(output_path, index=False)
    return result_df

# call function
# Reuse the api_key and movie_ids bound by earlier cells. Re-assigning
# api_key = "" here would discard whatever token was configured earlier in the
# notebook, and movie_ids = movie_ids was a no-op.
run_async_code(api_key, movie_ids)


Unnamed: 0,movie_id,production_id,production_name,origin_country
0,105,33,Universal Pictures,US
1,105,56,Amblin Entertainment,US
2,207,9195,Touchstone Pictures,US
3,207,10282,Silver Screen Partners IV,US
4,207,184460,A Steven Haft Production,
...,...,...,...,...
32152,13727,2490,LGM Productions,
32153,13727,356,TF1 Films Production,FR
32154,15268,1172,Hyde Park Films,US
32155,15268,25,20th Century Fox,US
