By Gholamreza Dar - Fall 2022

## Imports

In [1]:
import os
import csv
import pickle
from collections import defaultdict
from IPython.display import display, HTML

# Third party
from dotenv import load_dotenv
import requests
import pandas as pd

## Load TMDB's API_KEY
located in the `.env` file

In [2]:
# Pull the variables declared in the .env file into the process environment
load_dotenv()

# TMDB API key used by the request helpers; None when the variable is not set
API_KEY = os.environ.get('API_KEY')

## Functions

### Save and Load Utility Functions

In [3]:
def pickling(path, data):
    """Serializes `data` with pickle and writes it to `path`.

    Args:
        path (str): Destination file path. An existing file is overwritten.
        data (object): Any picklable Python object.
    """

    # The context manager closes (and therefore flushes) the file even if
    # pickle.dump raises, instead of relying on garbage collection.
    with open(path, 'wb') as file:
        pickle.dump(data, file)

def unpickling(path):
    """Loads and returns the object stored in the pickle file at `path`.

    Note: pickle.load can execute arbitrary code embedded in the file, so
    only use this on files you created yourself.

    Args:
        path (str): Path of the pickle file to read.

    Returns:
        object: The deserialized Python object.
    """

    # The context manager releases the file handle as soon as loading finishes
    with open(path, 'rb') as file:
        return pickle.load(file)

### API requests functions

In [4]:
def get_movie_id(movie_name, movie_year):
    """Gets the movie_id from TMDB using the movie_name and year

    Args:
        movie_name (str): Movie's name
        movie_year (str): Movie's release year

    Returns:
        int: Movie TMDB id (the first search result)

    Raises:
        requests.HTTPError: If TMDB answers with an error status (e.g. invalid API key).
        IndexError: If the search returns no results for the given name/year.
    """

    # Send every query parameter through `params` so requests URL-encodes them
    # uniformly instead of mixing a hand-built query string with `params`.
    r = requests.get(
        'https://api.themoviedb.org/3/search/movie',
        params={
            'api_key': API_KEY,
            'language': 'en-US',
            'page': 1,
            'include_adult': 'false',
            'query': movie_name,
            'year': movie_year,
        },
        timeout=10,  # seconds; without a timeout a stalled connection blocks forever
    )

    # Surface HTTP failures directly rather than as a KeyError on 'results'
    r.raise_for_status()

    results = r.json().get('results', [])
    if not results:
        # Same exception type the bare `[0]` lookup used to raise, with context added
        raise IndexError(f"No TMDB search result for {movie_name!r} ({movie_year})")

    return results[0]['id']

def get_movie_actors(movie_name, movie_year, limit=20):
    """Looks a movie up on TMDB and returns the names of its first cast members

    Args:
        movie_name (str): Movie's name
        movie_year (str): Movie's release year
        limit (int, optional): Maximum number of cast entries to keep. Defaults to 20.

    Returns:
        str[]: Actor names, in the order the credits endpoint lists them
    """

    # Resolve the TMDB id first; the credits endpoint is keyed by id
    tmdb_id = get_movie_id(movie_name, movie_year)
    response = requests.get(f'https://api.themoviedb.org/3/movie/{tmdb_id}/credits?api_key={API_KEY}&language=en-US&')

    # Keep only the first `limit` cast entries and extract their names
    top_cast = response.json()['cast'][:limit]
    return [member['name'] for member in top_cast]

### Display Functions

In [17]:
def print_actors_map(actors_map, actors_movies_map, limit=20):
    """Prints the data in the console

    One line is printed per actor: rank index, name, total profit and the
    list of movies, ordered by total profit (highest first).

    Args:
        actors_map (dict): Maps actor name -> total profit.
        actors_movies_map (dict): Maps actor name -> list of movie names.
        limit (int, optional): Maximum number of actors to print. Defaults to 20.
    """

    # Sort the actors based on their total profit
    ranked_actors = sorted(actors_map.items(), key=lambda x: x[1], reverse=True)

    # Slicing prints exactly `limit` actors; the previous post-increment
    # `idx > limit` check let one extra actor through.
    # max(..., 0) keeps a negative limit from turning into a "drop from the end" slice.
    for idx, (actor_name, profit) in enumerate(ranked_actors[:max(limit, 0)]):
        print(idx, actor_name, profit, actors_movies_map[actor_name])

def pretty_print(df):
    """Displays `df` as an HTML table with each movie on a separate line.

    Every backslash-n escape sequence in the generated HTML is replaced by a
    <br> tag before the table is handed to IPython for display.
    """

    table_markup = df.to_html()
    table_markup = table_markup.replace("\\n", "<br>")
    return display(HTML(table_markup))

def print_actors_map_pd(actors_map, actors_movies_map, limit=200, movies_limit=100):
    """Displays the data as a HTML table and returns the df for later use.

    Args:
        actors_map (dict): Maps actor name -> total profit.
        actors_movies_map (dict): Maps actor name -> list of movie names.
        limit (int, optional): Maximum number of actors (rows). Defaults to 200.
        movies_limit (int, optional): Maximum number of movies listed per actor. Defaults to 100.

    Returns:
        pandas.DataFrame: One row per actor with the columns
            "Actor Name", "Profit" (formatted like "$1,234") and "Movies"
            (movie names joined by newlines).
    """

    # Sort the actors based on their total profit
    ranked_actors = sorted(actors_map.items(), key=lambda x: x[1], reverse=True)

    # Generate the table. Slicing yields exactly `limit` rows; the previous
    # post-increment `idx > limit` check produced one row too many.
    data = [
        [
            actor_name,
            f"${profit:,}",
            "\n".join(actors_movies_map[actor_name][:movies_limit]),
        ]
        for actor_name, profit in ranked_actors[:max(limit, 0)]
    ]

    # Convert to df for better display
    df = pd.DataFrame(data, columns=["Actor Name", "Profit", "Movies"])
    pretty_print(df)

    return df

## Gather data

### Let's get the actors from the movie Mother!(2017)

In [6]:
# Quick check of the API helpers: this performs live HTTP requests to TMDB
# (one search request and one credits request) and shows the first five names.
actors= get_movie_actors("mother!", "2017")
actors[:5]

['Jennifer Lawrence',
 'Javier Bardem',
 'Ed Harris',
 'Michelle Pfeiffer',
 'Brian Gleeson']

### Load the best selling movies
from the crawled 'top_movies_list.csv' file

In [8]:
# get movies name and year from csv
# Every CSV row is kept as a list of strings, in file order.
with open('top_movies_list.csv', newline='') as file:
    movies_list = list(csv.reader(file))

print(len(movies_list), "Movies Loaded")

1200 Movies Loaded


In [12]:
# Column layout of each row in movies_list (all values are strings):
# movie[0]: id
# movie[1]: movie_name
# movie[2]: movie_year
# movie[3]: movie_profit (prefixed with a '$' sign)
movies_list[0] # 0 means Avatar(2009) as of Fall 2022

['1', 'Avatar', '2009', '$2881837181']

### Load the actors of each movie
ordered according to the movies_list data

In [13]:
# Load the scraped actors_data produced by the multiprocessing script 'gather_data_multiprocess.py'.
# NOTE(review): unpickling relies on pickle.load, which can execute arbitrary code;
# only load pickle files that you generated yourself.
actors_data = unpickling("actors_data.pckl")
actors_data[0] # 0 means Avatar(2009) as of Fall 2022

['Sam Worthington',
 'Zoe Saldaña',
 'Sigourney Weaver',
 'Stephen Lang',
 'Michelle Rodriguez',
 'Giovanni Ribisi',
 'Joel David Moore',
 'CCH Pounder',
 'Wes Studi',
 'Laz Alonso',
 'Dileep Rao',
 'Matt Gerald',
 'Sean Anthony Moran',
 'Jason Whyte',
 'Scott Lawrence',
 'Kelly Kilgour',
 'James Patrick Pitt',
 'Sean Patrick Murphy',
 'Peter Dillon',
 'Kevin Dorman']

## Main Juice!

In [14]:
%%time

# Change this to False to recalculate actors_map and actors_movies_map
is_data_available = True

actors_map = defaultdict(int) # a dictionary like this {'Tom Cruise':10_000_000, ...}
actors_movies_map = defaultdict(list) # a dictionary like this {'Tom Cruise': [Top Gun, Mission Impossible, ...], ...}

if is_data_available:
    # Reuse the results cached on disk by a previous run
    actors_map = unpickling('actors_map.pckl')
    actors_movies_map = unpickling('actors_movies_map.pckl')
else:
    for idx, movie in enumerate(movies_list):
        # Get the movies actors
        movie_actors = actors_data[idx]

        # Parse the movie's sale once per movie instead of once per actor.
        #TODO: some movies have ',' in their name and they are not handled yet!
        try:
            movie_profit = int(movie[3][1:]) # 1: to remove the $ sign
        except (ValueError, IndexError):
            # Malformed or missing profit column: count the movie with 0 profit.
            # Only parsing failures are caught, so KeyboardInterrupt and real
            # bugs are no longer silently swallowed.
            movie_profit = 0

        # Loop through the actors and add the movies sale to them
        for actor in movie_actors:
            actors_map[actor] += movie_profit
            actors_movies_map[actor].append(movie[1])

    # Save the calculated actors_map and actors_movies_map to disk
    pickling("actors_map.pckl", actors_map)
    pickling("actors_movies_map.pckl", actors_movies_map)

CPU times: total: 156 ms
Wall time: 157 ms


In [20]:
# `limit` caps the number of actors shown and `movies_limit` the movies listed per actor.
# Despite its name, `html` holds the pandas DataFrame returned by the function.
html = print_actors_map_pd(actors_map, actors_movies_map, limit=50, movies_limit=5)

Unnamed: 0,Actor Name,Profit,Movies
0,Samuel L. Jackson,"$16,338,817,882",The Avengers Avengers: Age of Ultron Incredibles 2 Spider-Man: Far from Home Captain Marvel
1,Robert Downey Jr.,"$13,952,302,902",Avengers: Endgame Avengers: Infinity War The Avengers Avengers: Age of Ultron Iron Man 3
2,Benedict Cumberbatch,"$13,508,189,548",Avengers: Endgame Avengers: Infinity War Spider-Man: No Way Home The Hobbit: The Battle of the Five Armies The Hobbit: The Desolation of Smaug
3,Scarlett Johansson,"$13,473,169,463",Avengers: Endgame Avengers: Infinity War The Avengers Avengers: Age of Ultron Captain America: Civil War
4,Alan Tudyk,"$12,357,426,928",Frozen II Frozen II Frozen Transformers: Dark of the Moon Rogue One: A Star Wars Story
5,Idris Elba,"$11,767,250,635",Avengers: Infinity War Avengers: Age of Ultron Finding Dory Zootopia The Jungle Book
6,Chris Hemsworth,"$11,741,117,528",Avengers: Endgame Avengers: Infinity War The Avengers Avengers: Age of Ultron Thor: Ragnarok
7,Zoe Saldaña,"$11,434,686,068",Avatar Avengers: Endgame Avengers: Infinity War Guardians of the Galaxy Vol. 2 Guardians of the Galaxy
8,Andy Serkis,"$11,305,428,948",Star Wars: Episode VII - The Force Awakens Black Panther Star Wars: Episode VIII - The Last Jedi The Lord of the Rings: The Return of the King The Lord of the Rings: The Two Towers
9,Chris Evans,"$11,179,294,976",Avengers: Endgame Avengers: Infinity War The Avengers Avengers: Age of Ultron Captain America: Civil War
