In [14]:
import scrapy
from scrapy.crawler import CrawlerProcess
import re

# Accumulator filled by the actor spider: parallel lists of IMDb IDs and names.
name_id_list = {"imdb_ids": [], "names": []}

# Build the list of IMDb list-page URLs to crawl.
# First list: pages 1-8 of the detail view that displays the actor names.
actor_list_urls = [
    f'https://www.imdb.com/list/ls006050174/?sort=list_order,asc&mode=detail&page={i}'
    for i in range(1, 9)
]

# Second list: pages 1-15.
actor_list_urls += [
    f'https://www.imdb.com/list/ls066061932/?sort=list_order,asc&mode=detail&page={i}'
    for i in range(1, 16)
]

# Create the Spider class
class SCRAPEactors(scrapy.Spider):
    """Spider that collects actor IMDb IDs and names from the IMDb list pages.

    Requests every URL in ``actor_list_urls`` and appends what it finds to the
    module-level ``name_id_list`` dictionary (keys ``"imdb_ids"`` and ``"names"``).
    """

    name = 'actorscraper'

    def start_requests(self):
        """Yield one request per list-page URL, each handled by ``parse``."""
        for page_url in actor_list_urls:
            yield scrapy.Request(url=page_url, callback=self.parse)

    def parse(self, response):
        """Extract the IDs and names from the list-item headers of one page."""
        # The header links carry the IMDb ID; drop the "/name/" prefix.
        hrefs = response.xpath('//h3[@class = "lister-item-header"]/a/@href').extract()
        name_id_list["imdb_ids"].extend(re.sub("(/name/)", "", href) for href in hrefs)

        # The header link text is the actor name; trim surrounding spaces/newlines.
        texts = response.xpath('//h3[@class = "lister-item-header"]/a/text()').extract()
        name_id_list["names"].extend(text.strip(" \n") for text in texts)

# process = CrawlerProcess()
# process.crawl(SCRAPEactors)
# process.start()


In [15]:
# Check the length of the scraped list (as of 06/06/2021 the total should be 2226).
print(f"There are {len(name_id_list['names'])} actors in the scraped list.")

# Preview the first ten IDs and names to check that both were cleaned correctly.
print(name_id_list["imdb_ids"][:10])
print(name_id_list["names"][:10])

# The ID list and the name list must have the same length.
assert len(name_id_list["imdb_ids"]) == len(name_id_list["names"])


There are 0 actors in the scraped list.
[]
[]


In [1]:
import json

def save_data(title, data):
    """Write ``data`` as JSON to the file at path ``title``.

    The file is opened for writing as UTF-8; the JSON is indented by two
    spaces and non-ASCII characters are written as-is rather than escaped.
    """
    with open(title, 'w', encoding='utf-8') as out_file:
        out_file.write(json.dumps(data, ensure_ascii=False, indent=2))
               

In [2]:
import json

def load_data(title):
    """Read the UTF-8 JSON file at path ``title`` and return the parsed object."""
    with open(title, encoding='utf-8') as in_file:
        raw_text = in_file.read()
        return json.loads(raw_text)
        

In [18]:
# Save the actor name / IMDb ID dictionary as JSON ("name_id_list.json").
save_data("name_id_list.json", name_id_list) 

In [3]:
### Define the TMDB and OMDB data retrieval functions ### 

import urllib 
import requests
import os
import json

# API keys are read from the environment so that no credential is stored in the
# notebook. The request helpers in this cell read os.environ directly; these
# module-level names are kept for backward compatibility and are None when the
# corresponding variable is not set.
tmdb_api_key = os.environ.get('TMDB_API_KEY')
omdb_apikey = os.environ.get('OMDB_API_KEY')

def find_actor(imdb_actor_id):
    """Look up an actor in TMDb by IMDb ID and return the matching person records.

    Calls the TMDb ``/find/{id}`` endpoint with ``external_source=imdb_id`` and
    returns the value stored under ``"person_results"`` in the JSON response.
    The API key is read from the ``TMDB_API_KEY`` environment variable.
    """
    url = f'https://api.themoviedb.org/3/find/{imdb_actor_id}'
    query = {"api_key": os.environ['TMDB_API_KEY'], "external_source": "imdb_id"}
    payload = requests.get(url, params=query).json()
    return payload["person_results"]

def get_tmdb_actor_id(actor_info):
    """Return the ``"id"`` field of the first record in ``actor_info``.

    ``actor_info`` must contain at least one record; the result is None when
    the first record has no ``"id"`` key.
    """
    first_record = actor_info[0]
    return first_record.get("id")

def get_actor_gender(actor_info):
    """Return the ``"gender"`` field of the first record in ``actor_info``.

    ``actor_info`` must contain at least one record; the result is None when
    the first record has no ``"gender"`` key.
    """
    first_record = actor_info[0]
    return first_record.get("gender")

def get_actor_birthday(tmdb_actor_id):
    """Fetch an actor's TMDb person record and return its ``"birthday"`` field.

    Calls the TMDb ``/person/{id}`` endpoint using the key in the
    ``TMDB_API_KEY`` environment variable. Returns None when the response has
    no ``"birthday"`` key.
    """
    url = f'https://api.themoviedb.org/3/person/{tmdb_actor_id}'
    query = {"api_key": os.environ['TMDB_API_KEY']}
    person = requests.get(url, params=query).json()
    return person.get("birthday")

def get_movie_credits(tmdb_actor_id):
    """Return the parsed JSON of TMDb's ``/person/{id}/movie_credits`` response.

    The API key is read from the ``TMDB_API_KEY`` environment variable.
    """
    url = f'https://api.themoviedb.org/3/person/{tmdb_actor_id}/movie_credits'
    query = {"api_key": os.environ['TMDB_API_KEY']}
    return requests.get(url, params=query).json()

def get_movie_titles(movie_credits):
    """Return the ``"original_title"`` of every entry in ``movie_credits["cast"]``.

    Entries without an ``"original_title"`` key contribute None, so the result
    always has one item per cast credit.
    """
    cast_credits = movie_credits["cast"]
    movie_titles = [credit.get("original_title") for credit in cast_credits]
    # One title (possibly None) per credit.
    assert len(cast_credits) == len(movie_titles)
    return movie_titles

def get_tmdb_movie_ids_from_credits(movie_credits):
    """Return the ``"id"`` of every entry in ``movie_credits["cast"]``.

    Entries without an ``"id"`` key contribute None, so the result always has
    one item per cast credit.
    """
    cast_credits = movie_credits["cast"]
    tmdb_movie_ids = [credit.get("id") for credit in cast_credits]
    # One ID (possibly None) per credit.
    assert len(cast_credits) == len(tmdb_movie_ids)
    return tmdb_movie_ids

def get_imdb_movie_id(tmdb_movie_id):
    """Return the ``"imdb_id"`` field of TMDb's ``/movie/{id}/external_ids`` response.

    The API key is read from the ``TMDB_API_KEY`` environment variable.
    Returns None when the response has no ``"imdb_id"`` key.
    """
    url = f'https://api.themoviedb.org/3/movie/{tmdb_movie_id}/external_ids'
    query = {"api_key": os.environ["TMDB_API_KEY"]}
    external_ids = requests.get(url, params=query).json()
    return external_ids.get("imdb_id")

def get_release_dates(tmdb_movie_id):
    """Return the ``"results"`` field of TMDb's ``/movie/{id}/release_dates`` response.

    The API key is read from the ``TMDB_API_KEY`` environment variable.
    Returns None when the response has no ``"results"`` key.
    """
    url = f'https://api.themoviedb.org/3/movie/{tmdb_movie_id}/release_dates'
    query = {"api_key": os.environ["TMDB_API_KEY"]}
    payload = requests.get(url, params=query).json()
    return payload.get("results")

def get_cast_and_crew(tmdb_movie_id):
    """Return ``(cast, crew)`` from TMDb's ``/movie/{id}/credits`` response.

    The API key is read from the ``TMDB_API_KEY`` environment variable.
    Either element is None when the response lacks the corresponding key.
    """
    url = f'https://api.themoviedb.org/3/movie/{tmdb_movie_id}/credits'
    query = {"api_key": os.environ["TMDB_API_KEY"]}
    credits_json = requests.get(url, params=query).json()
    return (credits_json.get("cast"), credits_json.get("crew"))

def get_movie_budget(tmdb_movie_id):
    """Return the ``"budget"`` field of TMDb's ``/movie/{id}`` response.

    The API key is read from the ``TMDB_API_KEY`` environment variable.
    Returns None when the response has no ``"budget"`` key.
    """
    url = f'https://api.themoviedb.org/3/movie/{tmdb_movie_id}'
    query = {"api_key": os.environ["TMDB_API_KEY"]}
    movie_details = requests.get(url, params=query).json()
    return movie_details.get("budget")

def get_movie_keywords(tmdb_movie_id):
    """Return the ``"keywords"`` field of TMDb's ``/movie/{id}/keywords`` response.

    The API key is read from the ``TMDB_API_KEY`` environment variable.
    Returns None when the response has no ``"keywords"`` key.
    """
    url = f'https://api.themoviedb.org/3/movie/{tmdb_movie_id}/keywords'
    query = {"api_key": os.environ["TMDB_API_KEY"]}
    payload = requests.get(url, params=query).json()
    return payload.get("keywords")

def get_alt_movie_titles(tmdb_movie_id):
    """Return the alternative titles listed by TMDb for a movie.

    Calls ``/movie/{id}/alternative_titles`` using the key in the
    ``TMDB_API_KEY`` environment variable and collects the ``"title"`` of every
    entry under ``"titles"``. Returns an empty list when the response has no
    ``"titles"`` value.
    """
    url = f'https://api.themoviedb.org/3/movie/{tmdb_movie_id}/alternative_titles'
    query = {"api_key": os.environ["TMDB_API_KEY"]}
    title_entries = requests.get(url, params=query).json().get("titles")
    if title_entries is None:
        return []
    return [entry.get("title") for entry in title_entries]

def get_omdb_movie_info_by_id(imdb_movie_id):
    """Get movie information from OMDb's Web API by IMDb ID.

    Sends the ``i`` (IMDb ID), ``type=movie`` and ``plot=full`` parameters and
    returns the parsed JSON response. The API key is read from the
    ``OMDB_API_KEY`` environment variable.
    """
    # HTTPS, so the API key in the query string is not sent in clear text.
    base_url = 'https://www.omdbapi.com/?'
    params = {'apikey': os.environ["OMDB_API_KEY"], 'i': imdb_movie_id, 'type': 'movie', 'plot': 'full'}
    response = requests.get(base_url, params=params)
    movie_info = response.json()
    return movie_info

def get_omdb_movie_info_by_title(title):
    """Get movie information from OMDb's Web API by movie title.

    Sends the ``t`` (title), ``type=movie`` and ``plot=full`` parameters and
    returns the parsed JSON response. The API key is read from the
    ``OMDB_API_KEY`` environment variable.
    """
    # HTTPS, so the API key in the query string is not sent in clear text.
    base_url = 'https://www.omdbapi.com/?'
    params = {'apikey': os.environ["OMDB_API_KEY"], 't': title, 'type': 'movie', 'plot': 'full'}
    response = requests.get(base_url, params=params)
    movie_info = response.json()
    return movie_info

def search_for_movie_get_tmdb_id(title):
    """Search TMDb for a movie and return the TMDb ID of the first exact title match.

    The comparison is case-insensitive on the result's ``"title"`` field.
    The API key is read from the ``TMDB_API_KEY`` environment variable.

    Returns:
        The ``"id"`` of the first matching result, or None when ``title`` is
        not a string, the response has no results, or no result title matches.
    """
    # A missing title (e.g. None, or NaN coming from a DataFrame column) cannot
    # match anything, so skip the request instead of failing on ``.lower()``.
    if not isinstance(title, str):
        return None

    base_url = 'https://api.themoviedb.org/3/search/movie'
    params = {"api_key": os.environ["TMDB_API_KEY"], "query": title}
    response = requests.get(base_url, params=params)
    data = response.json()

    wanted = title.lower()
    for movie in data.get("results") or []:
        candidate = movie.get("title")
        # Results without a string title are skipped rather than crashing the search.
        if isinstance(candidate, str) and candidate.lower() == wanted:
            return movie.get("id")
    return None

In [None]:
### Check to make sure no data is being lost in the sequence of function calls. 

# Fetch the movie credits for one actor (TMDb person ID 2395) as a spot check.
credits = get_movie_credits(2395)

display(len(credits["cast"]))

# Use the helper names that are actually defined in this notebook.
tmdb_ids = get_tmdb_movie_ids_from_credits(credits)

imdb_ids = []
for tmdb_id in tmdb_ids:
    imdb_id = get_imdb_movie_id(tmdb_id)
    imdb_ids.append(imdb_id)

# Should equal the number of cast credits displayed above.
display(len(imdb_ids))

omdb_data = []
for imdb_id in imdb_ids:
    movie_data = get_omdb_movie_info_by_id(imdb_id)
    omdb_data.append(movie_data)

# Should again equal the number of cast credits.
display(len(omdb_data))

In [None]:
### Retrieve actor and movie information from TMDb ###

# Load name_id data
name_id_list = load_data("name_id_list.json")

# Initialise the empty lists to store actor and movie data.
actor_data_list = []
tmdb_movie_data_list = []

# Initialise an extra list to store the actor IDs that returned no data.
actor_not_found_list = []

# TMDb movie IDs already stored, so a movie shared by several actors is fetched
# (four requests) and stored only once.
seen_tmdb_movie_ids = set()

# Query the TMDb database using the defined functions and store the returned data in lists of dictionaries.
for imdb_actor_id in name_id_list["imdb_ids"]:

    # Look the actor up once and reuse the result for the ID and gender.
    actor_info = find_actor(imdb_actor_id)
    if len(actor_info) == 0:
        actor_not_found_list.append(imdb_actor_id)
        continue

    tmdb_actor_id = get_tmdb_actor_id(actor_info)
    gender = get_actor_gender(actor_info)
    birthday = get_actor_birthday(tmdb_actor_id)
    movie_credits = get_movie_credits(tmdb_actor_id)
    movie_titles = get_movie_titles(movie_credits)
    tmdb_movie_ids = get_tmdb_movie_ids_from_credits(movie_credits)
    actor_data = dict(IMDb_ID = imdb_actor_id, TMDb_ID = tmdb_actor_id,
                      Gender = gender, Birthday = birthday,
                      Movie_Credits = movie_titles)
    actor_data_list.append(actor_data)

    for tmdb_movie_id in tmdb_movie_ids:
        # Skip before issuing any request if this movie is already stored.
        if tmdb_movie_id in seen_tmdb_movie_ids:
            continue
        seen_tmdb_movie_ids.add(tmdb_movie_id)

        imdb_movie_id = get_imdb_movie_id(tmdb_movie_id)
        alt_titles = get_alt_movie_titles(tmdb_movie_id)
        release_dates = get_release_dates(tmdb_movie_id)
        movie_keywords = get_movie_keywords(tmdb_movie_id)
        tmdb_movie_data = dict(IMDb_ID = imdb_movie_id, TMDb_ID = tmdb_movie_id,
                               Alternative_Titles = alt_titles, Release_Dates = release_dates,
                               Keywords = movie_keywords)
        tmdb_movie_data_list.append(tmdb_movie_data)


Wall time: 0 ns


In [None]:
# Save the retrieved TMDb data: the actor records, the movie records, and the
# IMDb IDs of the actors for which TMDb returned no record.
save_data("actor_data_list.json", actor_data_list)
save_data("tmdb_movie_data_list.json", tmdb_movie_data_list)
save_data("actor_not_found_list.json", actor_not_found_list)

In [None]:
# Number of actors for which TMDb returned a record (one entry is stored per actor found).
print(len(actor_data_list))

# Check how many searches returned no data.
print(len(actor_not_found_list))
1
### From looking at the list of actors that returned no data and searching for a few of them on IMDb, the likely reason TMDb returned no data is that these actors are low profile and have no major/verified movie credits, i.e. the movie credits they do have are for secondary supporting roles. If my data collection is accurate, all the black movies that low-profile actors had supporting roles in should already be present in the movie data list (as a result of querying by the higher-profile lead actors).

2226
517


In [None]:
### Retrieve additional information from TMDb

# Movie records saved by the TMDb retrieval step.
tmdb_movie_data_list = load_data("tmdb_movie_data_list.json")

budget_data_list = []
cast_crew_data_list = []

# For each movie, fetch its budget and its cast/crew and store them keyed by TMDb ID.
for movie in tmdb_movie_data_list:
    tmdb_movie_id = movie["TMDb_ID"]
    budget = get_movie_budget(tmdb_movie_id)
    cast, crew = get_cast_and_crew(tmdb_movie_id)
    budget_data_list.append({"TMDb_ID": tmdb_movie_id, "Budget": budget})
    cast_crew_data_list.append({"TMDb_ID": tmdb_movie_id, "Cast": cast, "Crew": crew})

In [4]:
### Retrieve additional information from TMDb for list 2.

# Movie records for the second list (this rebinds tmdb_movie_data_list).
tmdb_movie_data_list = load_data("tmdb_movie_data_list_2.json")

budget_data_list_2 = []
cast_crew_data_list_2 = []

# For each movie, fetch its budget and its cast/crew and store them keyed by TMDb ID.
for movie in tmdb_movie_data_list:
    tmdb_movie_id = movie["TMDb_ID"]
    budget = get_movie_budget(tmdb_movie_id)
    cast, crew = get_cast_and_crew(tmdb_movie_id)
    budget_data_list_2.append({"TMDb_ID": tmdb_movie_id, "Budget": budget})
    cast_crew_data_list_2.append({"TMDb_ID": tmdb_movie_id, "Cast": cast, "Crew": crew})

In [None]:
### Save the budget data and the cast and crew data for the first movie list.

save_data("budget_data_list.json", budget_data_list)
save_data("cast_crew_data_list.json", cast_crew_data_list)

In [5]:
### Save the budget data and the cast and crew data for the second movie list.

save_data("budget_data_list_2.json", budget_data_list_2)
save_data("cast_crew_data_list_2.json", cast_crew_data_list_2)

In [None]:
### Retrieve actor and movie information from OMDb ###

# Load the TMDb movie records; each one carries the movie's IMDb ID.
tmdb_movie_data_list = load_data("tmdb_movie_data_list.json")

# OMDb responses retrieved by IMDb ID, one per TMDb movie record.
omdb_movie_data_list = []

# Query the OMDb database by IMDb ID and store each returned response.
for movie_data in tmdb_movie_data_list:
    omdb_movie_data_list.append(get_omdb_movie_info_by_id(movie_data["IMDb_ID"]))

In [22]:
import pandas as pd

# Load actor data list
actor_data_list = load_data("actor_data_list.json")
actor_data = pd.DataFrame(actor_data_list)

# Initialise a second empty list to store movie data retrieved by title.
omdb_movie_data_list_2 = []

# Flatten the per-actor credit lists into one list of titles, in actor order.
# A nested comprehension is linear, whereas summing a Series of lists
# concatenates repeatedly and yields 0 for an empty Series.
all_credit_titles = [
    credit_title
    for credit_titles in actor_data["Movie_Credits"]
    for credit_title in credit_titles
]

# Query the OMDb database using the defined function and store the returned data.
for title in all_credit_titles:
    omdb_movie_info = get_omdb_movie_info_by_title(title)
    omdb_movie_data_list_2.append(omdb_movie_info)

In [None]:
# Save the OMDb data retrieved by IMDb ID.
save_data("omdb_movie_data_list.json", omdb_movie_data_list)

In [23]:
# Save the second OMDb data list (omdb_movie_data_list_2).
save_data("omdb_movie_data_list_2.json", omdb_movie_data_list_2)

In [4]:
import pandas as pd

# Load both OMDb datasets and keep only their "Title" column.
omdb_movie_data_list_1 = pd.DataFrame(load_data("omdb_movie_data_list.json"))
omdb_movie_data_list_2 = pd.DataFrame(load_data("omdb_movie_data_list_2.json"))
title_list_1 = omdb_movie_data_list_1[["Title"]]
title_list_2 = omdb_movie_data_list_2[["Title"]]

# Outer-merge the two title frames and keep the rows that are not in both.
merged_titles = title_list_1.merge(title_list_2, how='outer', indicator=True)
title_list_3 = merged_titles[merged_titles['_merge'] != 'both']

# Rows found only in the second (right-hand) frame, with duplicates dropped and
# the index reset (the old index is kept as a column).
additional_titles = (
    title_list_3[title_list_3["_merge"] == 'right_only']
    .drop_duplicates()
    .reset_index()
)


In [5]:
### Get additional TMDb data for the additional movies.

# Initialise tmdb_movie_data_list_2
tmdb_movie_data_list_2 = []

# Look up each additional title in TMDb and collect its IDs, alternative titles,
# release dates and keywords.
for title in additional_titles["Title"]:
    tmdb_movie_id = search_for_movie_get_tmdb_id(title)

    if tmdb_movie_id is None:
        # No exact title match: do not send four requests to ".../movie/None/...".
        # The placeholders mirror what the helpers return when the response
        # carries none of the expected keys — presumably the case for an
        # unknown movie ID; confirm against the TMDb error response.
        imdb_movie_id, alt_titles, release_dates, movie_keywords = None, [], None, None
    else:
        imdb_movie_id = get_imdb_movie_id(tmdb_movie_id)
        alt_titles = get_alt_movie_titles(tmdb_movie_id)
        release_dates = get_release_dates(tmdb_movie_id)
        movie_keywords = get_movie_keywords(tmdb_movie_id)

    tmdb_movie_data = dict(IMDb_ID = imdb_movie_id, TMDb_ID = tmdb_movie_id,
                           Alternative_Titles = alt_titles, Release_Dates = release_dates,
                           Keywords = movie_keywords)
    tmdb_movie_data_list_2.append(tmdb_movie_data)

In [6]:
# Save the TMDb data retrieved for the additional titles.
save_data("tmdb_movie_data_list_2.json", tmdb_movie_data_list_2)

In [3]:
### Prepare the url lists to scrape IMDb pages (xpaths updated 22/08/2021)
import scrapy
from scrapy.crawler import CrawlerProcess
import re

# Pick the TMDb data list to use.
tmdb_movie_data_list = load_data("tmdb_movie_data_list_2.json")

# Build one IMDb summary-page URL per movie that has an IMDb ID.
known_imdb_ids = (
    movie_data["IMDb_ID"]
    for movie_data in tmdb_movie_data_list
    if movie_data["IMDb_ID"] is not None
)
imdb_summary_urls = [
    f'https://www.imdb.com/title/{imdb_movie_id}/' for imdb_movie_id in known_imdb_ids
]

### Scrape each movie's IMDb summary page to retrieve the US Opening Weekend Gross and Worldwide Box Office Gross figures

# Accumulator filled by the summary spider, one record per parsed page.
box_office_data_list_2 = []

# Create the Spider class
class SCRAPEmovie_summary(scrapy.Spider):
    """Spider that reads box-office figures from IMDb movie summary pages.

    Requests every URL in ``imdb_summary_urls`` and appends one record per page
    to the module-level ``box_office_data_list_2``.
    """

    name = 'movie_summary_scraper'

    def start_requests(self):
        """Yield one request per summary-page URL, each handled by ``parse``."""
        for page_url in imdb_summary_urls:
            yield scrapy.Request(url=page_url, callback=self.parse)

    def parse(self, response):
        """Extract the IMDb ID, opening-weekend gross and worldwide gross."""

        def first_match(xpath_query):
            # First value matched by the XPath query, or None when nothing matches.
            return response.xpath(xpath_query).extract_first()

        box_office_data_list_2.append({
            "IMDb_ID": first_match('//meta[contains(@property, "pageConst")]/@content'),
            "Opening_Weekend_Gross": first_match('//li[contains(@data-testid, "weekend")]//span[contains(@class, "content-item")]/text()'),
            "Worldwide_Gross": first_match('//li[contains(@data-testid, "worldwide")]//span[contains(@class, "content-item")]/text()'),
        })

# Run the summary spider; its parse method appends the results to box_office_data_list_2.
process = CrawlerProcess()
process.crawl(SCRAPEmovie_summary)
process.start()

2021-08-22 16:56:53 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: scrapybot)
2021-08-22 16:56:53 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.2.0, Python 3.8.4 (tags/v3.8.4:dfa645a, Jul 13 2020, 16:46:45) [MSC v.1924 64 bit (AMD64)], pyOpenSSL 20.0.1 (OpenSSL 1.1.1k  25 Mar 2021), cryptography 3.4.7, Platform Windows-10-10.0.19041-SP0
2021-08-22 16:56:53 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2021-08-22 16:56:53 [scrapy.crawler] INFO: Overridden settings:
{}
2021-08-22 16:56:53 [scrapy.extensions.telnet] INFO: Telnet Password: ac9136eef913b7c9
2021-08-22 16:56:53 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2021-08-22 16:56:54 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddl

In [None]:
### Save the scraped box office data.
# NOTE(review): no cell visible in this notebook assigns box_office_data_list (the
# scraper cell fills box_office_data_list_2) — presumably left from an earlier run
# of that cell with different names; confirm before executing.
save_data("box_office_data_list.json", box_office_data_list)

In [4]:
### Save the scraped box office data list 2 (filled by the movie summary spider).
save_data("box_office_data_list_2.json", box_office_data_list_2)

In [3]:
### Prepare the url lists to scrape IMDb pages
import scrapy
from scrapy.crawler import CrawlerProcess
import re

# Pick the TMDb data list to use.
tmdb_movie_data_list = load_data("tmdb_movie_data_list_2.json")

imdb_soundtrack_urls = []

for movie_data in tmdb_movie_data_list:
    imdb_movie_id = movie_data["IMDb_ID"]
    # Movies without an IMDb ID have no soundtrack page; skipping them avoids
    # requesting ".../title/None/soundtrack" (same guard as the summary URL list).
    if imdb_movie_id is None:
        continue
    soundtrack_url = f'https://www.imdb.com/title/{imdb_movie_id}/soundtrack'
    imdb_soundtrack_urls.append(soundtrack_url)

### Scrape each movie's IMDb soundtrack page to retrieve the soundtrack writer/performer credits

# Accumulator filled by the soundtrack spider, one record per parsed page.
soundtrack_credits_data_list_2 = []

# Create the Spider class
class SCRAPEsoundtrack_creds(scrapy.Spider):
    """Spider that reads soundtrack credits from IMDb soundtrack pages.

    Requests every URL in ``imdb_soundtrack_urls`` and appends one record per
    page to the module-level ``soundtrack_credits_data_list_2``.
    """

    name = 'soundtrack_creds_scraper'

    def start_requests(self):
        """Yield one request per soundtrack-page URL, each handled by ``parse``."""
        for page_url in imdb_soundtrack_urls:
            yield scrapy.Request(url=page_url, callback=self.parse)

    def parse(self, response):
        """Collect ``(id, name)`` pairs for every link in the soundtrack section."""
        imdb_movie_id = response.xpath('//meta[contains(@property, "pageId")]/@content').extract_first()
        # Link targets and link texts of the soundtrack section, paired positionally.
        hrefs = response.xpath('//div[@id = "soundtracks_content"]//a/@href').extract()
        labels = response.xpath('//div[@id = "soundtracks_content"]//a/text()').extract()

        # Drop the "/name/" prefix and surrounding slashes from each link; trim
        # spaces/newlines around each name.
        soundtrack_credits = [
            (re.sub("(/name/)", "", href).strip("/"), label.strip(" \n"))
            for href, label in zip(hrefs, labels)
        ]
        soundtrack_credits_data_list_2.append(
            {"IMDb_ID": imdb_movie_id, "Soundtrack_Credits": soundtrack_credits}
        )
        
# Run the soundtrack spider; its parse method appends the results to soundtrack_credits_data_list_2.
process = CrawlerProcess()
process.crawl(SCRAPEsoundtrack_creds)
process.start()

2021-08-25 08:14:19 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: scrapybot)
2021-08-25 08:14:19 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.2.0, Python 3.8.4 (tags/v3.8.4:dfa645a, Jul 13 2020, 16:46:45) [MSC v.1924 64 bit (AMD64)], pyOpenSSL 20.0.1 (OpenSSL 1.1.1k  25 Mar 2021), cryptography 3.4.7, Platform Windows-10-10.0.19041-SP0
2021-08-25 08:14:19 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2021-08-25 08:14:19 [scrapy.crawler] INFO: Overridden settings:
{}
2021-08-25 08:14:19 [scrapy.extensions.telnet] INFO: Telnet Password: 691424eba3ebd12d
2021-08-25 08:14:19 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2021-08-25 08:14:19 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddl

In [None]:
### Save the scraped soundtrack credits data.
# NOTE(review): no cell visible in this notebook assigns soundtrack_credits_data_list
# (the scraper cell fills soundtrack_credits_data_list_2) — presumably left from an
# earlier run of that cell with different names; confirm before executing.
save_data("soundtrack_credits_data_list.json", soundtrack_credits_data_list)

In [4]:
### Save the scraped soundtrack credits data list 2 (filled by the soundtrack spider).
save_data("soundtrack_credits_data_list_2.json", soundtrack_credits_data_list_2)