In [None]:
import requests
from bs4 import BeautifulSoup as BS

import pandas as pd
import numpy as np
import re
import pickle

#### Notes:
- The bulk of the data I want is available on the detailed search result view
- I used these criteria for the detailed search:
    - Release date between 1/1/92 and 12/31/22 (plus an additional chunk covering 1991), divided into chunks so no one result set is more than 10,000 movies
    - MPAA Rating of G, PG, PG-13, R, or NC-17
    - Because of the NC-17 movies, adult content is included
    - Types included: Feature, TV Movie, Documentary, Short, and Video
- Not all movies have all the elements I'm trying to pull, so several elements are extracted with an if/else check that substitutes a placeholder when the element is missing

In [None]:
def scrape_page(URL):
    """Scrape one IMDB detailed-search result page and append its data to the global lists.

    The page at ``URL`` is downloaded and parsed, one value per listed title is
    collected for each field, and the per-page lists are appended to the
    module-level accumulators ``imdb_ids``, ``imdb_titles``, ``imdb_mpaas``,
    ``imdb_runtimes``, ``imdb_genres``, ``release_years``, ``imdb_ratings``,
    ``imdb_votes_and_gross`` and ``metascores``. Those lists must already exist.

    Placeholders used when an element is missing from an entry:
    ``''`` for certificate, runtime, genre, rating and metascore, and
    ``['', '']`` for the votes/gross pair. When present, the rating and the
    votes/gross values are lists produced by splitting the element text on
    ``'\\n| '``.

    Parameters
    ----------
    URL : str
        Address of the search result page to scrape.

    Returns
    -------
    None
        Results are delivered only through the module-level lists.

    Raises
    ------
    AttributeError, TypeError
        If the page has no ``div.desc`` element or its text does not match the
        expected "first-last " pattern.
    IndexError
        If the page holds fewer ``lister-item-content`` blocks than the
        description's entry range implies.
    """

    def _find(entry, tag, css_class):
        # Return the first matching descendant of one entry, or None when absent.
        return entry.find(tag, attrs={'class': css_class})

    soup = BS(requests.get(URL).text)

    # Get info about number of entries on current page
    page_desc = soup.find('div', attrs={'class': 'desc'}).text.strip()

    # Pull out pertinent info (raw strings keep \d a regex escape, not a string escape)
    first_entry = int(re.search(r'(\d+,?\d*)-', page_desc)[1].replace(',', ''))
    last_entry = int(re.search(r'-(\d+,?\d*) ', page_desc)[1].replace(',', ''))
    entries_on_page = last_entry - (first_entry - 1)

    # Fields present on every entry are collected page-wide
    ids = [x.get('data-tconst') for x in soup.find_all('div', attrs={'class': 'ribbonize'})]
    titles = [x.find('a').text for x in soup.find_all('h3', attrs={'class': 'lister-item-header'})]
    years = [
        x.text.strip('()')
        for x in soup.find_all('span', attrs={'class': 'lister-item-year text-muted unbold'})
    ]

    # Query the per-entry containers once instead of re-scanning the page for every field
    entries = soup.find_all('div', attrs={'class': 'lister-item-content'})

    mpaas = []
    runtimes = []
    genres = []
    ratings = []
    votes_and_gross = []
    metas = []

    for i in range(entries_on_page):
        entry = entries[i]

        # MPAA rating
        certificate = _find(entry, 'span', 'certificate')
        mpaas.append(certificate.text if certificate is not None else '')

        # Runtime
        runtime = _find(entry, 'span', 'runtime')
        runtimes.append(runtime.text if runtime is not None else '')

        # Genres
        genre = _find(entry, 'span', 'genre')
        genres.append(genre.text.strip() if genre is not None else '')

        # IMDB rating
        rating = _find(entry, 'div', 'inline-block ratings-imdb-rating')
        ratings.append(rating.text.strip().split('\n| ') if rating is not None else '')

        # Number of votes and US+Canada gross revenue
        votes = _find(entry, 'p', 'sort-num_votes-visible')
        votes_and_gross.append(
            votes.text.strip().split('\n| ') if votes is not None else ['', '']
        )

        # Metascore, or a placeholder when the movie has none
        metascore = _find(entry, 'div', 'inline-block ratings-metascore')
        metas.append(metascore.find('span').text.strip() if metascore is not None else '')

    # Append results
    imdb_ids.extend(ids)
    imdb_titles.extend(titles)
    imdb_mpaas.extend(mpaas)
    imdb_runtimes.extend(runtimes)
    imdb_genres.extend(genres)
    release_years.extend(years)
    imdb_ratings.extend(ratings)
    imdb_votes_and_gross.extend(votes_and_gross)
    metascores.extend(metas)

In [None]:
# Bind every accumulator that scrape_page extends to its own fresh empty list.
(
    imdb_ids,
    imdb_titles,
    imdb_mpaas,
    imdb_runtimes,
    imdb_genres,
    release_years,
    imdb_ratings,
    imdb_votes_and_gross,
    metascores,
) = ([] for _ in range(9))

In [None]:
# Base search URL for titles released 1992-01-01 through 2001-12-31, 250 results per page.
URL = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie,documentary,short,video&release_date=1992-01-01,2001-12-31&certificates=US%3AG,US%3APG,US%3APG-13,US%3AR,US%3ANC-17&adult=include&count=250'
soup = BS(requests.get(URL).text)
# The regex expects the description text to contain "of <total> "; the thousands separator is dropped.
total_entries = int(re.search(r'of (\d+,?\d*) ', soup.find('div', attrs={'class' : 'desc'}).text.strip())[1].replace(',', ''))
# Ceiling division, so an exact multiple of 250 does not request an extra page past the end.
pages = (total_entries + 249) // 250

for i in range(pages):
    current_page = i + 1
    print(f'page {current_page} of {pages}')
    start_entry = 250 * i + 1
    # Reuse the base URL so the first request and the paged requests cannot drift apart.
    url = f'{URL}&start={start_entry}'
    scrape_page(url)

In [None]:
# Assemble the scraped columns for the 1992-2001 search into one frame.
first_chunk = pd.DataFrame(
    dict(
        imdb_ids=imdb_ids,
        imdb_titles=imdb_titles,
        imdb_mpaas=imdb_mpaas,
        imdb_runtimes=imdb_runtimes,
        imdb_genres=imdb_genres,
        release_years=release_years,
        imdb_ratings=imdb_ratings,
        imdb_votes_and_gross=imdb_votes_and_gross,
        metascores=metascores,
    )
)

In [None]:
# Show the assembled 1992-2001 frame as this cell's output.
first_chunk

In [None]:
# Persist the 1992-2001 frame to disk as a pickle file.
pd.to_pickle(first_chunk, '../data/imdb1992-2001.pkl')

In [None]:
# Bind every accumulator that scrape_page extends to its own fresh empty list.
(
    imdb_ids,
    imdb_titles,
    imdb_mpaas,
    imdb_runtimes,
    imdb_genres,
    release_years,
    imdb_ratings,
    imdb_votes_and_gross,
    metascores,
) = ([] for _ in range(9))

In [None]:
# Base search URL for titles released 2002-01-01 through 2009-12-31, 250 results per page.
URL = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie,documentary,short,video&release_date=2002-01-01,2009-12-31&certificates=US%3AG,US%3APG,US%3APG-13,US%3AR,US%3ANC-17&adult=include&count=250'
soup = BS(requests.get(URL).text)
# The regex expects the description text to contain "of <total> "; the thousands separator is dropped.
total_entries = int(re.search(r'of (\d+,?\d*) ', soup.find('div', attrs={'class' : 'desc'}).text.strip())[1].replace(',', ''))
# Ceiling division, so an exact multiple of 250 does not request an extra page past the end.
pages = (total_entries + 249) // 250

for i in range(pages):
    current_page = i + 1
    print(f'page {current_page} of {pages}')
    start_entry = 250 * i + 1
    # Reuse the base URL so the first request and the paged requests cannot drift apart.
    url = f'{URL}&start={start_entry}'
    scrape_page(url)

In [None]:
# Assemble the scraped columns for the 2002-2009 search into one frame.
second_chunk = pd.DataFrame(
    dict(
        imdb_ids=imdb_ids,
        imdb_titles=imdb_titles,
        imdb_mpaas=imdb_mpaas,
        imdb_runtimes=imdb_runtimes,
        imdb_genres=imdb_genres,
        release_years=release_years,
        imdb_ratings=imdb_ratings,
        imdb_votes_and_gross=imdb_votes_and_gross,
        metascores=metascores,
    )
)

In [None]:
# Persist the 2002-2009 frame to disk as a pickle file.
pd.to_pickle(second_chunk, '../data/imdb2002-2009.pkl')

In [None]:
# Bind every accumulator that scrape_page extends to its own fresh empty list.
(
    imdb_ids,
    imdb_titles,
    imdb_mpaas,
    imdb_runtimes,
    imdb_genres,
    release_years,
    imdb_ratings,
    imdb_votes_and_gross,
    metascores,
) = ([] for _ in range(9))

In [None]:
# Base search URL for titles released 2010-01-01 through 2018-12-31, 250 results per page.
URL = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie,documentary,short,video&release_date=2010-01-01,2018-12-31&certificates=US%3AG,US%3APG,US%3APG-13,US%3AR,US%3ANC-17&adult=include&count=250'
soup = BS(requests.get(URL).text)
# The regex expects the description text to contain "of <total> "; the thousands separator is dropped.
total_entries = int(re.search(r'of (\d+,?\d*) ', soup.find('div', attrs={'class' : 'desc'}).text.strip())[1].replace(',', ''))
# Ceiling division, so an exact multiple of 250 does not request an extra page past the end.
pages = (total_entries + 249) // 250

for i in range(pages):
    current_page = i + 1
    print(f'page {current_page} of {pages}')
    start_entry = 250 * i + 1
    # Reuse the base URL so the first request and the paged requests cannot drift apart.
    url = f'{URL}&start={start_entry}'
    scrape_page(url)

In [None]:
# Assemble the scraped columns for the 2010-2018 search into one frame.
third_chunk = pd.DataFrame(
    dict(
        imdb_ids=imdb_ids,
        imdb_titles=imdb_titles,
        imdb_mpaas=imdb_mpaas,
        imdb_runtimes=imdb_runtimes,
        imdb_genres=imdb_genres,
        release_years=release_years,
        imdb_ratings=imdb_ratings,
        imdb_votes_and_gross=imdb_votes_and_gross,
        metascores=metascores,
    )
)

In [None]:
# Persist the 2010-2018 frame to disk as a pickle file.
pd.to_pickle(third_chunk, '../data/imdb2010-2018.pkl')

In [None]:
# Bind every accumulator that scrape_page extends to its own fresh empty list.
(
    imdb_ids,
    imdb_titles,
    imdb_mpaas,
    imdb_runtimes,
    imdb_genres,
    release_years,
    imdb_ratings,
    imdb_votes_and_gross,
    metascores,
) = ([] for _ in range(9))

In [None]:
# Base search URL for titles released 2019-01-01 through 2022-12-31, 250 results per page.
URL = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie,documentary,short,video&release_date=2019-01-01,2022-12-31&certificates=US%3AG,US%3APG,US%3APG-13,US%3AR,US%3ANC-17&adult=include&count=250'
soup = BS(requests.get(URL).text)
# The regex expects the description text to contain "of <total> "; the thousands separator is dropped.
total_entries = int(re.search(r'of (\d+,?\d*) ', soup.find('div', attrs={'class' : 'desc'}).text.strip())[1].replace(',', ''))
# Ceiling division, so an exact multiple of 250 does not request an extra page past the end.
pages = (total_entries + 249) // 250

for i in range(pages):
    current_page = i + 1
    print(f'page {current_page} of {pages}')
    start_entry = 250 * i + 1
    # Reuse the base URL so the first request and the paged requests cannot drift apart.
    url = f'{URL}&start={start_entry}'
    scrape_page(url)

In [None]:
# Assemble the scraped columns for the 2019-2022 search into one frame.
fourth_chunk = pd.DataFrame(
    dict(
        imdb_ids=imdb_ids,
        imdb_titles=imdb_titles,
        imdb_mpaas=imdb_mpaas,
        imdb_runtimes=imdb_runtimes,
        imdb_genres=imdb_genres,
        release_years=release_years,
        imdb_ratings=imdb_ratings,
        imdb_votes_and_gross=imdb_votes_and_gross,
        metascores=metascores,
    )
)

In [None]:
# Persist the 2019-2022 frame to disk as a pickle file.
pd.to_pickle(fourth_chunk, '../data/imdb2019-2022.pkl')

In [None]:
# Bind every accumulator that scrape_page extends to its own fresh empty list.
(
    imdb_ids,
    imdb_titles,
    imdb_mpaas,
    imdb_runtimes,
    imdb_genres,
    release_years,
    imdb_ratings,
    imdb_votes_and_gross,
    metascores,
) = ([] for _ in range(9))

In [None]:
# Base search URL for titles released 1991-01-01 through 1991-12-31, 250 results per page.
URL = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie,documentary,short,video&release_date=1991-01-01,1991-12-31&certificates=US%3AG,US%3APG,US%3APG-13,US%3AR,US%3ANC-17&adult=include&count=250'
soup = BS(requests.get(URL).text)
# The regex expects the description text to contain "of <total> "; the thousands separator is dropped.
total_entries = int(re.search(r'of (\d+,?\d*) ', soup.find('div', attrs={'class' : 'desc'}).text.strip())[1].replace(',', ''))
# Ceiling division, so an exact multiple of 250 does not request an extra page past the end.
pages = (total_entries + 249) // 250

for i in range(pages):
    current_page = i + 1
    print(f'page {current_page} of {pages}')
    start_entry = 250 * i + 1
    # Reuse the base URL so the first request and the paged requests cannot drift apart.
    url = f'{URL}&start={start_entry}'
    scrape_page(url)

In [None]:
# Assemble the scraped columns for the 1991 search into one frame.
additional_chunk_a = pd.DataFrame(
    dict(
        imdb_ids=imdb_ids,
        imdb_titles=imdb_titles,
        imdb_mpaas=imdb_mpaas,
        imdb_runtimes=imdb_runtimes,
        imdb_genres=imdb_genres,
        release_years=release_years,
        imdb_ratings=imdb_ratings,
        imdb_votes_and_gross=imdb_votes_and_gross,
        metascores=metascores,
    )
)

In [None]:
# Show the assembled 1991 frame as this cell's output.
additional_chunk_a

In [None]:
# Persist the 1991 frame to disk as a pickle file.
pd.to_pickle(additional_chunk_a, '../data/imdb1991.pkl')