In [225]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time, os
from random import randint
import pickle

In [226]:
#read in top movie list dataframe
#NOTE(review): unpickling can execute arbitrary code - only load a pickle file that
#this project produced itself (presumably written by an earlier notebook; confirm).
movie_df = pd.read_pickle('top_movies_df.pickle')

In [103]:
movie_df.head()

Unnamed: 0,URL,Rank,Title,World_Gross,Domestic_Gross,Domestic_Percentage,Foreign_Gross,Foreign_Percentage,Year
4154796,/title/tt4154796/?ref_=bo_cso_table_1,1,Avengers: Endgame,"$2,797,800,564","$858,373,000",30.7%,"$1,939,427,564",69.3%,2019
499549,/title/tt0499549/?ref_=bo_cso_table_2,2,Avatar,"$2,790,439,092","$760,507,625",27.2%,"$2,029,931,467",72.8%,2009
120338,/title/tt0120338/?ref_=bo_cso_table_3,3,Titanic,"$2,195,169,869","$659,363,944",30%,"$1,535,805,925",70%,1997
2488496,/title/tt2488496/?ref_=bo_cso_table_4,4,Star Wars: Episode VII - The Force Awakens,"$2,068,224,036","$936,662,225",45.3%,"$1,131,561,811",54.7%,2015
4154756,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$2,048,359,754","$678,815,482",33.1%,"$1,369,544,272",66.9%,2018


In [227]:
#turn index into list
#slices of this list are fed one id at a time to mojo_movie_dict, which appends the
#id to the BoxOfficeMojo title base url.
links = movie_df.index.tolist()

In [107]:
links

['4154796',
 '0499549',
 '0120338',
 '2488496',
 '4154756',
 '0369610',
 '6105098',
 '0848228',
 '2820852',
 '4520988',
 '2395427',
 '1825683',
 '1201607',
 '2527336',
 '4881806',
 '2294629',
 '2771200',
 '3606756',
 '4630562',
 '1300854',
 '2293640',
 '3498820',
 '1477834',
 '0167260',
 '6320628',
 '4154664',
 '1399103',
 '1074638',
 '2109248',
 '1345836',
 '7286456',
 '2527338',
 '1979376',
 '0435761',
 '0383574',
 '3748528',
 '6139732',
 '1298650',
 '3469046',
 '0107290',
 '2277860',
 '0120915',
 '0109830',
 '1014759',
 '2948356',
 '0903624',
 '0468569',
 '0241527',
 '0926084',
 '1690953',
 '0110357',
 '3040964',
 '2283362',
 '0449088',
 '1170358',
 '2310332',
 '0167261',
 '0373889',
 '0266543',
 '0417741',
 '0298148',
 '1727824',
 '0330373',
 '0413300',
 '0120737',
 '1080016',
 '2379713',
 '2250912',
 '0295297',
 '1667889',
 '2709768',
 '2975590',
 '7131870',
 '0121766',
 '1951264',
 '3896198',
 '2096673',
 '1270797',
 '3501632',
 '1375666',
 '1055369',
 '1673434',
 '0145487',
 '04

In [228]:
def mojo_movie_dict(url_id, timeout=30):
    '''
    Scrape one BoxOfficeMojo title page (plus its credits page) into a dictionary.

    Parameters
    ----------
    url_id : str
        Link stub appended to 'https://www.boxofficemojo.com/title/tt'
        (e.g. '0433035').
    timeout : float or tuple, optional
        Timeout in seconds handed to requests.get for each of the two requests,
        so a stalled connection cannot hang the scraping loop forever.

    Returns
    -------
    dict
        Keys, in order:
        - id                      : url_id exactly as passed in
        - mojo_title              : page title text before the first '('
        - domestic_opening_gross  : text of the 'money' element after 'Domestic Opening'
        - rating                  : text of the element after 'MPAA'
        - foreign_market_count    : see the note on the two page formats in the body
        - genres                  : text of the element after 'Genres'
        - distributor             : plain str taken from the summary block
        - director                : first data row of the 'principalCrew' table
        - actor_1                 : first data row of the 'principalCast' table
        - actor_2                 : second data row of the 'principalCast' table
        - IMDB_link               : last summary link with everything from '?' removed
        Every field except id, mojo_title and distributor falls back to np.nan
        when the element it is read from is not present on the page.

    Raises
    ------
    requests.HTTPError
        If the title page or the credits page answers with an error status.
    AttributeError, IndexError
        If the title or the distributor element is missing from the title page.
    '''

    base_url = 'https://www.boxofficemojo.com/title/tt'
    url = base_url + url_id

    # Errors that mean "this element is not on the page". Listing them explicitly
    # (instead of a bare except) keeps Ctrl-C / KeyboardInterrupt working during a
    # long scrape and lets genuine bugs surface.
    missing = (AttributeError, IndexError, KeyError, TypeError)

    # Class string shared by the distributor lookup and the IMDB link lookup.
    summary_class = 'a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile'

    #Request HTML and parse
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error response rather than parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    #Get title
    title_string = soup.find(class_='a-size-extra-large').text
    mojo_title = title_string.split('(')[0].strip()

    #Get distribution company
    # .contents[0] is a BeautifulSoup node that keeps a reference to the whole parse
    # tree; convert it to a plain str so the returned dictionary holds only ordinary
    # Python values and can be pickled without dragging the tree along.
    distributor = str(soup.find(class_=summary_class).find_all('span')[1].contents[0])

    #Get domestic opening
    try:
        domestic_opening = soup.find(string='Domestic Opening').find_next().find(class_='money').text
    except missing:
        domestic_opening = np.nan

    #Get rating
    try:
        rating = soup.find(string='MPAA').find_next().text
    except missing:
        rating = np.nan

    #find foreign markets, depending on format
    # NOTE: the first branch yields a two-character slice of link text (a str) while
    # the fallback yields an int count; downstream cleaning has to handle both.
    try:
        if soup.find(string='Markets'):
            markets_table = soup.find('table', class_='a-bordered a-horizontal-stripes a-size-base-plus')
            foreign_markets = markets_table.find_all(class_='a-link-normal')[1].text[0:2]
        else:
            market_links = soup.find(class_='a-section mojo-h-scroll').find_all(class_='a-link-normal')
            foreign_markets = len(market_links) - 1
    except missing:
        foreign_markets = np.nan

    #find genres
    try:
        genres = soup.find(string='Genres').find_next().text
    except missing:
        genres = np.nan

    #find IMDB movie page link and strip off reference parts of URL (everything after ?)
    try:
        link_string = soup.find(class_=summary_class).find_all('a', class_='a-link-normal')[-1]['href']
        link_string = link_string.split('?')[0].strip()
    except missing:
        link_string = np.nan

    #create url to scrape credits page
    credits_url = url + '/credits'

    #Request HTML and parse
    credits_response = requests.get(credits_url, timeout=timeout)
    credits_response.raise_for_status()
    credits_soup = BeautifulSoup(credits_response.text, "lxml")

    # Row index 0 of each credits table is skipped (presumably the header row), so
    # index 1 is the first listed person and index 2 the second.
    director = _credits_row_name(credits_soup, 'principalCrew', 1)
    actor_1 = _credits_row_name(credits_soup, 'principalCast', 1)
    actor_2 = _credits_row_name(credits_soup, 'principalCast', 2)

    #Create movie dictionary and return
    return {
        'id': url_id,
        'mojo_title': mojo_title,
        'domestic_opening_gross': domestic_opening,
        'rating': rating,
        'foreign_market_count': foreign_markets,
        'genres': genres,
        'distributor': distributor,
        'director': director,
        'actor_1': actor_1,
        'actor_2': actor_2,
        'IMDB_link': link_string,
    }


def _credits_row_name(credits_soup, section_id, row_index):
    '''
    Return the first line of text of row `row_index` in the credits table whose
    element id is `section_id`, or np.nan when that table or row is not on the page
    (e.g. a title with fewer than two billed cast members).
    '''
    try:
        row = credits_soup.find(id=section_id).find_all('tr')[row_index]
    except (AttributeError, IndexError):
        return np.nan
    return row.text.split("\n", 1)[0]


In [214]:
#break url list into sections, due to errors with website when scraping as a whole.
first_100 = links[0:100]
len(first_100)

100

In [None]:
second_100 = links[100:200]
len(second_100)

In [134]:
third_100 = links[200:300]
len(third_100)

100

In [141]:
fourth_100 = links[300:400]

In [148]:
fifth_100 = links[400:500]

In [229]:
sixth_100 = links[500:600]


In [230]:
#cleaning out a page that can't be scraped and isn't a domestic production
#would've had to clean it out later anyway
#guarded so that re-running this cell is a no-op instead of raising ValueError
if '4276752' in sixth_100:
    sixth_100.remove('4276752')

In [231]:
len(sixth_100)

99

In [176]:
seventh_100 = links[600:700]
len(seventh_100)

100

In [178]:
eigth_100 = links[700:800]

In [185]:
ninth_100 = links[800:900]
len(ninth_100)

100

In [191]:
tenth_100 = links[900:1000]
len(tenth_100)

100

In [197]:
eleventh_100 = links[1000:1100]
len(eleventh_100)

100

In [203]:
twelfth_100 = links[1100:1200]
len(twelfth_100)

100

In [206]:
#cleaning out a page that can't be scraped.
#guarded so that re-running this cell is a no-op instead of raising ValueError
if '0112715' in twelfth_100:
    twelfth_100.remove('0112715')

In [208]:
len(twelfth_100)

99

In [232]:
#creating a list to store dictionary outputs from scraper function
mojo_dict_list = []

In [233]:
#scrape one batch: call mojo_movie_dict on each id of the current partial list and
#append the dictionary it returns to mojo_dict_list.
for link in sixth_100:
    mojo_dict_list.append(mojo_movie_dict(link))
    
    #random pause of 4-6 seconds (randint includes both ends) to avoid being blocked while scraping
    time.sleep(randint(4,6))

In [234]:
mojo_dict_list

[{'id': '0433035',
  'mojo_title': 'Real Steel',
  'domestic_opening_gross': '$27,319,677',
  'rating': 'PG-13',
  'foreign_market_count': 51,
  'genres': 'Action\n    \n        Drama\n    \n        Sci-Fi\n    \n        Sport',
  'distributor': 'Walt Disney Studios Motion Pictures',
  'director': 'Shawn Levy',
  'actor_1': 'Hugh Jackman',
  'actor_2': 'Evangeline Lilly',
  'IMDB_link': 'https://pro.imdb.com/title/tt0433035'},
 {'id': '0451079',
  'mojo_title': 'Horton Hears a Who!',
  'domestic_opening_gross': '$45,012,998',
  'rating': nan,
  'foreign_market_count': 74,
  'genres': 'Adventure\n    \n        Animation\n    \n        Comedy\n    \n        Family\n    \n        Fantasy',
  'distributor': 'Twentieth Century Fox',
  'director': 'Jimmy Hayward',
  'actor_1': 'Jim Carrey',
  'actor_2': 'Steve Carell',
  'IMDB_link': 'https://pro.imdb.com/title/tt0451079'},
 {'id': '5164214',
  'mojo_title': "Ocean's 8",
  'domestic_opening_gross': '$41,607,378',
  'rating': 'PG-13',
  'fore

In [235]:
#build the batch dataframe from the list of scraped dictionaries, indexed by movie id
mojo_df = pd.DataFrame(mojo_dict_list).set_index('id')

mojo_df.shape

(99, 10)

In [236]:
#increase recursion limit
#NOTE(review): presumably raised so that the to_pickle call in the following cell does
#not hit RecursionError on deeply nested objects held in mojo_df - confirm, and check
#whether the values stored in the frame are plain Python strings/numbers.
import sys
sys.setrecursionlimit(10000)

In [237]:
#pickle dataframe of each partial url batch. 
#unable to pickle and open pickled dictionaries currently due to UTF 800 error, so df it is.
mojo_df.to_pickle('mojo_df_600.pickle')