In [9]:
from bs4 import BeautifulSoup
import requests

In [18]:
# TODO: needs to be passed a list of URLs (the function below currently takes a single link stub)

def _leading_digits(text):
    '''Return the run of digit characters at the start of text (after leading whitespace), or ''.'''
    digits = ''
    for char in text.lstrip():
        if not char.isdigit():
            break
        digits += char
    return digits


def mojo_movie_dict(link, timeout=30):
    '''
    From a BoxOfficeMojo link stub, request the movie html, parse it with
    BeautifulSoup, and return the collected information as a dictionary.

    Parameters
        link    - string appended to 'https://www.boxofficemojo.com/title/tt'
                  to build the page url (e.g. '0295297')
        timeout - seconds passed to requests.get so a stalled connection
                  cannot hang the caller forever (default 30)

    Returned keys
        - mojo_title       : title text before the first '(' , stripped
        - distributor      : first child of the second <span> in the summary block
        - domestic_opening : text of the 'money' element following the
                             'Domestic Opening' label, or None if the page has none
        - rating           : text of the element following the 'MPAA' label,
                             or None if the page has none
        - markets          : number of foreign markets released. NOTE: a str of
                             digits when the page contains a 'Markets' label,
                             an int otherwise (types kept as they always were)
        - IMDB_link        : href of the last link in the summary block with
                             everything from '?' onward removed

    Raises
        requests.RequestException - the request failed, timed out, or the
                                    server answered with an error status
        AttributeError/IndexError - the title, summary block or markets
                                    section is not laid out as expected
    '''

    base_url = 'https://www.boxofficemojo.com/title/tt'
    summary_class = 'a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile'

    #Request HTML and parse; fail on HTTP errors instead of parsing an error page
    response = requests.get(base_url + link, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    #Get title
    title_string = soup.find(class_='a-size-extra-large').text
    mojo_title = title_string.split('(')[0].strip()

    #Summary block holds both the distributor and the IMDB link, so look it up once
    summary = soup.find(class_=summary_class)

    #Get distribution company
    #str() detaches the value from the parse tree; a bare NavigableString
    #would keep the whole soup alive for as long as the dictionary exists
    distributor = str(summary.find_all('span')[1].contents[0])

    #Get domestic opening (not every page has one)
    domestic_opening = None
    opening_label = soup.find(string='Domestic Opening')
    if opening_label is not None:
        opening_value = opening_label.find_next()
        money = opening_value.find(class_='money') if opening_value is not None else None
        if money is not None:
            domestic_opening = money.text

    #Get rating (not every page has one)
    rating = None
    rating_label = soup.find(string='MPAA')
    if rating_label is not None:
        rating_value = rating_label.find_next()
        if rating_value is not None:
            rating = rating_value.text

    #find foreign markets, depending on format
    if soup.find(string='Markets'):
        releases_table = soup.find('table', class_='a-bordered a-horizontal-stripes a-size-base-plus')
        market_text = releases_table.find_all(class_='a-link-normal')[1].text
        #presumably the link text starts with the count (e.g. '30 markets');
        #take every leading digit rather than a fixed two characters so
        #one-digit and three-digit counts are not mangled
        foreign_markets = _leading_digits(market_text) or market_text[0:2]
    else:
        market_links = soup.find(class_='a-section mojo-h-scroll').find_all(class_='a-link-normal')
        #presumably one of these links is not a foreign market - TODO confirm
        foreign_markets = len(market_links) - 1

    #find IMDB movie page link and strip off reference parts of URL (everything after ?)
    link_string = summary.find_all('a', class_='a-link-normal')[-1]['href']
    link_string = link_string.split('?')[0].strip()

    #Create movie dictionary and return
    return {
        'mojo_title': mojo_title,
        'distributor': distributor,
        'domestic_opening': domestic_opening,
        'rating': rating,
        'markets': foreign_markets,
        'IMDB_link': link_string,
    }

In [19]:
# Spot-check the scraper on one title; the stub is the part of the title URL that follows 'tt'.
test_link = '0295297'
mojo_movie_dict(test_link)

{'mojo_title': 'Harry Potter and the Chamber of Secrets',
 'distributor': 'Warner Bros.',
 'domestic_opening': '$88,357,488',
 'rating': 'PG',
 'markets': '30',
 'IMDB_link': 'https://pro.imdb.com/title/tt0295297'}

In [20]:
# Second spot check with a different title stub.
lord_of_rings = '0120737'
mojo_movie_dict(lord_of_rings)

{'mojo_title': 'The Lord of the Rings: The Fellowship of the Ring',
 'distributor': 'New Line Cinema',
 'domestic_opening': '$47,211,490',
 'rating': 'PG-13',
 'markets': '65',
 'IMDB_link': 'https://pro.imdb.com/title/tt0120737'}