In [96]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import random

In [97]:
# Fetch one release page (Monsters, Inc. 2020 re-release) to explore its markup.
url = "https://www.boxofficemojo.com/release/rl2270790401/?ref_=bo_rs_table_1"
response = requests.get(url)
page = response.text
# Name the parser explicitly ('lxml', as used for the yearly pages in this
# notebook) so bs4 does not have to guess one and warn about it.
soup = BeautifulSoup(page, 'lxml')

In [98]:
def get_movie_value(soup, field_name):
    """
    Look up a labelled value on a parsed page.

    Finds the first text node matching ``field_name`` and returns the text of
    the element that follows it in the document.

    input: soup (parsed page), field_name (str) compiled as a regular
           expression, so a partial label such as 'Distri' is enough
    returns: (str) text of the element after the label, or None when the
             label or a following element is not found
    """
    # `string=` and `find_next()` are the current spellings of the deprecated
    # `text=` and `findNext()`.
    label = soup.find(string=re.compile(field_name))

    if not label:
        return None

    # this works for most of the values
    next_element = label.find_next()

    return next_element.text if next_element else None

In [99]:
# Get the <title> tag of the page we requested and print its text
title = soup.title
print(title.text)

Monsters, Inc. 2020 Re-release - Box Office Mojo


In [100]:

# Get every table row on the page - "tr" means table row.
# The bare `rows` on the last line displays the whole list as the cell output.
rows = soup.find_all("tr")
rows

[<tr><th class="a-text-left mojo-field-type-date_interval mojo-sort-column mojo-sortable-column a-nowrap"><a class="a-link-normal a-nowrap" href="?sort=date&amp;ref_=bo_rl__resort#table" title="Date"><span class="a-color-state">Date</span><span class="a-letter-space"></span><span class="icon aok-relative"><i class="a-icon a-icon-collapse" role="presentation"></i></span></a></th><th class="a-text-left mojo-field-type-date_interval mojo-sortable-column a-nowrap"><span title="Day Of Week">DOW</span>
 </th><th class="a-text-right mojo-field-type-rank mojo-sortable-column a-nowrap"><a class="a-link-normal a-nowrap" href="?sort=rank&amp;sortDir=asc&amp;ref_=bo_rl__resort#table" title="Rank">Rank<span class="a-letter-space"></span><span class="icon aok-relative"><i class="a-icon a-icon-expand table-sort-desc-placeholder" role="presentation"></i><i class="a-icon a-icon-collapse table-sort-asc-placeholder" role="presentation"></i></span></a></th><th class="a-text-right mojo-field-type-money moj

In [101]:
# The only <h1> element on this page is the movie title
soup.find_all('h1')

[<h1 class="a-size-extra-large">Monsters, Inc.</h1>]

In [102]:
# Switch to the 2020 yearly page; this rebinds url, response, page and soup.
url = "https://www.boxofficemojo.com/year/2020/?ref_=bo_yl_table_2"
response = requests.get(url)
page = response.text

# Parse with the lxml parser
soup = BeautifulSoup(page, 'lxml')

In [103]:
# Rows of the first <table> on the page (find() returns only the first match)
rows = soup.find('table').find_all('tr')

In [104]:
def get_links(url, year):
    """
    Scrape one yearly table page and keep the rows shown in at least 500 theaters.

    input: url (str) of the page to fetch, year (str) appended to each row's
           release date text
    returns: (DataFrame) indexed by movie title with columns 'link_stub',
             'release' and 'theaters'; an empty frame with the same columns
             when no row passes the theater filter
    """
    min_theaters = 500
    columns = ['link_stub', 'release', 'theaters']

    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, 'lxml')
    rows = soup.find('table').find_all('tr')
    curr_dict = {}

    # Skip the first row (presumably the header row - it is never parsed).
    for row in rows[1:]:
        # Look the cells up once per row instead of once per field.
        cells = row.find_all('td')

        try:
            theaters = int(cells[6].text.replace(',', ''))
        except (IndexError, ValueError):
            # Short row or non-numeric count: treat as 0 so the row is dropped.
            theaters = 0

        if theaters < min_theaters:
            continue

        title = cells[1].text
        #Update Release Date for EVERY YEAR
        release_date = cells[8].text + ', ' + year
        link_stub = cells[1].find('a')['href']

        # Keyed by title: a later row with the same title replaces the earlier one.
        curr_dict[title] = [link_stub, release_date, theaters]

    # Renaming the columns of an empty frame would raise, so return early.
    if not curr_dict:
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(curr_dict).T
    df.columns = columns
    return df

In [105]:
def get_movie_dict(link):
    """ 
    Takes Links from existing dataframe and parses data from each page
    input: link url (site-relative stub, appended to the base url)
    returns: (dict) with keys title, desc, distr, opening, budget, mpaa,
             runtime and genres, plus gross_dom / gross_inter / gross_world
             for the money values found in the performance summary table.
             desc, genres and the fields read through get_movie_value are
             None when the page does not have them.
    """
    base_url = "https://www.boxofficemojo.com"
    url = base_url + link

    response = requests.get(url)
    print('Status_code: {}'.format(response.status_code))
    page = response.text
    # Explicit parser, same as get_links, so bs4 does not guess one.
    soup = BeautifulSoup(page, 'lxml')

    #get movie title
    title = soup.find('h1').text

    #get description (not every page is guaranteed to have this paragraph)
    desc_tag = soup.find('p', {'class': 'a-size-medium'})
    desc = desc_tag.text if desc_tag is not None else None

    #get gross income values
    keys = ['gross_dom', 'gross_inter', 'gross_world']
    summary = soup.find(class_='mojo-performance-summary-table')
    moneys = [span.text for span in summary.find_all('span', class_='money')]
    # NOTE(review): values are paired with keys purely by position, so a page
    # with fewer than three money spans may put a value under the wrong key -
    # confirm against the page markup.
    gross = dict(zip(keys, moneys))

    #get movie distributor
    distr = get_movie_value(soup, 'Distri')

    #get opening income
    opening = get_movie_value(soup, 'Open')

    #get budget
    budget = get_movie_value(soup, 'Budget')

    #get ratings
    mpaa = get_movie_value(soup, 'MPA')

    #get runtime
    runtime = get_movie_value(soup, 'Run')

    #get genres: whitespace-separated on the page, stored comma-separated;
    #None (instead of a crash) when the page has no Genres field
    genres_text = get_movie_value(soup, 'Genres')
    genres = ', '.join(genres_text.split()) if genres_text is not None else None

    #compile all the above to dict
    moviedict = {
        'title': title,
        'desc': desc,
        'distr': distr,
        'opening': opening,
        'budget': budget,
        'mpaa': mpaa,
        'runtime': runtime,
        'genres': genres,
    }
    moviedict.update(gross)

    return moviedict

In [106]:
import time

In [175]:
movie_stubs = pd.DataFrame()
def get_stubs(start_year=2000, end_year=2020, delay=2):
    """
    Collect the get_links() frame for every year in [start_year, end_year).

    input: start_year (int, inclusive), end_year (int, exclusive),
           delay (seconds to sleep after each request)
    returns: (DataFrame) all yearly frames concatenated, latest year first;
             also stored in the module-level ``movie_stubs``
    """
    # Without this declaration the assignment below makes `movie_stubs` a
    # local name, and reading it on the first iteration raised
    # UnboundLocalError.
    global movie_stubs

    yearly_frames = []
    for year in range(start_year, end_year):
        url = 'https://www.boxofficemojo.com/year/{}/?grossesOption=calendarGrosses'.format(year)
        yearly_frames.append(get_links(url, str(year)))
        time.sleep(delay)

    # Concatenate once; reversed so the latest year comes first, matching the
    # original "prepend each new year" order.
    if yearly_frames:
        movie_stubs = pd.concat(yearly_frames[::-1])
    else:
        movie_stubs = pd.DataFrame()
    return movie_stubs
#get_stubs()

In [176]:
# Start with an empty DataFrame; presumably filled with per-movie data later - TODO confirm
movie_gross = pd.DataFrame()

In [177]:
# Display the current contents of movie_gross as the cell output
movie_gross