## Scrape the BoxOfficeMojo website for movie information

In [2]:
'''
Scrape BoxOfficeMojo website and create a pickle file with the results.
(Here we do it for the year 2004.)
'''
import urllib2
from bs4 import BeautifulSoup
import re
from pprint import pprint
import pickle
#
def get_movie_value(soup, field_name):
    """
    Look up a single labelled value on a movie page.

    Finds the first text node in `soup` that matches the regular expression
    `field_name` and returns the `.text` of the element that follows it as
    its next sibling.

    Returns None when no matching label is found, or when the label has no
    next sibling.
    """
    label = soup.find(text=re.compile(field_name))
    if label:
        value_node = label.findNextSibling()
        if value_node:
            return value_node.text
    # Either the label or its sibling element is missing.
    return None
#
def get_movie_value_list(soup, field_name):
    """
    Return the list of values for a multi-valued movie attribute.

    Movie attributes such as Director, Actor, Producer, and Writer are
    located by searching for the first element whose href matches the
    regular expression `field_name`, then reading the text fragments of
    the next "td" element.

    A fragment that contains both "(" and ")" is appended (space-separated)
    to the preceding entry instead of becoming an entry of its own --
    presumably these are annotations such as "(screenplay)"; confirm
    against real pages. A parenthesised fragment in first position is kept
    as its own entry because there is nothing to attach it to.

    Returns an empty list when the link or the following cell is missing.
    """
    link = soup.find(href=re.compile(field_name))
    if not link:
        return []
    cell = link.find_next("td")
    if not cell:
        return []

    # Walk the cell's text nodes directly instead of serializing the cell to
    # HTML and splitting on ";": that approach broke up any value containing
    # a semicolon or an HTML entity such as "&amp;".
    values = []
    for fragment in cell.stripped_strings:
        if values and "(" in fragment and ")" in fragment:
            values[-1] = values[-1] + " " + fragment
        else:
            values.append(fragment)
    return values
#
base_url = "http://www.boxofficemojo.com"
# The 2004 yearly chart is spread over seven pages that differ only in the
# "page" query parameter, so build all of the URLs from one template.
chart_url_template = base_url + "/yearly/chart/?page=%d&yr=2004&p=.htm"
page_url_list = [chart_url_template % page_no for page_no in range(1, 8)]
# Keep the individual per-page names available as well.
(page1_url, page2_url, page3_url, page4_url,
 page5_url, page6_url, page7_url) = page_url_list
#

# Make a list of URLs for all the movies listed on the chart pages we are interested in:
url_list = []
# hrefs that match "movies" but must not be collected
# (presumably navigation/promotional links rather than chart entries -- verify).
throwout = ['/movies/?id=fast7.htm', '/movies/', '/movies/?ref=ft']
movie_href = re.compile("movies")  # compiled once, reused for every chart page
for page_url in page_url_list:
    page = urllib2.urlopen(page_url)
    try:
        soup = BeautifulSoup(page)
    finally:
        # The response is fully consumed by the parser; release the connection.
        page.close()
    for link in soup.find_all(href=movie_href):
        movie_url = link.get('href')
        if movie_url not in throwout:
            url_list.append(movie_url)
nmovies = len(url_list)
print('Number of movies found = %i' % nmovies)
#
# Visit each movie page and pick up the information we need.
# Make a dictionary of dictionaries, keyed by the movie's 1-based position
# in url_list (presumably its rank on the yearly chart -- verify):
movie_dict = {}
for rank, movie_url in enumerate(url_list, 1):
    page_url = base_url+movie_url
    page     = urllib2.urlopen(page_url)
    try:
        soup = BeautifulSoup(page)
    finally:
        # The response is fully consumed by the parser; release the connection.
        page.close()

    # The title is the part of the <title> text before the first "(".
    # A page without a <title> tag yields None, like any other missing
    # field, instead of aborting the whole scrape with an AttributeError.
    title_tag    = soup.find("title")
    title_string = title_tag.text.split("(")[0].strip() if title_tag else None

    thismovie = {
        "Title": title_string,
        "Domestic Total": get_movie_value(soup, "Domestic Total"),
        "Run Time": get_movie_value(soup, "Runtime"),
        "MPAA Rating": get_movie_value(soup, "MPAA Rating"),
        "Release Date": get_movie_value(soup, "Release Date"),
        "Genre": get_movie_value(soup, "Genre:"),
        "Production Budget": get_movie_value(soup, "Production Budget"),
        "Distributor": get_movie_value(soup, "Distributor"),
        "Director": get_movie_value_list(soup, "Director"),
        "Writers": get_movie_value_list(soup, "Writer"),
        "Actors": get_movie_value_list(soup, "Actor"),
        "Producers": get_movie_value_list(soup, "Producer"),
        "Composer": get_movie_value_list(soup, "Composer"),
    }
    movie_dict[rank] = thismovie
#
# Save the scraped results. Pickle data is a byte stream, so the file is
# opened in binary mode to keep the output identical across platforms.
with open('2004_all_movie_dict.pkl', 'wb') as f:
    pickle.dump(movie_dict, f)
#pprint(movie_dict)

In [4]:
'''
Test result of previous cell.
'''
# Read back the pickle file written by the scraping cell and display it.
with open('2004_all_movie_dict.pkl', 'r') as pkl_file:
    dict2004 = pickle.load(pkl_file)
pprint(dict2004)