## Scrape Oscars webpages

In [3]:
"""
Imports and settings
"""

import json
import re
from bs4 import BeautifulSoup
from pprint import pprint
from urllib.request import Request, urlopen, HTTPError

In [1]:
"""
Use BeautifulSoup to scrape the Oscars webpages.  We want to collect
the names of all winning and nominated actors, actresses, and directors
since 1950.

For every ceremony year from 2015 down to 1950 the page is downloaded and
each "view-grouping-header" element whose text mentions Actor, Actress or
Directing is kept.  The value stored for such a header is the flat list of
non-empty text lines of the element that follows it, without the
"Winner"/"Nominees" label lines.  The result is written to
data/oscars.json (json turns the integer year keys into strings).
"""

hdr = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) "
                     "AppleWebKit/537.36 (KHTML, like Gecko) "
                     "Chrome/42.0.2311.90 Safari/537.36"}
base_url = "http://www.oscars.org/oscars/ceremonies/"

all_years = {}
for year in range(2015, 1949, -1):
    page_url = base_url + str(year)
    print(page_url)
    req = Request(page_url, headers=hdr)
    try:
        # The context manager closes the HTTP response once it has been read.
        with urlopen(req) as page:
            content = page.read()
    except HTTPError as e:
        # Report the server's answer and skip this year.  Falling through
        # would either fail on an undefined page or store the previous
        # year's content under this year's key.
        print(e.fp.read())
        continue

    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed.  Left unchanged because the corrected data derived from
    # this file is patched by list position.
    soup = BeautifulSoup(content)

    this_year = {}
    for award_category in soup.find_all(class_="view-grouping-header"):
        header = award_category.text
        if not any(word in header for word in ("Actor", "Actress", "Directing")):
            continue
        award_list = award_category.find_next_sibling()
        this_category = []
        for item in award_list.text.split("\n"):
            clean_item = item.strip()
            # Drop blank lines and the "Winner" / "Nominees" labels.
            if clean_item and 'Winner' not in clean_item and 'Nominees' not in clean_item:
                this_category.append(clean_item)
        this_year[header] = this_category
    all_years[year] = this_year

with open("data/oscars.json", "w") as f:
    json.dump(all_years, f)

In [86]:
"""
Correct mistakes in the oscars dictionary that can be traced back
to mistakes on the oscars webpage.

The dictionary is read from data/oscars.json, patched in place and
written to data/oscars_c1.json.  Years are looked up by their string
form because json.load always returns string keys.
"""

with open("data/oscars.json", "r") as f:
    oscars_dict = json.load(f)

# 1954 contains a "The Actress" entry that has to go
# (presumably not a real award category -- confirm against the page).
del oscars_dict["1954"]["The Actress"]

# Pairs of neighbouring entries that have to trade places:
# (year, category, first index, second index)
for year, category, first, second in [
    ("1985", "Directing", 4, 5),
    ("2012", "Actor in a Leading Role", 6, 7),
]:
    awards = oscars_dict[year][category]
    awards[first], awards[second] = awards[second], awards[first]

# Single entries replaced by the intended name: (year, category, index, name).
# The Directing indices refer to the scraped order, i.e. before the
# pairwise swap performed below.
for year, category, index, name in [
    ("1987", "Directing", 7, 'Roland Joff\xe9'),
    ("1988", "Directing", 9, 'Lasse Hallstr\xf6m'),
    ("1990", "Actor in a Leading Role", 0, 'Daniel Day-Lewis'),
    ("2007", "Directing", 5, 'Alejandro G. I\xf1\xe1rritu'),
    ("2008", "Directing", 1, 'Joel Coen, Ethan Coen'),
    ("2011", "Directing", 3, 'Joel Coen, Ethan Coen'),
]:
    oscars_dict[year][category][index] = name

# Finally, we swap the order of the (director,movie) values
# for the "Directing" keys, so as to make them agree with the other keys.
# The lists are modified in place, so nothing has to be stored back.
for this_year in oscars_dict.values():
    awards = this_year["Directing"]
    for ind in range(0, len(awards), 2):
        awards[ind], awards[ind + 1] = awards[ind + 1], awards[ind]

with open("data/oscars_c1.json", "w") as f:
    json.dump(oscars_dict, f)

In [4]:
"""
Reload the corrected file and pretty-print one year as a spot check.
"""

with open("data/oscars_c1.json", mode="r") as f:
    oscars_dict = json.loads(f.read())

# Years are string keys after the JSON round trip.
pprint(oscars_dict["2012"])

{u'Actor in a Leading Role': [u'Jean Dujardin',
                              u'The Artist',
                              u'Gary Oldman',
                              u'Tinker Tailor Soldier Spy',
                              u'Brad Pitt',
                              u'Moneyball',
                              u'Demi\xe1n Bichir',
                              u'A Better Life',
                              u'George Clooney',
                              u'The Descendants'],
 u'Actor in a Supporting Role': [u'Christopher Plummer',
                                 u'Beginners',
                                 u'Kenneth Branagh',
                                 u'My Week with Marilyn',
                                 u'Jonah Hill',
                                 u'Moneyball',
                                 u'Nick Nolte',
                                 u'Warrior',
                                 u'Max von Sydow',
                                 u'Extremely Loud & Incredib

In [8]:
"""
Using the (corrected) oscars dictionary, we now want to make
a new dictionary keyed by Actor/Actress/Director name, and
with values equal to [years won, years nominated without winning].

Every category list alternates person, movie; the first person of a
category (index 0) is counted as the winner, all later ones as nominees.
An entry naming several people separated by commas is credited to each
of them, unless the second part contains "Jr.", in which case the whole
entry is treated as one name.  The result is written to
data/aad_oscars.json.
"""

with open("data/oscars_c1.json", "r") as f:
    oscars_dict = json.load(f)

aad_dict = {}
# JSON keys are strings: walk the years chronologically and store them as
# integers so every year list comes out sorted and numeric.
for year_key in sorted(oscars_dict, key=int):
    year = int(year_key)
    for awards in oscars_dict[year_key].values():
        # Even positions hold the people, odd positions the movies.
        for ind in range(0, len(awards), 2):
            persons = awards[ind].split(",")
            if len(persons) > 1 and "Jr." in persons[1]:
                # "Name, Jr." is a single person, not two.
                persons = [awards[ind]]
            for raw_name in persons:
                person = raw_name.strip()
                # setdefault replaces the Python 2 only dict.has_key() test.
                wins, nominations = aad_dict.setdefault(person, [[], []])
                if ind:
                    nominations.append(year)
                else:
                    wins.append(year)

with open("data/aad_oscars.json", "w") as f:
    json.dump(aad_dict, f)

In [5]:
"""
Reload the per-person Oscars file and list the six people whose two
year lists are longest when taken together.
"""

with open("data/aad_oscars.json", mode="r") as f:
    aad_dict = json.loads(f.read())

# Build [name, [year list, year list]] entries, then order them by the
# combined number of years, largest first.
OrderedList = [[person, oscar] for person, oscar in aad_dict.items()]
OrderedList.sort(key=lambda entry: len(entry[1][0]) + len(entry[1][1]),
                 reverse=True)

for rank, (person, oscar) in enumerate(OrderedList[:6], 1):
    total = len(oscar[0]) + len(oscar[1])
    line = "%i. %s (%i): %s" % (rank, person, total, oscar)
    print(line)

1. Meryl Streep (19): [[1980, 1983, 2012], [1979, 1982, 1984, 1986, 1988, 1989, 1991, 1996, 1999, 2000, 2003, 2007, 2009, 2010, 2014, 2015]]
2. Jack Nicholson (12): [[1976, 1984, 1998], [1970, 1971, 1974, 1975, 1982, 1986, 1988, 1993, 2003]]
3. Paul Newman (9): [[1987], [1959, 1962, 1964, 1968, 1982, 1983, 1995, 2003]]
4. Al Pacino (8): [[1993], [1973, 1974, 1975, 1976, 1980, 1991, 1993]]
5. Peter O'Toole (8): [[], [1963, 1965, 1969, 1970, 1973, 1981, 1983, 2007]]
6. Marlon Brando (8): [[1955, 1973], [1952, 1953, 1954, 1958, 1974, 1990]]
