In [3]:
import tablescraper as scrape
import requests
import re
import pandas as pd

In [4]:
# bdays = scrape.scrape_elems(
#     ['https://en.wikipedia.org/wiki/Robert_Downey_Jr.', 'https://en.wikipedia.org/wiki/Paul_Bettany'],
#     '//*[@class="bday"]/text()')

# bdays

## Get wikipedia page by actor_name, then get birthday

In [5]:
def wikipedia_urls(actor_names):
    """
    Returns wikipedia search urls, formatted as a dict like {actor_name:wiki_page_url, ...}
    Warning: will return a 'disambiguation page' on some searches
    """

    actor_names = scrape.to_sequence(actor_names)
    final_urls = {}
    
    for actor_name in actor_names:

        url = 'https://en.wikipedia.org/w/index.php?search={name}&title=Special%3ASearch&go=Go'.format(
            name=re.sub(r"[^A-Za-z+]+", '', actor_name.lower().replace(' ','+')))

        final_urls[actor_name] = url

    return final_urls

actor_revenue = pd.read_pickle('actor_revenue.pickle')
actor_revenue.head()

Unnamed: 0,rank,title,studio,adjusted_gross,unadjusted_gross,release,actor_id,actor_name
0,-,Marvel's The Avengers,BV,668866600,623357910,2012-05-04,paulbettany,Paul Bettany
1,1,Avengers: Age of Ultron,BV,465684200,459005868,2015-05-01,paulbettany,Paul Bettany
2,-,Iron Man 3,BV,424632700,409013994,2013-05-03,paulbettany,Paul Bettany
3,-,Iron Man,Par.,385808100,318412101,2008-05-02,paulbettany,Paul Bettany
4,-,Iron Man 2,Par.,341908200,312433331,2010-05-07,paulbettany,Paul Bettany


In [None]:
# actor_names = actor_revenue.actor_name.unique()

# wiki_urls = wikipedia_urls(actor_names)
# birthdays = scrape.scrape_elems(wiki_urls, '//*[@class="bday"]/text()')

# print len(birthdays)
# print zip(birthdays, actor_names)[:5]

In [None]:
actor_names = actor_revenue.actor_name.unique()

print len(actor_names)

wiki_urls = wikipedia_urls(actor_names)
wiki_data = scrape.scrape_multi_elems(wiki_urls, 
    {
        'bday':'//*[@class="bday"]/text()',
        'text':'normalize-space(string(//div[@id="mw-content-text"]))',
        'categories':'(//*[@id="catlinks"]//a)[position()>1]/text()'
    },
    keep_trees=True)

print len(wiki_data)

In [11]:
# pickle can't handle fancy dictionaries, dill to the rescue
import dill

# with open('wiki_data_dict.dill', 'wb') as f:
#     dill.dump(wiki_data, f)
    
# !ls

######################################

# with open('wiki_data_dict.dill') as f:
#     wiki_data = dill.load(f)


In [12]:
test_name = actor_names[0]

print test_name

print wiki_data[test_name]['bday']
print wiki_data[test_name]['text'][:100]
print wiki_data[test_name]['categories']
print wiki_data[test_name]['tree']

print type(wiki_data[test_name]['tree'])

Paul Bettany
['1971-05-27']
An automated process has detected links on this page on the local or global blacklist. If the links 
['1971 births', 'Male actors from London', 'Alumni of the Drama Centre London', 'British buskers', 'English male film actors', 'English male stage actors', 'British atheists', 'British expatriates in the United States', 'Living people', 'People from Harlesden', 'Royal Shakespeare Company members', '20th-century British male actors', '21st-century British male actors', "People from Shepherd's Bush", 'Tagged pages containing blacklisted links', 'Articles with hCards', 'All articles with unsourced statements', 'Articles with unsourced statements from March 2014', 'Commons category with page title same as on Wikidata', 'Use British English from August 2010', 'Use dmy dates from June 2011', 'Wikipedia articles with VIAF identifiers', 'Wikipedia articles with LCCN identifiers', 'Wikipedia articles with ISNI identifiers', 'Wikipedia articles with GND identifiers', '

AssertionError: invalid Element proxy at 140286489569776

I want to use the Wikipedia categories as features -- they'll be one-hot columns aka dummy variables. 

So first I need to figure out all the unique categories, then for each actor change `categories` to a pandas Series indexed by the category name, with True for 'has category' and False for 'does not have category'.

In [13]:
all_categories = set()

for actor_name, actor_data in wiki_data.iteritems():
    all_categories.update(actor_data['categories'])

all_categories = list(all_categories)
print 'got %i unique categories\n' % len(all_categories)
print '\n'.join(all_categories[:10])

got 4826 unique categories

Deaths from fire in the United States
Detroit Country Day School alumni
21st-century Australian actresses
People from Pound Ridge, New York
Female comics writers
Articles containing potentially dated statements from March 2014
Articles containing potentially dated statements from March 2016
Use Australian English from February 2014
O. Henry Award winners
Scottish nationalists


In [14]:
for actor_name, actor_data in wiki_data.iteritems():
    for category in all_categories:
        wiki_data[actor_name][category] = category in actor_data['categories']

In [20]:
def percent_male_from_text(text):
    male_pronoun_count = 0
    female_pronoun_count = 0
    
    male_pronouns = ['him','his']
    female_pronouns = ['she','her']
    
    text = text.lower()
    
    # search for space-buffered gendered pronouns
    for pron in male_pronouns:
        male_pronoun_count += text.count(' %s ' % pron)

    for pron in female_pronouns:
        female_pronoun_count += text.count(' %s ' % pron)
        
    if male_pronoun_count + female_pronoun_count == 0:
        # return NaN if there are no pronouns found
        return pd.np.nan    
    
    return (1.0*male_pronoun_count)/(male_pronoun_count + female_pronoun_count)

In [26]:
wiki_df = pd.DataFrame.from_dict(wiki_data, orient='index').drop(['tree','categories'], axis=1)

wiki_df['male_txt'] = wiki_df.text.apply(percent_male_from_text)

wiki_df.drop('text', axis=1, inplace=True)

wiki_df.male_txt.head()

50 Cent                 NaN
Aaron Eckhart      0.937500
Aaron Johnson      0.928571
Abbie Cornish      0.038462
Abigail Breslin    0.052632
Name: male_txt, dtype: float64

In [48]:
# if they're "most likely" male, male==True
# allow NaN's to propagate
wiki_df.male_txt = wiki_df.male_txt.round()
wiki_df.male_txt.head()

50 Cent           NaN
Aaron Eckhart       1
Aaron Johnson       1
Abbie Cornish       0
Abigail Breslin     0
Name: male_txt, dtype: float64

I don't necessarily have to do this, but I'll remove the features that are too sparse by setting a threshold. If less than `category_cutoff` people have feature X, that feature will be removed.

In [49]:
category_cutoff = 10

summed_categories = wiki_df.sum()
unimportant_categories = summed_categories[summed_categories < category_cutoff]

print '%i unimportant categories to drop (less than than %i members)' % (len(unimportant_categories), category_cutoff)
unimportant_categories.head()

4487 unimportant categories to drop (less than than 10 members)


Detroit Country Day School alumni                                   True
American punk rock musicians                                        True
People from Pound Ridge, New York                                      2
Female comics writers                                                  2
Articles containing potentially dated statements from March 2014    True
dtype: object

In [50]:
# filter out the unimportand categories

filtered_wiki_df = wiki_df.drop(unimportant_categories.index, axis=1)
filtered_wiki_df.head()

Unnamed: 0,Canadian male voice actors,21st-century Australian actresses,Canadian male television actors,English people of Irish descent,Commanders of the Order of the British Empire,Wikipedia protected pages without expiry,American atheists,1992 births,English male television actors,Best Supporting Actress Academy Award winners,...,Australian television actresses,Articles with dead external links from October 2010,Best Miniseries or Television Movie Actor Golden Globe winners,British Shakespearean actresses,American television directors,American male child actors,Outstanding Performance by a Female Actor in a Miniseries or Television Movie Screen Actors Guild Award winners,Articles with unsourced statements from February 2016,Articles with unsourced statements from April 2016,male_txt
50 Cent,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,
Aaron Eckhart,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1.0
Aaron Johnson,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,1.0
Abbie Cornish,False,True,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,0.0
Abigail Breslin,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0.0


In [66]:
# take bday out of lists and convert to datetime
def first_el_in_list(l):
    if len(l) > 0:
        return l[0]
    else:
        return pd.np.nan

filtered_wiki_df.bday = pd.to_datetime(filtered_wiki_df.bday.apply(first_el_in_list))
filtered_wiki_df.bday.head()

50 Cent                  NaT
Aaron Eckhart     1968-03-12
Aaron Johnson     1990-06-13
Abbie Cornish     1982-08-07
Abigail Breslin   1996-04-14
Name: bday, dtype: datetime64[ns]

Now I want to **join** actor biographic info from `filtered_wiki_df` to actor movie info from `actor_revenue`

In [67]:
actor_full = actor_revenue.set_index('actor_name').join(filtered_wiki_df)
actor_full.head()

Unnamed: 0,rank,title,studio,adjusted_gross,unadjusted_gross,release,actor_id,Canadian male voice actors,21st-century Australian actresses,Canadian male television actors,...,Australian television actresses,Articles with dead external links from October 2010,Best Miniseries or Television Movie Actor Golden Globe winners,British Shakespearean actresses,American television directors,American male child actors,Outstanding Performance by a Female Actor in a Miniseries or Television Movie Screen Actors Guild Award winners,Articles with unsourced statements from February 2016,Articles with unsourced statements from April 2016,male_txt
50 Cent,1,Spy,Fox,116867700,110825712,2015-06-05,50cent,False,False,False,...,False,False,False,False,False,False,False,False,False,
50 Cent,2,Southpaw,Wein.,55265500,52421953,2015-07-24,50cent,False,False,False,...,False,False,False,False,False,False,False,False,False,
50 Cent,3,Righteous Kill,Over.,48566600,40081410,2008-09-12,50cent,False,False,False,...,False,False,False,False,False,False,False,False,False,
50 Cent,4,Get Rich or Die Tryin',Par.,42048300,30985352,2005-11-09,50cent,False,False,False,...,False,False,False,False,False,False,False,False,False,
50 Cent,5,Escape Plan,LG/S,26202200,25135965,2013-10-18,50cent,False,False,False,...,False,False,False,False,False,False,False,False,False,


In [71]:
# finally add a column for actor age on release date
actor_full['age_on_release'] = actor_full.release - actor_full.bday

actor_full[['age_on_release','bday','release','title','adjusted_gross','male_txt']].tail()

Unnamed: 0,age_on_release,bday,release,title,adjusted_gross,male_txt
Zooey Deschanel,9038 days,1980-01-17,2004-10-15,Eulogy,105200,0
Zooey Deschanel,8499 days,1980-01-17,2003-04-25,Manic,100900,0
Zooey Deschanel,9934 days,1980-01-17,2007-03-30,Live Free or Die,17500,0
Zooey Deschanel,10368 days,1980-01-17,2008-06-06,The Go-Getter,14500,0
Zooey Deschanel,10198 days,1980-01-17,2007-12-19,Flakes,1000,0


In [72]:
print '%i rows, %i columns' % actor_full.shape

16416 rows, 349 columns


Enable to pickle:

In [73]:
# actor_full.to_pickle('actor_full.pickle')

# The stuff under this uses old vars...

In [None]:
bday_series = pd.Series(data=pd.to_datetime(birthdays), index=actor_names, name='birthday')
print 'Got %i birthdays out of %i' % (len(bday_series[~bday_series.isnull()]), len(actor_names))
bday_series.head()

In [None]:
actor_rev_bday = actor_revenue.set_index('actor_name').join(bday_series)
actor_rev_bday.tail()

Which birthdays did we miss?

In [None]:
actor_rev_bday[actor_rev_bday.birthday.isnull()].index.unique()

It turns out these names will land you in a wikipedia disambiguation page, eg https://en.wikipedia.org/wiki/Adam_Scott

I *could* re-scrape these names and follow the link that looks like `Adam Scott (actor)` to find the actor's page... but I'll ignore them for now.

# Gender? 

You could calculate the "her"+"she" to "his"+"he" ratio in the Wikipedia page text...

Or you could search the categories for 'male'/'female' ratio.

Or, the existing categories might be enough!