In [7]:
import pandas as pd
import numpy as np
import wikipedia as wiki
import requests
from bs4 import BeautifulSoup
import re

In [8]:
df = pd.read_csv("wiki-actors.csv")

In [9]:
del df['Position']
df.head()

Unnamed: 0,Name,Birth Date
0,Johnny Depp,6/9/63
1,Arnold Schwarzenegger,7/30/47
2,Jim Carrey,1/17/62
3,Emma Watson,4/15/90
4,Daniel Radcliffe,7/23/89


In [10]:
def get_age(Name): 
    url = wiki.page(Name).url
    
    if not url:
        return Name + " url not found"
    
    actor_url = requests.get(url).text
    soup = BeautifulSoup(actor_url,'lxml')
    actor_table = soup.find("table",{"class":"infobox biography vcard"})
    
    if not actor_table:
        actor_table = soup.find("table",{"class":"infobox vcard"})
        if not actor_table:
            return Name + " not in infobox vcard"
    
    # Check is person has died (age will be in the "Died" section)
    p = re.compile("Died")
    match = p.search(actor_table.text)
    
    if match == None:
        age = actor_table.find("span",{"class":"noprint ForceAgeToShow"}).text
        age = age.replace(u'\xa0', u' ').replace(' (age ', "").replace(")","")
    else:       
        # age_match = re.search('(?<=;)\w+', 'aged&nbsp;88')
        # age_match = re.search('(?<= )\w+', 'aged 88')
        # age = age_match.group(0)
        
        td = soup.find(text='Died').findNext('td').text
        aged = td.replace(u'\xa0', u' ').replace(' (age ', "").replace(")","")
        match = re.compile("[0-9][0-9][a-zA-Z]").search(aged).group(0)
        # print(aged)
        age = match[:2]
    
    return int(age)

In [11]:
df['Age'] = df.Name.apply(lambda x : get_age(x))
df.head()

Unnamed: 0,Name,Birth Date,Age
0,Johnny Depp,6/9/63,55
1,Arnold Schwarzenegger,7/30/47,71
2,Jim Carrey,1/17/62,56
3,Emma Watson,4/15/90,28
4,Daniel Radcliffe,7/23/89,29


In [13]:
def get_children_no(Name): 
    url = wiki.page(Name).url
    
    if not url:
        return Name + " url not found"
    
    actor_url = requests.get(url).text
    soup = BeautifulSoup(actor_url,'lxml')
    actor_table = soup.find("table",{"class":"infobox biography vcard"})
    
    if not actor_table:
        actor_table = soup.find("table",{"class":"infobox vcard"})
        if not actor_table:
            return Name + " not in infobox vcard"
    
    p = re.compile("Children") 
    match = p.search(actor_table.text)
            
    if match != None:  # used the same format as 'Died' because inspect revealed similar pattern
        td = soup.find(text='Children').findNext('td').text
        
        if "," in td:
            num = td.split(",")[0]
        elif "(" in td:
            num = td.split("(")[0]
        elif ";" in td:
            num = td.split(";")[0]
        else:
            num = td
        
        try:
            num = int(num)
        except ValueError: # names of children instead of number, html code that bolds number, "undetermined"
            num = 'String'
            
        return num  
    
    else: # actor does not have children
        return 0 

In [12]:
for name in df['Name']:
    print(get_children_no(name))

('Johnny Depp', 2)
('Arnold Schwarzenegger', 5)
('Jim Carrey', 1)
0
0
0
('Tom Cruise', 3)
('Brad Pitt', 6)
('Morgan Freeman', 4)
('Tom Hanks', 4)
('Hugh Jackman', 2)
('Matt Damon', 3)
('Sylvester Stallone', 5)
('Will Smith', 3)
('Clint Eastwood', 'None')
0
('George Clooney', 2)
('Steven Spielberg', 'None')
('Harrison Ford', 5)
('Robert De Niro', 'None')
('Al Pacino', 3)
('Robert Downey Jr.', 3)
('Russell Crowe', 2)
('Liam Neeson', 2)
('Kate Winslet', 3)
('Mark Wahlberg', 4)
('Natalie Portman', 2)
('Pierce Brosnan', 5)
('Sean Connery', 'None')
('Orlando Bloom', 1)
('Dwayne Johnson', 3)
Jackie Chan not in infobox vcard
('Angelina Jolie', 6)
('Adam Sandler', 2)
('Scarlett Johansson', 1)
('Heath Ledger', 1)
('Anne Hathaway', 1)
('Jessica Alba', 3)
('Edward Norton', 1)
0
('Bradley Cooper', 1)
('Will Ferrell', 3)
('Julia Roberts', 3)
('Nicolas Cage', 2)
('Daniel Craig', 2)
('Keanu Reeves', 1)
0
('Halle Berry', 2)
('Bruce Willis', 5)
('Samuel L. Jackson', 1)
('Ben Stiller', 2)
('Tommy Lee Jon

In [14]:
def num_marriages(Name):
    url = wiki.page(Name).url
    
    if not url:
        return Name + " url not found"
    
    actor_url = requests.get(url).text
    soup = BeautifulSoup(actor_url,'lxml')
    actor_table = soup.find("table",{"class":"infobox biography vcard"})
    
    if not actor_table:
        return Name + " not in infobox vcard"
    
    spouses = soup.find(text="Spouse\(s\)").findNext('td').text
    return len(re.findall('m.', spouses))
       

In [15]:
for name in df['Name']:
    print(num_marriages(name))

AttributeError: 'NoneType' object has no attribute 'findNext'

In [None]:
def get_net_worth(Name): 
    url = wiki.page(Name).url
    
    if not url:
        return Name + " url not found"
    
    actor_url = requests.get(url).text
    soup = BeautifulSoup(actor_url,'lxml')
    actor_table = soup.find("table",{"class":"infobox biography vcard"})
    
    if not actor_table:
        return Name + " not in infobox vcard"
    
    p = re.compile("Net worth") 
    match = p.search(actor_table.text)
    if match != None: # used the same format as 'Died' because inspect revealed similar pattern 
        td = soup.find(text='Net worth').findNext('td').text
        return td
    else:
        return "Not in vcard" 