In [7]:
import pandas as pd
import numpy as np
import wikipedia as wiki
import requests
from bs4 import BeautifulSoup
import re

In [8]:
df = pd.read_csv("wiki-actors.csv")

In [9]:
del df['Position']
df.head()

Unnamed: 0,Name,Birth Date
0,Johnny Depp,6/9/63
1,Arnold Schwarzenegger,7/30/47
2,Jim Carrey,1/17/62
3,Emma Watson,4/15/90
4,Daniel Radcliffe,7/23/89


In [10]:
def get_age(Name): 
    url = wiki.page(Name).url
    
    if not url:
        return Name + " url not found"
    
    actor_url = requests.get(url).text
    soup = BeautifulSoup(actor_url,'lxml')
    actor_table = soup.find("table",{"class":"infobox biography vcard"})
    
    if not actor_table:
        actor_table = soup.find("table",{"class":"infobox vcard"})
        if not actor_table:
            return Name + " not in infobox vcard"
    
    # Check is person has died (age will be in the "Died" section)
    p = re.compile("Died")
    match = p.search(actor_table.text)
    
    if match == None:
        age = actor_table.find("span",{"class":"noprint ForceAgeToShow"}).text
        age = age.replace(u'\xa0', u' ').replace(' (age ', "").replace(")","")
    else:       
        # age_match = re.search('(?<=;)\w+', 'aged&nbsp;88')
        # age_match = re.search('(?<= )\w+', 'aged 88')
        # age = age_match.group(0)
        
        td = soup.find(text='Died').findNext('td').text
        aged = td.replace(u'\xa0', u' ').replace(' (age ', "").replace(")","")
        match = re.compile("[0-9][0-9][a-zA-Z]").search(aged).group(0)
        # print(aged)
        age = match[:2]
    
    return int(age)

In [11]:
df['Age'] = df.Name.apply(lambda x : get_age(x))
df.head(10)

Unnamed: 0,Name,Birth Date,Age
0,Johnny Depp,6/9/63,55
1,Arnold Schwarzenegger,7/30/47,71
2,Jim Carrey,1/17/62,56
3,Emma Watson,4/15/90,28
4,Daniel Radcliffe,7/23/89,29
5,Leonardo DiCaprio,11/11/74,44
6,Tom Cruise,7/3/62,56
7,Brad Pitt,12/18/63,54
8,Morgan Freeman,6/1/37,81
9,Tom Hanks,7/9/56,62


In [13]:
def get_children_no(Name): 
    url = wiki.page(Name).url
    
    if not url:
        return Name + " url not found"
    
    actor_url = requests.get(url).text
    soup = BeautifulSoup(actor_url,'lxml')
    actor_table = soup.find("table",{"class":"infobox biography vcard"})
    
    if not actor_table:
        actor_table = soup.find("table",{"class":"infobox vcard"})
        if not actor_table:
            return Name + " not in infobox vcard"
    
    p = re.compile("Children") 
    match = p.search(actor_table.text)
            
    if match != None:  # used the same format as 'Died' because inspect revealed similar pattern
        td = soup.find(text='Children').findNext('td').text
        
        if "," in td:
            num = td.split(",")[0]
        elif "(" in td:
            num = td.split("(")[0]
        elif ";" in td:
            num = td.split(";")[0]
        else:
            num = td
        
        try:
            num = int(num)
        except ValueError: # names of children instead of number, html code that bolds number, "undetermined"
            num = 'String'
            
        return num  
    
    else: # actor does not have children
        return 0 

In [25]:
for name in df['Name']:
    print(get_children_no(name))

2
5
1
0
0
0
3
6
4
4
2
3
5
3
String
0
2
String
5
String
3
3
2
2
3
4
2
5
String
1
3
Jackie Chan not in infobox vcard
6
2
1
1
1
3
1
0
1
3
3
2
2
1
0
2
5
1
2
2
2
4
2
0
3
0
9
3
2
3
0
2
0
3
2
3
2
2
2
Bruce Lee not in infobox vcard
2
0
5
6
1
0
String
1
4
1
3
11
3
1
3
3
0
0
3
1
4
2
3
1
0
3
String


In [31]:
def num_marriages(Name):
    url = wiki.page(Name).url
    
    if not url:
        return Name + " url not found"
    
    actor_url = requests.get(url).text
    soup = BeautifulSoup(actor_url,'lxml')
    actor_table = soup.find("table",{"class":"infobox biography vcard"})
    
    
    if not actor_table:
        actor_table = soup.find("table",{"class":"infobox vcard"})
    if not actor_table:
        return Name + " not in infobox vcard"

    p = re.compile("Spouse\(s\)") 
    match = p.search(actor_table.text)
    if match != None:
        spouses = soup.find(text="Spouse\(s\)").findAll('td').text
        return len(re.findall('m.', spouses))
       

In [75]:
def residence(Name):
    url = wiki.page(Name).url
    
    if not url:
        return Name + " url not found"
    
    actor_url = requests.get(url).text
    soup = BeautifulSoup(actor_url,'lxml')
    actor_table = soup.find("table",{"class":"infobox biography vcard"})
    
    
    if not actor_table:
        actor_table = soup.find("table",{"class":"infobox vcard"})
    if not actor_table:
        return Name + " not in infobox vcard"

    p = re.compile("Residence") 
    match = p.search(actor_table.text)
    if match != None:
        spouses = soup.find(text="Residence").findNext('td').text
        return spouses
        

In [76]:
for name in df['Name']:
    print(residence(name))

#    url = wiki.page("Johnny Depp").url
 #   actor_url = requests.get(url).text
  #  soup = BeautifulSoup(actor_url,'lxml')
   # actor_table = soup.find("table",{"class":"infobox biography vcard"})
    #print(soup)
    #p = re.compile("Spouse\(s\)")
   # match = p.search(actor_table.text)
    #print(match)
    #print(actor_table)
   # spouses = soup.search(text="m.")
    #print(spouses)
    
#for name in df['Name']:
 #   print(num_marriages(name))

Los Angeles, California, U.S.
Los Angeles, California, U.S.
Toronto, Ontario, Canada
None
Manhattan, New York, United States
None
Los Angeles, California, U.S.
None
None
Los Angeles, California, U.S.
None
Los Angeles, California, U.S.
Beverly Hills, California
Los Angeles, California, U.S.
None
None
Sonning Eye, Oxfordshire, England[1][2]
None
None
Gardiner, New York, U.S.
None
Malibu, California, U.S.[1]
None
New York City, New York, U.S.[1]Millbrook, New York, U.S.[1]
West Wittering, West Sussex, England
Los Angeles, California, U.S.
Los Angeles, California, U.S.
Malibu, California, U.S.
None

Los Angeles, California, U.S.
London, England

Southwest Ranches, Florida, U.S.
Jackie Chan not in infobox vcard
None
None
None
None
Manhattan, New York City
Los Angeles, California, U.S.
None
None
New York City, U.S.
None
None
Las Vegas, Nevada
Primrose Hill, London, EnglandTribeca, New York City, U.S.
Hollywood Hills, California, U.S.
None
Los Angeles, California, U.S.
Los Angeles, California

In [None]:
def get_net_worth(Name): 
    url = wiki.page(Name).url
    
    if not url:
        return Name + " url not found"
    
    actor_url = requests.get(url).text
    soup = BeautifulSoup(actor_url,'lxml')
    actor_table = soup.find("table",{"class":"infobox biography vcard"})
    
    if not actor_table:
        return Name + " not in infobox vcard"
    
    p = re.compile("Net worth") 
    match = p.search(actor_table.text)
    if match != None: # used the same format as 'Died' because inspect revealed similar pattern 
        td = soup.find(text='Net worth').findNext('td').text
        return td
    else:
        return "Not in vcard" 