# COGS 108 - Final Project


## Data Cleaning/Pre-processing

The first step in our project is to import our dependencies.

In [1]:
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from collections import deque

Next, we will define a helper function that returns the celebrity's name, birth date, and death date, along with any other infobox fields we would like to extract.

If we pass in any page that isn't the Wikipedia page of a deceased notable celebrity (as per our definition), the function will return `None`.

In [2]:
def get_death_date(soup):
    """Extract infobox data from the page of a deceased person.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single page.

    Returns
    -------
    dict or None
        ``None`` unless the page has a ``table.infobox.vcard`` element that
        contains a name (``span.fn``), a birth date (``span.bday``) and a
        death date (``span`` with class ``dday deathdate``).  Otherwise a
        dict with the keys ``'Name'``, ``'Born'`` and ``'Died'`` plus one
        entry per infobox row that has both a ``th`` and a ``td`` cell,
        keyed by the normalised header text.
    """
    table_selector = soup.select('table.infobox.vcard')
    if not table_selector:
        return None

    table = table_selector[0]

    # Important features we would like to handle specially
    name = table.find('span', class_='fn')
    born = table.find('span', class_='bday')
    died = table.find('span', class_='dday deathdate')

    # Only return if the person has a name, has born data, and died
    if not (name and born and died):
        return None

    data = {'Name': name.text, 'Born': born.text, 'Died': died.text}

    # Extract other features that we would like to keep.
    for row in table.find_all('tr'):
        header = row.find('th')
        datum = row.find('td')
        if not (header and datum):
            continue

        # Normalise the header *before* the duplicate check.  Checking the
        # raw text would let a header such as 'Born\xa0' slip past the check
        # and overwrite the 'Born' value extracted above.
        key = header.text.replace('\xa0', ' ').strip()
        if key not in data:
            data[key] = datum.get_text(' ', strip=True)

    return data

Then, we will traverse the links using Breadth First Search, starting from the 'Lists of celebrities' page and visiting every Wikipedia link reachable within two clicks. Only list and category pages are expanded further.

In [3]:
# Breadth First Search traversal.
#
# Starting from `starting_endpoint`, visit every '/wiki/' link reachable in at
# most `click_limit` clicks.  Every visited page is passed to get_death_date();
# matches are collected and finally concatenated into `df_celebrities`.

wikipedia_url = 'https://en.wikipedia.org'
starting_endpoint = '/wiki/Lists_of_celebrities'
click_limit = 2
# Seconds to wait for a response.  Without a timeout a single stalled
# connection would hang the whole crawl.
request_timeout = 10

# Dictionary of visited links
# 'url' -> number of clicks taken to get to link
links_visited = {starting_endpoint: 0}
# Queue of links to visit (appendleft + pop yields first-in, first-out order)
links_queue = deque()
links_queue.appendleft(starting_endpoint)

# Create a buffer of dataframes
data_buffer = deque()

# Links whose download failed; they are skipped instead of aborting the crawl
failed_links = []

# Debug info
num_visited = 0

# A single session is shared by all requests and closed when the crawl ends.
with requests.Session() as session:
    # Find all links on the page that lead to another wikipedia page.
    while links_queue:
        link = links_queue.pop()
        num_visited += 1

        try:
            page = session.get(wikipedia_url + link, timeout=request_timeout)
            # Do not parse HTTP error pages as if they were articles.
            page.raise_for_status()
        except requests.RequestException:
            failed_links.append(link)
            continue

        soup = BeautifulSoup(page.content, 'html.parser')

        # Debug info
        if links_visited[link] >= click_limit - 1:
            num_links = len(links_visited)
            num_deads = len(data_buffer)
            percentage = format(100.0*(float(num_visited)/num_links), '.2f')
            print((percentage + "% - " + str(len(links_queue)) + " left, " + str(num_deads) + " recorded.").ljust(60),  end='\r', flush=True)
        else:
            print("Loading initial pages...", end='\r', flush=True)

        # See if page is of deceased person - if so, get the data
        results = get_death_date(soup)

        # Append to dataframe buffer if we've found a match
        if results:
            data_buffer.append(pd.DataFrame([results]))

        # Break early on click limit or if page isn't a list
        if links_visited[link] == click_limit or ("List" not in link and "Category:" not in link):
            continue

        content = soup.find('div', id='content')

        # Continue BFS on links in the page
        if content:
            for candidate in content.find_all('a'):
                if 'href' in candidate.attrs:
                    link_in_page = urlparse(candidate.attrs['href']).path

                    # Add to queue if it's a link we haven't seen before.
                    if link_in_page not in links_visited and link_in_page.startswith('/wiki/') and 'File' not in link_in_page:
                        links_visited[link_in_page] = links_visited[link] + 1
                        links_queue.appendleft(link_in_page)

if failed_links:
    # The progress line ends with '\r'; start a fresh line before reporting.
    print()
    print(str(len(failed_links)) + " pages could not be fetched and were skipped.")

# Concatenate all of the buffer into the dataframe that holds our celebrity data.
# ignore_index gives each row a unique label instead of repeating 0, and the
# guard avoids the ValueError pd.concat raises when nothing was collected.
if data_buffer:
    df_celebrities = pd.concat(list(data_buffer), ignore_index=True)
else:
    df_celebrities = pd.DataFrame()

100.00% - 0 left, 2966 recorded.                            

Since this takes such a long time, we will run this once and cache it into a JSON file. All subsequent operations will be run off of the loaded JSON file.

In [5]:
# Cache the crawl results on disk as a JSON array of row records.
df_celebrities.to_json('celebrities.json', orient='records')

Now, we can read the JSON file back in, which lets us skip the crawling process in future runs.

In [7]:
# Load the cached records from disk instead of re-running the crawl.
df_celebrities2 = pd.read_json(
    "celebrities.json",
    orient="records",
)

In [10]:
# Sanity check: print the (rows, columns) of the frame loaded from the cache.
print(df_celebrities2.shape)

(2966, 228)
(2966, 228)
