In [None]:
from bs4 import BeautifulSoup as BSoup
import csv
import re
import requests
import random

## Gathering Data

I'm scraping the data from https://www.olympic.org/ via BeautifulSoup4

In [None]:
# Site root: prepended to the hrefs found on the pages to build complete URLs
url_base = 'https://www.olympic.org'
# Landing page for the 2018 Games; it references all sports (links & image representations)
url_main = '{}/pyeongchang-2018'.format(url_base)

### Sports & Event Data

#### Reference page to list all sports

In [None]:
# Download the Games landing page and parse it into a soup for the cells below.
# A timeout is passed because requests otherwise waits indefinitely on a stalled server.
request_main = requests.get(url_main, timeout=30)
# Fail here with a clear HTTP error instead of parsing an error page and
# hitting an opaque IndexError in a later cell.
request_main.raise_for_status()
# Keep the raw document separately from the parsed soup (better/cleaner practice)
text_main = request_main.text
soup_main = BSoup(text_main, 'html.parser')

In [None]:
# The sports live in the first 'game-results-box' section, inside its first
# <ul> carrying the 'countries' or 'games2018-2' class; every <li class="box">
# in that list is one sport (image plus name/link).
results_box = soup_main.find_all('section', {'class': 'game-results-box'})[0]
sports_section = results_box.find_all('ul', {'class': ['countries', 'games2018-2']})[0]
sports_list = sports_section.find_all('li', {'class': 'box'})


def _sport_entry(anchor):
    """Build the info dictionary for one sport from its <a> tag.

    The name is the stripped link text, the page is the site root followed by
    the tag's href, and the id is the lower-cased name with spaces replaced
    by hyphens.
    """
    label = anchor.text.strip()
    return {
        'id': label.lower().replace(' ', '-'),
        'page': url_base + anchor['href'],
        'name': label,
    }


# List of dictionaries, one per sport
sports_info = [_sport_entry(box.a) for box in sports_list]

In [None]:
# TEST: report how many sports were collected, then dump every field of each one
sport_count = len(sports_info)
print('Number of sports: {}'.format(sport_count))
print('====================')
for sport_record in sports_info:
    # Each (key, value) pair is printed on its own line, separated by a space
    for field in sport_record.items():
        print(*field)
    print('-----------')

#### Reference to events in each sport

In [None]:
# For each sport, download its page and collect every event listed on it.
# Each event is stored as a dictionary with the keys:
#   id, name, sport_id, sex, page
events_info = []

# Maps a word that may appear in the event id to a sex category
# (definitions checked by the original author). Defined once, outside the
# loops, because it never changes.
# NOTE(review): 'gundersen' is mapped to 'men' -- presumably because those
# events carry no explicit sex word in their URL; confirm against the site.
sex_categories = {'mixed':'mixed', 'gundersen':'men', 'man':'men', 'men':'men',
                  'mens':'men', 'women':'women', 'womens':'women', 'ladies':'women'}

for sport in sports_info:
    # Get document to be passed in for soup (better/cleaner practice).
    # Parsed into its own name so the landing-page soup (soup_main) is not
    # overwritten; otherwise re-running the sports cell would parse the
    # wrong page.
    request_sport = requests.get(sport['page'])
    soup_sport = BSoup(request_sport.text, 'html.parser')

    # Find the main section for all events in sports
    main_section = soup_sport.find_all('div', {'class':'main-holder'})[0]
    # Find the event sections on this main page
    event_sections = main_section.find_all('section', {'class':'event-box'})
    # Get the event names & info for each event section
    for event in event_sections:
        name = event.a.text.strip()
        page = '{base}{link}'.format(base=url_base, link=event.a['href'])
        # The ID is the sport id followed by the last path segment of the
        # event's link (everything after the final '/').
        # We trade brevity for ambiguity in the ID naming convention
        event_id = re.search('[^/]+$', event.a['href']).group()
        event_id = '{sport}-{event}'.format(sport=sport['id'], event=event_id)

        # Collect the distinct categories whose word appears in the id.
        # Exactly one category -> that sex. None, or several different ones
        # -> mixed (the default). Comparing categories rather than counting
        # matched words keeps two synonyms of the same sex (e.g. 'men' and
        # 'mens') from being misclassified as mixed.
        id_words = set(event_id.split('-'))
        matched_sexes = {category for word, category in sex_categories.items()
                         if word in id_words}
        event_sex = matched_sexes.pop() if len(matched_sexes) == 1 else 'mixed'

        # Save list of info dictionary for each event
        event_dict = {'id': event_id, 'name': name, 'sport_id':sport['id'], 'sex':event_sex, 'page': page}
        events_info.append(event_dict)

In [None]:
# TEST: report how many events were collected, then dump five randomly drawn
# ones (random.choices samples with replacement, so repeats are possible)
event_count = len(events_info)
print('Number of events: {}'.format(event_count))
print('====================')
sampled_events = random.choices(events_info, k=5)
for event_record in sampled_events:
    # Each (key, value) pair is printed on its own line, separated by a space
    for field in event_record.items():
        print(*field)
    print('-----------')

#### Reference to results for each event

In [None]:
# Scrape the results table of every event.
# results maps an event id to a list of dictionaries (one per table row),
# each keyed by the table's column headers.
results = {}
# Events whose table has a 'team' column; set aside for different processing
team_events = []
# Replace ranking for Gold, Silver, Bronze, and no ranking (-1)
medal_ranks = {'G':1, 'S':2, 'B':3}
for event in events_info:
    # Get document to be passed in for soup (better/cleaner practice)
    request_event = requests.get(event['page'])
    soup_results = BSoup(request_event.text, 'html.parser')
    # Find the main section and table for all events in sports
    try:
        results_section = soup_results.find_all('section', {'class':'table-box'})[0]
    except IndexError:
        print('No results found: {}'.format(event['page']))
        # Skip this event. Falling through would either raise NameError
        # (first event) or reuse the previous event's section and store
        # its rows under this event's id.
        continue
    results_table = results_section.find_all('table')[0]
    # Get the headers for the results
    headers = results_table.find('thead')
    result_headers = [h.text for h in headers.find_all('th')]
    # If team event, save to do different processing
    # NOTE(review): this is an exact, case-sensitive match on 'team', while the
    # rank column is looked up as 'Rank' below -- confirm the site's header text.
    if 'team' in result_headers:
        team_events.append(event)
        continue
    # Get the results
    competitors = results_table.find('tbody')
    # Keep only rows without a class attribute
    # (ignores team events' special tier rows in tbody)
    competitors = competitors.find_all('tr', {'class': None})
    # Each competitor (can be a team) has a result line
    competitors_info = []
    for c in competitors:
        cells = c.find_all('td')
        # Pair each header with its cell and remove extra new lines
        # (was used on website formatting)
        c_dict = {h: cc.text.strip().replace('\n','').replace('\r','') for h,cc in zip(result_headers,cells)}
        # Replace rank as number (integers)
        ranking = c_dict.get('Rank')
        try:
            # Get the integer rank if there is a decimal
            c_dict['Rank'] = int(ranking.split('.')[0])
        # AttributeError: no 'Rank' column (ranking is None).
        # ValueError: not a number -- either a medalist or something else.
        except (AttributeError, ValueError):
            # -1 means no obvious ranking
            c_dict['Rank'] = medal_ranks.get(ranking, -1)
        competitors_info.append(c_dict)
    # Save all event results into one dictionary
    results[event['id']] = competitors_info

In [None]:
# TEST: show the first event id in `results` followed by its list of rows
# Alternatives kept from earlier exploration:
# print(results.keys())
# [print(r) for r in results['alpine-skiing-mens-downhill']]
first_entry = next(iter(results.items()), None)
if first_entry is not None:
    first_event_id, first_event_rows = first_entry
    print(first_event_id)
    print(first_event_rows)

## Save Data into CSVs

### Sports Data

In [None]:
# Create CSV file from list of sports dictionaries.
# newline='' lets the csv module control line endings itself (otherwise
# blank rows appear between records on Windows); the explicit encoding keeps
# the output independent of the platform default.
with open('data/sports.csv', 'w', newline='', encoding='utf-8') as sports_csv:
    writer = csv.writer(sports_csv)

    # Headers: also fix the column order of every row
    headers = ['id', 'name', 'page']
    writer.writerow(headers)

    # One row per sport, using only the header keys (in order)
    writer.writerows([sport[key] for key in headers] for sport in sports_info)

### Events Data

In [None]:
# Create CSV file from list of events dictionaries.
# newline='' lets the csv module control line endings itself (otherwise
# blank rows appear between records on Windows); the explicit encoding keeps
# the output independent of the platform default.
with open('data/events.csv', 'w', newline='', encoding='utf-8') as events_csv:
    writer = csv.writer(events_csv)

    # Headers: also fix the column order of every row
    headers = ['id', 'name', 'sport_id', 'sex', 'page']
    writer.writerow(headers)

    # One row per event, using only the header keys (in order)
    writer.writerows([event[key] for key in headers] for event in events_info)

### Results Data

In [None]:
# Collect the result columns from the scraped rows themselves, in first-seen
# order. This avoids two problems with reusing the scraping loop's leftover
# header list: it only reflects the last page scraped (columns unique to other
# events would be dropped), and inserting 'event' into it in place would add
# another 'event' column every time this cell is re-run.
result_columns = []
seen_columns = set()
for event_results in results.values():
    for result in event_results:
        for key in result:
            if key not in seen_columns:
                seen_columns.add(key)
                result_columns.append(key)

# Create CSV file from the per-event result dictionaries.
# newline='' lets the csv module control line endings itself (otherwise
# blank rows appear between records on Windows); the explicit encoding keeps
# the output independent of the platform default.
with open('data/results.csv', 'w', newline='', encoding='utf-8') as results_csv:
    writer = csv.writer(results_csv)
    # Header w/ event in front
    writer.writerow(['event'] + result_columns)
    # Get each list of result dictionaries assoc. with each event
    for event, event_results in results.items():
        # Iterate over each result of event
        for result in event_results:
            # Event id first, then the columns in order; a column missing
            # from this row is written as an empty field (None)
            writer.writerow([event] + [result.get(key) for key in result_columns])