In [None]:
import csv
import random
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as BSoup

## Gathering Data

The data is scraped from https://www.olympic.org/: pages are downloaded with `requests` and parsed with BeautifulSoup4.

In [None]:
# Site root; scraped links are combined with it to form complete urls
url_base = 'https://www.olympic.org'
# Landing page for the 2018 Games; it references every sport
# (links & image representations)
url_main = url_base + '/pyeongchang-2018'

### Sports & Event Data

#### Reference page to list all sports

In [None]:
# Download the main Games page and parse it into a soup.
# requests has no default timeout, so one is set explicitly to keep this cell
# from hanging forever on a stalled connection (30 seconds is an arbitrary,
# generous limit). raise_for_status() turns an HTTP error response into an
# exception instead of letting an error page be parsed as real content.
request_main = requests.get(url_main, timeout=30)
request_main.raise_for_status()
text_main = request_main.text
soup_main = BSoup(text_main, 'html.parser')

In [None]:
# Locate the list of sports on the main page: the first "game-results-box"
# section, then the first <ul> inside it carrying either listed class.
# find() returns the same first match as find_all(...)[0], but yields None
# when nothing matches, which lets us fail with a descriptive message
# instead of a bare IndexError if the page layout changes.
results_box = soup_main.find('section', {'class': 'game-results-box'})
if results_box is None:
    raise ValueError('No <section class="game-results-box"> found on the main page')
sports_section = results_box.find('ul', {'class': ['countries', 'games2018-2']})
if sports_section is None:
    raise ValueError('No sports <ul> found inside the game-results-box section')
# Each <li class="box"> is one sport (image container plus name/link anchor)
sports_list = sports_section.find_all('li', {'class': 'box'})

# List of dictionaries, one per sport
sports_info = []
for item in sports_list:
    sport_name = item.a.text.strip()
    # urljoin resolves the href against the site root, so relative hrefs
    # (with or without a leading slash) and already-absolute hrefs all
    # produce a valid full url; plain concatenation only handled '/path'.
    sport_link = urljoin(url_base, item.a['href'])
    # ID is the lower-cased sport name with spaces replaced by hyphens
    # NOTE(review): intended to match the slug the website uses for the
    # sport's pages -- confirm against the scraped hrefs.
    sport_id = sport_name.lower().replace(' ', '-')
    # Save each sport's info dictionary into the list
    sport_dict = {'id': sport_id, 'page': sport_link, 'name': sport_name}
    sports_info.append(sport_dict)

In [None]:
# TEST: print how many sports were scraped, then every field of each one
sport_count = len(sports_info)
print('Number of sports: {}'.format(sport_count))
print('====================')
for sport in sports_info:
    for key in sport:
        print(key, sport[key])
    print('-----------')

#### Reference to events in each sport

In [None]:
# For each sport, download its page and collect the events listed there.
# events_info ends up as one flat list of event dictionaries across all sports.
events_info = []

for sport in sports_info:
    # Explicit timeout so a stalled connection cannot hang the whole loop
    # (30 seconds is an arbitrary, generous limit); raise_for_status() stops
    # an HTTP error page from being parsed as if it were the sport page.
    request_sport = requests.get(sport['page'], timeout=30)
    request_sport.raise_for_status()
    # Deliberately NOT named soup_main: rebinding that name here would
    # clobber the main-page soup and break a re-run of the sports-list cell.
    soup_sport = BSoup(request_sport.text, 'html.parser')

    # Find the main section holding all events for this sport
    main_section = soup_sport.find('div', {'class': 'main-holder'})
    if main_section is None:
        raise ValueError(
            'No <div class="main-holder"> found on {}'.format(sport['page'])
        )
    # Find the event sections on this sport page
    event_sections = main_section.find_all('section', {'class': 'event-box'})
    # Get the event name & info for each event section
    for event in event_sections:
        href = event.a['href']
        name = event.a.text.strip()
        # urljoin handles relative and already-absolute hrefs alike
        page = urljoin(url_base, href)
        # The event's own slug is the last path segment of its link; the
        # trailing slash (if any) is dropped first so the regex always has
        # a segment to match.
        event_slug = re.search('[^/]+$', href.rstrip('/')).group()
        # Prefix the slug with the sport id so that same-named events in
        # different sports get distinct ids (longer, but unambiguous).
        event_id = '{sport}-{event}'.format(sport=sport['id'], event=event_slug)
        # Save the info dictionary for this event
        event_dict = {'id': event_id, 'name': name, 'sport_id': sport['id'], 'page': page}
        events_info.append(event_dict)

In [None]:
# TEST: print how many events were scraped, then spot-check a few at random
print('Number of events: {}'.format(len(events_info)))
print('====================')
# random.sample draws WITHOUT replacement, so the same event is never shown
# twice (random.choices could repeat one). Capping the sample size at the
# list length keeps this working when fewer than 5 events, or none at all,
# were scraped; random.choices raises IndexError on an empty list.
sample_size = min(5, len(events_info))
for event in random.sample(events_info, k=sample_size):
    for key, value in event.items():
        print(key, value)
    print('-----------')

## Save Data into CSVs

### Sports Data

In [None]:
# Create sports.csv from the list of sports dictionaries, one row per sport.
# newline='' is what the csv module asks of files handed to csv.writer:
# without it, platforms that translate '\n' on write (e.g. Windows) end up
# with a blank line between rows. The explicit encoding keeps the output
# independent of the platform's default locale encoding.
with open('sports.csv', 'w', newline='', encoding='utf-8') as sports_csv:
    writer = csv.writer(sports_csv)

    # Header row; it also fixes the column order used for every data row
    headers = ['id', 'name', 'page']
    writer.writerow(headers)

    # One row per sport, using only the header keys (in header order)
    writer.writerows([sport[key] for key in headers] for sport in sports_info)