In [None]:
from bs4 import BeautifulSoup 
import csv
import re
import requests

## Gathering Data

The data is scraped from https://www.olympic.org/: pages are downloaded with `requests` and parsed with BeautifulSoup4.

In [None]:
# Useful references to directories (all directories end with '/')
# These fragments are concatenated with str.format() in later cells to build
# absolute URLs, so every one of them must keep its trailing '/'.

# Language segment inserted between the results root and a page path
dr_connector = 'en/'
# Image directory, relative to the results root
dr_imgs = 'resOWG2018/img/'
# Root URL that every scraped results page is built from
dr_results = 'https://www.olympic.org/pyeongchang-2018/results/'

### Sports & Events Data

#### Reference page to list of all sports

In [None]:
# Schedule page has reference to all sports in table
url_schedule = 'https://www.olympic.org/pyeongchang-2018/results/en/general/competition-schedule.htm'

# Download the page. Without a timeout requests can wait forever on a stalled
# connection, so cap the wait (seconds).
request_schedule = requests.get(url_schedule, timeout=30)
# Everything below depends on this page: fail loudly on an HTTP error instead
# of parsing an error page as if it were the schedule.
request_schedule.raise_for_status()
text_schedule = request_schedule.text

# Parse the document so the sports table can be searched
soup_schedule = BeautifulSoup(text_schedule, 'html.parser')

In [None]:
# Get the image container and the name/link container (comes in pairs)
sports = soup_schedule.find_all('td', {'class':['disciplinePicture', 'styleLeft']})

# List of dictionaries, one per sport
sports_info = []

# Raw-string patterns (a plain '\w' / '\.' literal is an invalid escape sequence).
# Image link form: ../../resOWG2018/img/sports/CER.png -> group(1) is the sport ID
sport_id_pattern = re.compile(r'(\w+)\.png$')
# Schedule link ends in <sport-name>/daily-schedule.htm
#   group(1) -> page path relative to the language directory
#   group(2) -> sport name (words separated by -)
sport_schedule_pattern = re.compile(r'(([-\w]+)\/daily-schedule\.htm)$')

# Go every other since the cells come as (image, name) pairs for one sport
for img, name in zip(sports[::2], sports[1::2]):

    # Skip the ceremony row (no link) and any cell without an image
    if name.a is None or img.img is None:
        continue

    # Get the image link which has the sports ID
    sport_img_link = img.img['src']
    id_match = sport_id_pattern.search(sport_img_link)
    schedule_match = sport_schedule_pattern.search(name.a['href'])

    # Skip rows whose links do not have the expected form instead of
    # crashing on `None.group(...)`
    if id_match is None or schedule_match is None:
        continue

    sport_id = id_match.group(1)

    # Absolute URL of the sport's image
    sport_img = '{}{}sports/{}.png'.format(dr_results, dr_imgs, sport_id)

    # Absolute URL of the sport's schedule page
    sport_schedule = '{}{}{}'.format(dr_results, dr_connector, schedule_match.group(1))

    # Sport's full name taken from the link (words separated by -)
    sport_name = schedule_match.group(2)

    sport_dict = {'id': sport_id, 'img': sport_img, 'schedule': sport_schedule, 'name': sport_name}
    sports_info.append(sport_dict)

In [None]:
# Sanity check: number of sports found, then every field of every sport
print(len(sports_info))

for sport in sports_info:
    for field, value in sport.items():
        print(field, value)
    # Blank line between sports
    print()

#### Reference page to sports' different events

In [None]:
# For each sport, get the different events
# Save all event info into one flat list of events (each tagged with its sport)
events_info = []

# Words in an event's link name that identify who competes (definitions checked).
# Built once here instead of once per event.
sex_categories = {'mixed':'mixed', 'gundersen':'men', 'man':'men', 'men':'men', 'women':'women', 'ladies':'women'}

# Event link ends in /entries-by-event-<event-name>.htm
#   group(1) -> page file name, group(2) -> event name (words separated by -)
event_link_pattern = re.compile(r'\/(entries-by-event-([\w-]*)\.htm)$')

for sport in sports_info:
    sport_id = sport['id']
    sport_name = sport['name']

    # Get HTML text from sport's list of events (timeout in seconds so a
    # stalled connection cannot hang the whole scrape)
    url_event = '{}{}{}/sport-entries.htm'.format(dr_results, dr_connector, sport_name)
    request_event = requests.get(url_event, timeout=30)
    if not request_event.ok:
        # Report and move on rather than parsing an error page
        print('Could not load events (HTTP {}) \t {}'.format(request_event.status_code, url_event))
        continue
    text_event = request_event.text
    soup_event = BeautifulSoup(text_event, 'html.parser')

    # Look for all events for this sport
    events = soup_event.find_all('li', class_='entriesByEventElem')

    for event in events:

        # Entries without a link carry no event page to reference
        if event.a is None:
            continue

        # Get info from event page link
        event_page_link = event.a['href']
        match = event_link_pattern.search(event_page_link)
        if match is None:
            print('Unexpected event link, skipping \t {}'.format(event_page_link))
            continue

        # Get the web page for the event
        event_page = '{}{}{}/{}'.format(dr_results, dr_connector, sport_name, match.group(1))
        event_name = match.group(2)

        # Process name so it is easier for reading
        event_readable = event.a.text.strip().lower()

        # Collect the distinct categories named by the words of the event name.
        # Exactly one category -> that category. None, or several different
        # ones (e.g. both 'men' and 'women') -> mixed.
        # Two keywords for the same category (e.g. 'man' and 'men') still
        # count as one category instead of being labelled mixed.
        matched_sexes = {sex_categories[word] for word in event_name.split('-') if word in sex_categories}
        event_sex = matched_sexes.pop() if len(matched_sexes) == 1 else 'mixed'

        # Save event info into list of events
        event_info = {'name': event_name, 'sport_id':sport_id, 'sex': event_sex, 'readable_name': event_readable, 'page': event_page}
        events_info.append(event_info)


In [None]:
# Test for events: total count, then the fields of the first few events.
# The count is printed explicitly because a bare expression is only displayed
# when it is the last statement of a cell.
print(len(events_info))

# Slicing (instead of indexing range(5)) also works when fewer than five
# events were found.
for sample_event in events_info[:5]:
    for k,v in sample_event.items():
        print(k,v)
    print()

# Alternative view (one line per event), kept for manual inspection:
# for e in events_info:
#     print(e['sex'],e['name'])
#     print(e['page'])

#### Reference events' pages to get athlete info

In [None]:
# Create a way to pull data from athlete's div
def get_athlete_info(athlete_div, results_root=None, connector=None):
    """Build an athlete record from the athlete <div> of a ranking-table row.

    Parameters
    ----------
    athlete_div : tag-like object
        Must support ``athlete_div['attrcountrycode']`` and
        ``athlete_div.find_all('div', class_=...)``; the found divs must expose
        ``.img['src']`` (photo) and ``.a.text`` / ``.a['href']`` (profile link).
    results_root : str, optional
        Base URL prepended to the photo and profile paths. Defaults to the
        notebook-level ``dr_results``.
    connector : str, optional
        Path segment placed between the base URL and the profile path.
        Defaults to the notebook-level ``dr_connector``.

    Returns
    -------
    dict
        Keys ``id``, ``name``, ``country_id``, ``photo`` and ``profile``
        (all strings).

    Raises
    ------
    KeyError
        If the div has no ``attrcountrycode`` attribute (raised by the lookup).
    ValueError
        If the photo or name container is missing, or a link does not have the
        expected ``../../...`` form.
    """
    # Resolve defaults at call time so the notebook constants are only needed
    # when the caller does not pass explicit values
    if results_root is None:
        results_root = dr_results
    if connector is None:
        connector = dr_connector

    # Country is a 3 letter country code
    athlete_country = athlete_div['attrcountrycode']

    # Picture is within another div
    photo_divs = athlete_div.find_all('div', class_='playerTagContainerPhoto')
    if not photo_divs or photo_divs[0].img is None:
        raise ValueError('athlete div has no photo')
    photo_src = photo_divs[0].img['src']

    # Create URL for photo: drop the leading ../../ and prepend the base URL
    match = re.search(r'\.\./\.\./(.*)$', photo_src)
    if match is None:
        raise ValueError('unexpected photo link: {}'.format(photo_src))
    athlete_photo = '{}{}'.format(results_root, match.group(1))

    # ID is the run of digits that makes up the picture's file name
    id_match = re.search(r'\/(\d+)\..*$', athlete_photo)
    if id_match is None:
        raise ValueError('no athlete id in photo link: {}'.format(athlete_photo))
    athlete_id = id_match.group(1)

    # Name & link to profile
    name_divs = athlete_div.find_all('div', class_='nameLine')
    if not name_divs or name_divs[0].a is None:
        raise ValueError('athlete div has no profile link')
    athlete_profile_link = name_divs[0].a
    # Strip surrounding whitespace, as is done for event names
    athlete_name = athlete_profile_link.text.strip().lower()

    # Create URL for profile page: drop the leading ../../en/
    profile_href = athlete_profile_link['href']
    match = re.search(r'\.\./\.\./en/(.*)$', profile_href)
    if match is None:
        raise ValueError('unexpected profile link: {}'.format(profile_href))
    athlete_page = '{}{}{}'.format(results_root, connector, match.group(1))

    # Create dictionary for athlete
    athlete_dict = {
        'id': athlete_id,
        'name': athlete_name,
        'country_id': athlete_country,
        'photo': athlete_photo,
        'profile': athlete_page,
    }

    return athlete_dict


In [None]:
# TODO: make rankings table for event
# event, sport(id), athlete(id), placed, result
team_events = []

# TODO: get all athlete info (including teams!)

# Keep athlete data in its own table
# TODO: get more data from athlete's profile
athletes_info = []
# IDs already stored in athletes_info, so each duplicate check is a set lookup
# instead of rebuilding a list of every stored ID for every table row
seen_athlete_ids = set()

## go by each event
for event in events_info:
    # The ranking page has the same URL as the entries page except for its prefix
    entries_page = event['page']
    ranking_page = entries_page.replace('entries-by-event','medals-and-ranking')
    # Download with a timeout (seconds) so a stalled connection cannot hang the scrape
    request_ranking = requests.get(ranking_page, timeout=30)
    if not request_ranking.ok:
        # Report and move on rather than parsing an error page
        print('Could not load ranking (HTTP {}) \t {}'.format(request_ranking.status_code, ranking_page))
        continue
    text_ranking = request_ranking.text
    soup_ranking = BeautifulSoup(text_ranking, 'html.parser')

    # Get all entries in table
    athlete_rank_table = soup_ranking.find_all('tr', {'class':['Res2', 'Res1']})

    # Do this for individual sports (need something different for teams)
    try:
        for row in athlete_rank_table:
            # All entry information in cols -> [medal info, rank, athlete info, result]
            cols = row.find_all('td')
            # skip the first column (just medal info)
            # 2nd column only has rank (note that rank can be blank)
            rank = cols[1].text.strip()

            # 3rd column is athlete data in divs
            athlete_div = cols[2].div

            athlete_dict = get_athlete_info(athlete_div)

            # TODO: get result of event for each entry (may not be avail)

            # Only add athletes that are not already in the athlete table
            if athlete_dict['id'] in seen_athlete_ids:
                continue
            seen_athlete_ids.add(athlete_dict['id'])
            athletes_info.append(athlete_dict)

    # Only the errors a row with an unexpected layout can produce are treated
    # as "team event"; a bare `except:` would also swallow KeyboardInterrupt
    # and genuine programming errors.
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        print('Possible team event? \t {}'.format(entries_page))
        # Save this as a team event
        team_events.append(event)



In [None]:
# Sizes of the athlete table and of the list of events set aside as team events
for collected in (athletes_info, team_events):
    print(len(collected))

#### Save sports data into file

In [None]:
# Create CSV file from list of sports dictionaries.
# newline='' lets the csv module control line endings itself (otherwise extra
# blank rows appear on platforms that translate '\n'); the encoding is pinned
# so the file does not depend on the machine's default.
with open('sports.csv', 'w', newline='', encoding='utf-8') as sports_csv:
    writer = csv.writer(sports_csv)

    # Headers (also fixes the column order of every row)
    headers = ['id', 'name', 'img', 'schedule']
    writer.writerow(headers)

    # One row per sport, using only the header keys, in header order
    writer.writerows([sport[key] for key in headers] for sport in sports_info)

### Get Country Data

#### Reference page to list of all countries

In [None]:
url_list_countries = 'https://www.olympic.org/pyeongchang-2018/results/en/general/nocs-list.htm'

# Download the page. Without a timeout requests can wait forever on a stalled
# connection, so cap the wait (seconds).
request_countries = requests.get(url_list_countries, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page
request_countries.raise_for_status()
text_countries = request_countries.text
# Original (misspelled) name kept so any cell still referring to it keeps working
text_countires = text_countries

# Get all countries from main page
soup_countries = BeautifulSoup(text_countries, 'html.parser')
countries = soup_countries.find_all('div', class_='CountriesListItem')

#### Country pages, flags, IDs, etc. 

In [None]:
# List of dictionaries of countries info
countries_info = []

# Country link ends in /noc-entries-<country-name>.htm
#   group(1) -> page file name, group(2) -> country name (words separated by -)
country_page_pattern = re.compile(r'\/(noc-entries-([-\w]+)\.htm)$')

# Iterate over countries and save info
for country in countries:
    # Country code gives an identifier of 3 characters
    country_id = country['attrcountrycode']

    # Entries without a link have no page or name to record
    if country.a is None:
        print('No country link, skipping \t {}'.format(country_id))
        continue

    # Country web page
    country_page_link = country.a['href']
    match = country_page_pattern.search(country_page_link)
    if match is None:
        # Report and skip instead of crashing on `None.group(...)`
        print('Unexpected country link, skipping \t {}'.format(country_page_link))
        continue
    country_page = '{}{}general/{}'.format(dr_results, dr_connector, match.group(1))

    # Country full name
    country_name = match.group(2)

    # Flag image is named after the country code
    country_flag = '{}resCOMMON/img/flags/{}.png'.format(dr_results,country_id)

    # Create a dictionary to be saved
    country_dict = {'code_id':country_id, 'name':country_name, 'page':country_page, 'flag':country_flag}
    countries_info.append(country_dict)

In [None]:
# Sanity check: number of countries, then every field of the first and the
# last country (separated by a blank line)
print(len(countries_info))

first_country, last_country = countries_info[0], countries_info[-1]

for field, value in first_country.items():
    print(field, value)
print()
for field, value in last_country.items():
    print(field, value)

#### Save country data into file

In [None]:
# Create CSV file from list of countries dictionaries.
# newline='' lets the csv module control line endings itself (otherwise extra
# blank rows appear on platforms that translate '\n'); the encoding is pinned
# so the file does not depend on the machine's default.
with open('countries.csv', 'w', newline='', encoding='utf-8') as countries_csv:
    writer = csv.writer(countries_csv)

    # Headers (also fixes the column order of every row)
    headers = ['code_id', 'name', 'flag', 'page']
    writer.writerow(headers)

    # One row per country, using only the header keys, in header order
    writer.writerows([country[key] for key in headers] for country in countries_info)