In [None]:
from bs4 import BeautifulSoup 
import re
import requests

## Gathering Data

The data is scraped from https://www.olympic.org/ using `requests` to download the pages and BeautifulSoup4 to parse them.

In [None]:
# Base URL and path fragments used to build links (every one ends with '/')
dr_results = 'https://www.olympic.org/pyeongchang-2018/results/'
dr_imgs = 'resOWG2018/img/'
# TODO -> change dr_connector to 'en/' and change the references to it
dr_connector = 'en/general/'

### Get Sports

#### Reference page to list of all sports

In [None]:
# Schedule page has reference to all sports in table
url_schedule = 'https://www.olympic.org/pyeongchang-2018/results/en/general/competition-schedule.htm'

# Download the page. requests applies no timeout by default, so one is set
# explicitly to keep the cell from hanging if the server never answers.
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as if it were the schedule.
request_schedule = requests.get(url_schedule, timeout=30)
request_schedule.raise_for_status()

# Get document to be passed in for soup (better/cleaner practice)
text_schedule = request_schedule.text

# Parse the schedule page; the sports are extracted from this soup
soup_schedule = BeautifulSoup(text_schedule, 'html.parser')

In [None]:
# Get the image container and the name/link container (comes in pairs)
sports = soup_schedule.find_all('td', {'class':['disciplinePicture', 'styleLeft']})

# Patterns are raw strings so that \w and \. reach the regex engine as written
# (in a normal string literal they are invalid escape sequences and produce a
# warning on recent Python versions). Compiled once, outside the loop.
# Image link form: ../../resOWG2018/img/sports/CER.png -> group(1) is the sport ID
sport_img_pattern = re.compile(r'(\w+)\.png$')
# Schedule link: group(1) is '<sport-name>/daily-schedule.htm', group(2) is '<sport-name>'
sport_schedule_pattern = re.compile(r'(([-\w]+)\/daily-schedule\.htm)$')

# Dictionary for the sports, keyed by sport ID
sports_info = {}

# Go every other since it always matches to one sport
# NOTE(review): this pairing assumes the matched cells strictly alternate
# image cell / name cell; a cell without its partner would shift every pair.
for img, name in zip(sports[::2], sports[1::2]):

    # Skip for the ceremony image (and other errors): no link or no image tag
    if name.a is None or img.img is None:
        continue

    # Get the image link which has the sports ID
    sport_img_link = img.img.get('src', '')

    # Get ID from link; skip the pair if the link is not of the expected form
    id_match = sport_img_pattern.search(sport_img_link)
    if id_match is None:
        continue
    sport_id = id_match.group(1)

    # Get image as a link
    sport_img = '{}{}sports/{}.png'.format(dr_results, dr_imgs, sport_id)

    # Get sport's schedule page; skip the pair if the href is not a schedule link
    schedule_match = sport_schedule_pattern.search(name.a.get('href', ''))
    if schedule_match is None:
        continue
    sport_schedule = '{}en/{}'.format(dr_results, schedule_match.group(1))

    # Get sport's full name from link (words separated by -)
    sport_name = schedule_match.group(2)

    sports_info[sport_id] = {'img': sport_img, 'schedule': sport_schedule, 'name': sport_name}

In [None]:
# Sanity check: number of sports collected, followed by every stored record
print(len(sports_info))

for sport_key, sport_fields in sports_info.items():
    # Sport ID first, then one "field value" line per entry
    print(sport_key)
    for field_name, field_value in sport_fields.items():
        print(field_name, field_value)
    # Blank line between sports
    print()


### Get Country Data

#### Reference page to list of all countries

In [None]:
url_list_countries = 'https://www.olympic.org/pyeongchang-2018/results/en/general/nocs-list.htm'

# Download the page. requests applies no timeout by default, so one is set
# explicitly to keep the cell from hanging if the server never answers.
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as if it were the country list.
request_countries = requests.get(url_list_countries, timeout=30)
request_countries.raise_for_status()

# Get document to be passed in for soup (better/cleaner practice)
# (the variable keeps its existing spelling so any other reference still works)
text_countires = request_countries.text

# Get all countries from main page
soup_countries = BeautifulSoup(text_countires, 'html.parser')
countries = soup_countries.find_all('div', class_='CountriesListItem')

#### Country pages, flags, IDs, etc. 

In [None]:
# Dictionary of country info, keyed by country code
countries_info = {}

# Raw string so that \w and \. reach the regex engine as written (in a normal
# string literal they are invalid escape sequences and produce a warning on
# recent Python versions). Compiled once, outside the loop.
# group(1) form: noc-entries-country.htm
# NOTE(review): group(2) only captures the text after the LAST hyphen, so a
# hyphenated multi-word slug would yield just its final word -- confirm this
# is the intended "full name".
country_page_pattern = re.compile(r'\/([\w-]+-(\w+)\.htm)$')

# Iterate over countries and save info
for country in countries:
    # Country code attribute is used as the identifier (e.g. 'USA')
    country_id = country['attrcountrycode']

    # Country web page; skip items that have no link or an unexpected link form
    # instead of failing on a missing match
    if country.a is None:
        continue
    country_page_link = country.a.get('href', '')
    match = country_page_pattern.search(country_page_link)
    if match is None:
        continue
    country_page = '{}{}{}'.format(dr_results, dr_connector, match.group(1))

    # Country name as captured from the link
    country_name = match.group(2)

    # Flag image, located by country code
    country_flag = '{}resCOMMON/img/flags/{}.png'.format(dr_results,country_id)

    countries_info[country_id] = {'name':country_name, 'page':country_page, 'flag':country_flag}

In [None]:
# Sanity check: how many countries were collected, then one sample record
country_count = len(countries_info)
print(country_count)

usa_record = countries_info['USA']
print(usa_record)