# Scraping Script

## Import packages

In [1]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.service import Service
import requests
from bs4 import BeautifulSoup
import time
import re
import string

## Specify Path

In [2]:
url = 'http://www.olympedia.org/results/19000437'

## Get 2021 results

In [3]:
dfs = pd.read_html('http://www.olympedia.org/results/19000437',encoding='utf8')

In [4]:
#Get the right dataframe
dat_2021 = dfs[1]
#Get rid of no-finishers
dat_2021_clean = dat_2021[dat_2021.Pos != "AC"][['Athlete', 'NOC', 'Time']]
#get time in a consistent format
hours = dat_2021_clean.Time.apply(lambda x: int(x[0]))
minutes = dat_2021_clean.Time.apply(lambda x: int(x[2:4]))
seconds = dat_2021_clean.Time.apply(lambda x: int(x[5:7]))
dat_2021_clean.Time = hours * 3600 + minutes * 60 + seconds
print(dat_2021_clean)

               Athlete  NOC  Time
0       Eliud Kipchoge  KEN  7718
1         Abdi Nageeye  NED  7798
2          Bashir Abdi  BEL  7800
3     Lawrence Cherono  KEN  7802
4       Ayad Lamdassem  ESP  7816
..                 ...  ...   ...
71          Cam Levins  CAN  8923
72        Yuma Hattori  JPN  9008
73   José Juan Esparza  MEX  9111
74  Jorge Castelblanco  PAN  9202
75          Iván Zarco  HON  9876

[76 rows x 3 columns]


## Get 2021 athlete data

In [5]:
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

In [6]:
#Get athlete id's so that I can parse through the profile pages
athlete_id = []
i = 0
for a in soup.find_all('a', href=True):
    if a['href'][1:9] == 'athletes':
        athlete_id.append(a['href'][9:])
        i = i + 1

print(athlete_id)
athlete_id_clean = list(set(athlete_id[4:]))
print(athlete_id_clean)

['/advanced_search', '/doubles', '/lists', '', '/106584', '/135227', '/132653', '/144850', '/67392', '/69768', '/106584', '/135227', '/132653', '/144850', '/114976', '/134721', '/136037', '/114968', '/145178', '/132658', '/145176', '/142827', '/134570', '/133618', '/141947', '/142364', '/142824', '/145175', '/141817', '/144130', '/142465', '/142383', '/133433', '/132466', '/132961', '/134124', '/114615', '/143958', '/147332', '/143340', '/140468', '/141808', '/142470', '/135671', '/142382', '/135358', '/143959', '/145959', '/120054', '/135341', '/89097', '/145428', '/135426', '/146996', '/142815', '/141483', '/144146', '/141492', '/134922', '/143348', '/146995', '/146958', '/140469', '/124091', '/135173', '/145268', '/122508', '/134501', '/140992', '/135431', '/140878', '/144527', '/145960', '/145776', '/145269', '/132458', '/145131', '/133733', '/123438', '/142157', '/134504', '/120197', '/144540', '/145267', '/135424', '/147801', '/144844', '/145536', '/123446', '/142618', '/146504',

In [7]:
#parse through athlete profiles
ath_url = 'http://www.olympedia.org/athletes'
ath_list = []
for ath in athlete_id_clean:
    ath_list.append(pd.read_html(ath_url + ath, encoding='utf8'))

print(ath_list)

[[                         0
0          Internet search
1  Name Full News Original
2           English Hebrew,                0                          1
0           Type  Competed in Olympic Games
1            Sex                       Male
2      Full name        Marhu Abinet•Teferi
3      Used name               Marhu•Teferi
4  Original name                  טפרי•מארו
5           Born  17 August 1992 in ? (ETH)
6   Measurements             164 cm / 52 kg
7            NOC                     Israel,                   Games Discipline (Sport) / Event NOC / Team   Pos  Medal  \
0  2016 Summer Olympics                  Athletics        ISR   NaN    NaN   
1                   NaN    Marathon, Men (Olympic)        NaN  73.0    NaN   
2  2020 Summer Olympics                  Athletics        ISR   NaN    NaN   
3                   NaN    Marathon, Men (Olympic)        NaN  13.0    NaN   

             As  Unnamed: 6  
0  Marhu Teferi         NaN  
1           NaN         NaN  
2  Marhu Te

In [8]:
#Get athlete attribute table from the athlete profiles
ath_attr = []
for attr in ath_list:
    for tables in attr:
        if tables.iloc[0,0] == 'Type':
            tables.columns = ['attr', 'val']
            column_names = tables.attr.to_numpy()
            value_names = tables.val.to_numpy()
            new_table = pd.DataFrame(value_names.reshape(-1, len(value_names)), columns = column_names)
            ath_attr.append(new_table)


print(ath_attr)

[                        Type   Sex            Full name     Used name  \
0  Competed in Olympic Games  Male  Marhu Abinet•Teferi  Marhu•Teferi   

  Original name                       Born    Measurements     NOC  
0     טפרי•מארו  17 August 1992 in ? (ETH)  164 cm / 52 kg  Israel  ,                         Type   Sex     Full name     Used name  \
0  Competed in Olympic Games  Male  Goitom•Kifle  Goitom•Kifle   

              Born      NOC  
0  3 December 1993  Eritrea  ,                         Type   Sex       Full name       Used name  \
0  Competed in Olympic Games  Male  Dieter•Kersten  Dieter•Kersten   

              Born                       Affiliations      NOC  
0  25 October 1996  Atletiekclub De Demer [ADD] (BEL)  Belgium  ,                         Type   Sex       Full name       Used name  \
0  Competed in Olympic Games  Male  Richard•Ringer  Richard•Ringer   

                                                Born    Measurements  \
0  27 February 1989 in Überlingen,

In [9]:
#clean up the data
attr_table = pd.concat(ath_attr).reset_index()
attr_table = attr_table[['Sex', 'Used name', 'Born', 'NOC', 'Measurements']]
#Get birth elements
pattern = '\d+'
attr_table['day'] = attr_table.Born.apply(lambda x: int(re.findall(pattern, x)[0]))
attr_table['year'] = attr_table.Born.apply(lambda x: int(re.findall(pattern, x)[1]))
pattern2 = '\D+(?=\s)'
attr_table['month'] = attr_table.Born.apply(lambda x: re.findall(pattern2, x)[0])
attr_table['age'] = 2021 - attr_table.year
attr_table = attr_table.drop('Born', axis=1)
#Get the used name in correct format
attr_table['Used name'] = attr_table['Used name'].apply(lambda x: x.replace('•', ' '))
#Get height and weight in cm and kg
attr_table['height'] = np.nan
attr_table['weight'] = np.nan
pattern = ' cm'
pattern = '\d+'
for i in range(len(attr_table)):
    if attr_table.Measurements.notna()[i]:
        height_possible = attr_table.Measurements[i].split(' cm')
        if 'k' in attr_table.Measurements[i]:
            weight_possible = attr_table.Measurements[i]
            weight_possible = weight_possible[-5:weight_possible.index("k")]
            attr_table.weight[i] = weight_possible
        if len(height_possible) > 1:
            attr_table.height[i] = height_possible[0]  
attr_table.drop('Measurements', inplace = True, axis = 1)
print(attr_table)

      Sex          Used name           NOC  day  year       month  age  \
0    Male       Marhu Teferi        Israel   17  1992      August   29   
1    Male       Goitom Kifle       Eritrea    3  1993    December   28   
2    Male     Dieter Kersten       Belgium   25  1996     October   25   
3    Male     Richard Ringer       Germany   27  1989    February   32   
4    Male        Sisay Lemma      Ethiopia   12  1990    December   31   
..    ...                ...           ...  ...   ...         ...  ...   
103  Male   Marcin Chabowski        Poland   28  1986        June   35   
104  Male  José Luis Santana        Mexico   26  1989   September   32   
105  Male    Lemawork Ketema       Austria   22  1985     October   36   
106  Male     Pheeha Mokgobo  South Africa   23  1988    November   33   
107  Male       Paul Pollock       Ireland   25  1986        June   35   

     height  weight  
0     164.0    52.0  
1       NaN     NaN  
2       NaN     NaN  
3     182.0    62.0  
4

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attr_table.weight[i] = weight_possible
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attr_table.height[i] = height_possible[0]


In [10]:
#write to csv
dat_2021_clean.to_csv('2021_results.csv', index = False)
attr_table.to_csv('runner_attributes_2021.csv', index = False)