In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

### Scrape All Olympic Games Participants

In [16]:
# Scrape the biography table, photo link and Olympic participations for
# athlete ids 1..100 and store the records as a list of dicts in players.json.
players = []
for i in range(1, 101):
    URL = f"https://www.olympedia.org/athletes/{i}"
    print(URL)
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        page = requests.get(URL, timeout=30)
    except requests.RequestException as exc:
        # One unreachable page must not discard everything scraped so far.
        print(f"Skipping {URL}: {exc}")
        continue

    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find("div", class_="athlete_bio")

    # Pages without the bio container carry no athlete data; skip them.
    if results is None:
        continue

    player_bio = {"id": i}

    photo = results.find("img", class_="photo")
    if photo:
        player_bio["photo_link"] = photo["src"]

    # Each <tr> of the biodata table is a (header, value) pair; the header
    # text (spaces replaced by underscores) becomes the dict key.
    table = results.find("table", class_="biodata")
    if table is not None:
        for row in table.find_all("tr"):
            header = row.find("th")
            value = row.find("td")
            if header is None or value is None:
                continue
            player_bio[header.text.replace(" ", "_")] = value.text.lstrip()

    # The first three cells of every "active" row of the first table with
    # class "table" are stored as game / discipline / noc.
    results_table = soup.find("table", class_="table")
    game_rows = results_table.find_all("tr", class_="active") if results_table is not None else []
    player_bio["games"] = [
        dict(zip(
            ["game", "discipline", "noc"],
            [cell.text.replace("\n", "") for cell in row.find_all("td")[:3]],
        ))
        for row in game_rows
    ]

    players.append(player_bio)

with open("players.json", "w") as f:
    json.dump(players, f)

https://www.olympedia.org/athletes/1
https://www.olympedia.org/athletes/2
https://www.olympedia.org/athletes/3
https://www.olympedia.org/athletes/4
https://www.olympedia.org/athletes/5
https://www.olympedia.org/athletes/6
https://www.olympedia.org/athletes/7
https://www.olympedia.org/athletes/8
https://www.olympedia.org/athletes/9
https://www.olympedia.org/athletes/10
https://www.olympedia.org/athletes/11
https://www.olympedia.org/athletes/12
https://www.olympedia.org/athletes/13
https://www.olympedia.org/athletes/14
https://www.olympedia.org/athletes/15
https://www.olympedia.org/athletes/16
https://www.olympedia.org/athletes/17
https://www.olympedia.org/athletes/18
https://www.olympedia.org/athletes/19
https://www.olympedia.org/athletes/20
https://www.olympedia.org/athletes/21
https://www.olympedia.org/athletes/22
https://www.olympedia.org/athletes/23
https://www.olympedia.org/athletes/24
https://www.olympedia.org/athletes/25
https://www.olympedia.org/athletes/26
https://www.olympedia

Problem: There are about 150,000 players. To scrape all of this data we need to split the work across threads, and ideally also distribute those threads across different computers.

### Scrape only players who won medals

We need Selenium to remove the limit on the number of displayed players so that the page lists all medal winners.

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time

In [12]:
def get_medal_havers():
    """Scrape the Olympedia athlete medal table and save it as a CSV file.

    The column names are read from the ``<thead>`` of the statically served
    page. The rows are read through a Selenium-driven Firefox session after
    choosing "Show all" in the ``limit`` drop-down, because the page otherwise
    limits how many players are displayed. The result is written to
    ``medal_havers.csv`` (without the index) and returned.

    Returns:
        pandas.DataFrame: One row per element with class ``top`` on the page,
        with the columns named after the table header cells.
    """
    URL = "https://www.olympedia.org/statistics/medal/athlete"
    page2 = requests.get(URL, timeout=30)

    soup2 = BeautifulSoup(page2.content, "html.parser")
    table = soup2.find("table", class_="table")

    col_names = [th.text for th in table.find("thead").find_all("th")]

    # create webdriver object
    driver = webdriver.Firefox()
    try:
        driver.get(URL)

        drop = Select(driver.find_element(By.NAME, 'limit'))

        # select by visible text
        drop.select_by_visible_text("Show all")
        # wait for all the players to be displayed on the site
        time.sleep(20)

        rows = driver.find_elements(By.CLASS_NAME, "top")
        out = [row.get_attribute("innerHTML") for row in rows]
    finally:
        # quit() ends the whole browser session (close() only closes the
        # window), and the finally block guarantees this even when a lookup
        # above raises, so no Firefox process is left behind.
        driver.quit()

    players_medal = []
    for line in out:
        soup = BeautifulSoup(line, "html.parser")
        players_medal.append([cell.text.lstrip() for cell in soup.find_all("td")])

    df = pd.DataFrame(players_medal, columns=col_names)
    df.to_csv("medal_havers.csv", index=False)

    return df

In [13]:
# Run the scrape: writes medal_havers.csv and, as the cell's last expression,
# displays the returned DataFrame.
get_medal_havers()

Unnamed: 0,Athlete,Nat,Gold,Silver,Bronze,Total
0,Michael Phelps,USA,23,3,2,28
1,Larisa Latynina,URS UKR,9,5,4,18
2,Paavo Nurmi,FIN,9,3,0,12
3,Mark Spitz,USA,9,1,1,11
4,Carl Lewis,USA,9,1,0,10
...,...,...,...,...,...,...
1083,Vinzenz Geiger,GER,2,1,0,3
1084,Hanna Öberg,SWE,2,1,0,3
1085,Eileen Gu,CHN,2,1,0,3
1086,Daiki Hashimoto,JPN,2,1,0,3


## Try Threading to get all the players

In [3]:
import threading
import logging

In [4]:
def scrape_players(r: int, count: int = 100):
    """Scrape a contiguous block of Olympedia athlete pages and save them as JSON.

    Requests ``https://www.olympedia.org/athletes/<id>`` for every id in
    ``range(r, r + count)``. For each page that contains a ``div.athlete_bio``
    it collects the id, the photo link (when an ``img.photo`` is present),
    every header/value pair of the ``biodata`` table, and the first three
    cells of each ``tr.active`` row of the first ``table.table`` on the page.
    The collected records are written to ``players/players_<r>.json``.

    Args:
        r: First athlete id of the block.
        count: Number of consecutive ids to scrape. Defaults to 100, the
            block size this function originally had hard-coded.

    Returns:
        list[dict]: One dict per athlete page found, in id order. Ids whose
        request failed or whose page has no bio container are left out.
    """
    import os

    logging.info(f"Starting Thread in range {r} - {r + count}")
    players = []
    for i in range(r, r + count):
        URL = f"https://www.olympedia.org/athletes/{i}"
        try:
            # A timeout keeps one stalled connection from blocking the thread.
            page = requests.get(URL, timeout=30)
        except requests.RequestException as exc:
            # A single failed request must not throw away the whole block.
            logging.warning(f"Skipping {URL}: {exc}")
            continue

        soup = BeautifulSoup(page.content, "html.parser")
        results = soup.find("div", class_="athlete_bio")
        if results is None:
            continue

        player_bio = {"id": i}

        photo = results.find("img", class_="photo")
        if photo:
            player_bio["photo_link"] = photo["src"]

        # Guard every lookup: a missing table or an incomplete row would
        # otherwise raise AttributeError and lose all records of this block.
        table = results.find("table", class_="biodata")
        if table is not None:
            for row in table.find_all("tr"):
                header = row.find("th")
                value = row.find("td")
                if header is None or value is None:
                    continue
                player_bio[header.text.replace(" ", "_")] = value.text.lstrip()

        results_table = soup.find("table", class_="table")
        game_rows = results_table.find_all("tr", class_="active") if results_table is not None else []
        player_bio["games"] = [
            dict(zip(
                ["game", "discipline", "noc"],
                [cell.text.replace("\n", "") for cell in row.find_all("td")[:3]],
            ))
            for row in game_rows
        ]

        print(player_bio)

        players.append(player_bio)

    # Create the output directory on demand so the write cannot fail after
    # all the scraping work has already been done.
    os.makedirs("players", exist_ok=True)
    with open(f"players/players_{r}.json", "w") as f:
        json.dump(players, f)
    logging.info(f"Thread finished {r} - {r + count}")
    return players

Each batch contains 1,000 players and there are 150 batches, which is a lot of requests and must be optimized.

In [9]:
def thread_func(batch):
    """Run ``scrape_players`` concurrently, one thread per entry of ``batch``.

    Each element of ``batch`` is handed to ``scrape_players`` as its single
    positional argument in a dedicated thread. The call returns only after
    every one of those threads has finished.
    """
    workers = [
        threading.Thread(target=scrape_players, args=(start,))
        for start in batch
    ]
    for worker in workers:
        worker.start()
    # Block until the whole batch is done before the caller moves on.
    for worker in workers:
        worker.join()

In [7]:
# Build the work plan: one batch per 1000 ids, each batch listing the ten
# starting ids (100 apart) that are passed to scrape_players.
batches = []
for i in range(1, 150000, 1000):
    batches.append([i + 100 * y for y in range(10)])

In [10]:
# for batch in batches:
#     thread_func(batch)

{'id': 601, 'Roles': 'Competed in Olympic Games', 'Sex': 'Female', 'Full_name': 'Wong-Lau•So Han', 'Used_name': 'Wong-Lau•So Han', 'Name_order': 'Oriental', 'Born': '25 April 1946', 'Measurements': '152 cm / 45 kg', 'NOC': 'Hong Kong, China', 'games': [{'game': '1984 Summer Olympics', 'discipline': 'Archery', 'noc': 'HKG'}]}
{'id': 901, 'Roles': 'Competed in Olympic Games', 'Sex': 'Female', 'Full_name': 'Lee•Jeong-Im', 'Used_name': 'Lee•Jeong-Im', 'Name_order': 'Oriental', 'Original_name': '이•정임', 'Other_names': 'Lee Jung-Im', 'Born': '1 July 1971', 'NOC': 'Republic of Korea', 'games': [{'game': '1992 Summer Olympics', 'discipline': 'Table Tennis', 'noc': 'KOR'}]}
{'id': 501, 'Roles': 'Competed in Olympic Games', 'Sex': 'Male', 'Full_name': 'Charles•Vallée', 'Used_name': 'Charles•Vallée', 'Born': '18 November 1861 in Neuilly-Saint-Front, Aisne (FRA)', 'Affiliations': 'Les Archers de Compiègne, Compiègne (FRA)', 'NOC': 'France', 'games': [{'game': '1908 Summer Olympics', 'discipline': '

: 