In [175]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from athletes import parse_athlete
from io import StringIO
import time
import random


def get_athlete_dict(soup, id):
    table = soup.find("table", {"class": "biodata"})
    df = pd.read_html(StringIO(str(table)), index_col=0, dtype_backend="pyarrow")[0]
    output_df = df.T
    output_df['athlete_id'] = id
    return output_df


def get_athlete_results(soup, id):
    table = soup.find("table", {"class": "table"})

    df = pd.read_html(StringIO(str(table)))[0]

    # Assigning new columns based on the row index (even or odd)
    df['athlete_id'] = id
    df['NOC'] = None
    df['Discipline'] = None
    
    rows_to_keep = df.index[df['Games'].isna()].tolist()
    rows_with_noc = df.index[~df['Games'].isna()].tolist()

    df.loc[rows_with_noc, 'NOC'] = df.loc[rows_with_noc, 'NOC / Team']
    df.loc[rows_with_noc, 'Discipline'] = df.loc[rows_with_noc, 'Discipline (Sport) / Event']

    columns_to_ffill = ['Games', 'NOC', 'As', 'Discipline']
    df[columns_to_ffill] = df[columns_to_ffill].ffill()

    # Rename 'Discipline (Sport) / Event' to 'Event' & 'NOC / Team' to 'Team' for clarity
    df.rename(columns={'Discipline (Sport) / Event': 'Event', 'NOC / Team': 'Team'}, inplace=True)

    # Drop the 'Unnamed: 6' column as it's not needed
    df.drop(columns=['Unnamed: 6'], inplace=True)

    # columns = ['Games', 'NOC / Team', 'Pos', 'Medal', 'As', 'Discipline (Sport)', 'Event']
    return df.iloc[rows_to_keep]
    

In [176]:
response = requests.get("https://www.olympedia.org/athletes/10000", timeout=60)
soup = BeautifulSoup(response.content, "html.parser")



In [177]:
table = get_athlete_results(soup, 10000)

table.head(10)

Unnamed: 0,Games,Event,Team,Pos,Medal,As,athlete_id,NOC,Discipline
1,1988 Summer Olympics,"Kayak Singles, 1,000 metres, Men (Olympic)",,3.0,Bronze,André Wohllebe,10000,GDR,Canoe Sprint (Canoeing)
2,1988 Summer Olympics,"Kayak Doubles, 500 metres, Men (Olympic)",Kay Bluhm,7.0,,André Wohllebe,10000,GDR,Canoe Sprint (Canoeing)
3,1988 Summer Olympics,"Kayak Fours, 1,000 metres, Men (Olympic)",East Germany,3.0,Bronze,André Wohllebe,10000,GDR,Canoe Sprint (Canoeing)
5,1992 Summer Olympics,"Kayak Fours, 1,000 metres, Men (Olympic)",Germany,1.0,Gold,André Wohllebe,10000,GER,Canoe Sprint (Canoeing)


In [178]:
# URL of the website to scrape
base_athlete_url = "https://www.olympedia.org/athletes"

SIZE = 10
columns = ['Roles', 'Sex', 'Full name', 'Used name', 'Born', 'Died', 'NOC', 'athlete_id']
output = pd.DataFrame(columns=columns)
results = pd.DataFrame()
errors = []
for i in range(1,SIZE):
    try:
        time.sleep(random.random()*2)
        # Send a GET request to the website
        athlete_url = f"{base_athlete_url}/{i}"
        response = requests.get(athlete_url, timeout=60)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, "html.parser")

            # TODO: Write your scraping logic here
            df = get_athlete_dict(soup, i)
            output = pd.concat([output if not output.empty else None,df])

            result = get_athlete_results(soup, i)
            results = pd.concat([results if not results.empty else None, result])

        else:
            print("Failed to retrieve the webpage. Status code:", response.status_code)
    except:
        errors.append(i)
        print(f"Error for index {i}")


In [179]:
results.head(100)

Unnamed: 0,Games,Event,Team,Pos,Medal,As,athlete_id,NOC,Discipline
1,1912 Summer Olympics,"Singles, Men (Olympic)",,=17,,Jean-François Blanchy,1,FRA,Tennis
2,1912 Summer Olympics,"Doubles, Men (Olympic)",Jean Montariol,DNS,,Jean-François Blanchy,1,FRA,Tennis
4,1920 Summer Olympics,"Singles, Men (Olympic)",,=32,,Jean-François Blanchy,1,FRA,Tennis
5,1920 Summer Olympics,"Doubles, Mixed (Olympic)",Jeanne Vaussard,=8,,Jean-François Blanchy,1,FRA,Tennis
6,1920 Summer Olympics,"Doubles, Men (Olympic)",Jacques Brugnon,4,,Jean-François Blanchy,1,FRA,Tennis
1,1996 Summer Olympics,"Singles, Men (Olympic)",,=17,,Arnaud Boetsch,2,FRA,Tennis
2,1996 Summer Olympics,"Doubles, Men (Olympic)",Guillaume Raoux,=17,,Arnaud Boetsch,2,FRA,Tennis
1,1924 Summer Olympics,"Singles, Men (Olympic)",,4,,Jean Borotra,3,FRA,Tennis
2,1924 Summer Olympics,"Doubles, Mixed (Olympic)",Marguerite Billout,=15,,Jean Borotra,3,FRA,Tennis
3,1924 Summer Olympics,"Doubles, Men (Olympic)",René Lacoste,3,Bronze,Jean Borotra,3,FRA,Tennis


In [180]:
output.head()

Unnamed: 0,Roles,Sex,Full name,Used name,Born,Died,NOC,athlete_id,Measurements,Affiliations,Nick/petnames
1,Competed in Olympic Games,Male,"François Joseph Marie Antoine ""Jean-François""•...",Jean-François•Blanchy,"12 December 1886 in Bordeaux, Gironde (FRA)","2 October 1960 in Saint-Jean-de-Luz, Pyrénées-...",France,1,,,
1,Competed in Olympic Games,Male,Arnaud Benjamin•Boetsch,Arnaud•Boetsch,"1 April 1969 in Meulan, Yvelines (FRA)",,France,2,183 cm / 76 kg,"Racing Club de France, Paris (FRA)",
1,Competed in Olympic Games • Administrator,Male,Jean Laurent Robert•Borotra,Jean•Borotra,"13 August 1898 in Biarritz, Pyrénées-Atlantiqu...","17 July 1994 in Arbonne, Pyrénées-Atlantiques ...",France,3,183 cm / 76 kg,"TCP, Paris (FRA)",Le Basque Bondissant (The Bounding Basque)
1,Competed in Olympic Games,Male,Jacques Marie Stanislas Jean•Brugnon,Jacques•Brugnon,"11 May 1895 in Paris VIIIe, Paris (FRA)","20 March 1978 in Monaco, Monaco (MON)",France,4,168 cm / 64 kg,"Sporting club de Paris, Paris (FRA)",Toto
1,Competed in Olympic Games,Male,Henry Albert•Canet,Albert•Canet,"17 April 1878 in Wandsworth, England (GBR)","25 July 1930 in Paris VIIe, Paris (FRA)",France,5,,"TCP, Paris (FRA)",


In [65]:
output.to_csv('sample_data.csv')

In [67]:
df = pd.read_csv('sample_data.csv', dtype_backend='pyarrow')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237 entries, 0 to 236
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype          
---  ------         --------------  -----          
 0   Unnamed: 0     237 non-null    int64[pyarrow] 
 1   Roles          237 non-null    string[pyarrow]
 2   Sex            237 non-null    string[pyarrow]
 3   Full name      237 non-null    string[pyarrow]
 4   Used name      237 non-null    string[pyarrow]
 5   Born           228 non-null    string[pyarrow]
 6   Died           96 non-null     string[pyarrow]
 7   NOC            237 non-null    string[pyarrow]
 8   Measurements   124 non-null    string[pyarrow]
 9   Affiliations   132 non-null    string[pyarrow]
 10  Nick/petnames  20 non-null     string[pyarrow]
 11  Title(s)       4 non-null      string[pyarrow]
 12  Other names    20 non-null     string[pyarrow]
 13  Nationality    4 non-null      string[pyarrow]
 14  Original name  31 non-null     string[pyarrow]
 15  Name o