In [116]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from athletes import parse_athlete
from io import StringIO
import time
import random


def get_athlete_dict(soup):
    table = soup.find("table", {"class": "biodata"})
    df = pd.read_html(StringIO(str(table)), index_col=0, dtype_backend="pyarrow")[0]

    return df.T


def get_athlete_results(soup, id):
    table = soup.find("table", {"class": "table"})

    df = pd.read_html(StringIO(str(table)))[0]

    # Assigning new columns based on the row index (even or odd)
    df['Discipline (Sport)'] = df.loc[df.index % 2 == 0, 'Discipline (Sport) / Event']
    df['Event'] = df.loc[df.index % 2 == 1, 'Discipline (Sport) / Event']

    # Drop the original 'Discipline (Sport) / Event' column as it's now redundant
    df.drop(columns=['Discipline (Sport) / Event'], inplace=True)

    # Group the DataFrame by each pair of rows and apply forward fill within each group
    grouped = (df.groupby(df.index // 2, as_index=False)
            .apply(lambda x: x.ffill().bfill())
            .reset_index(drop=True))

    # Now, drop every odd row as each pair's information is consolidated into the even-indexed rows
    final_df = grouped[grouped.index % 2 == 0].reset_index(drop=True)

    columns = ['Games', 'NOC / Team', 'Pos', 'Medal', 'As', 'Discipline (Sport)', 'Event']
    return final_df[columns]
    

In [121]:
response = requests.get("https://www.olympedia.org/athletes/10000", timeout=60)
soup = BeautifulSoup(response.content, "html.parser")



In [122]:
table = get_athlete_results(soup)

table.head(10)

Unnamed: 0,Games,Discipline (Sport) / Event,NOC / Team,Pos,Medal,As,Unnamed: 6
0,1988 Summer Olympics,Canoe Sprint (Canoeing),GDR,,,André Wohllebe,
1,,"Kayak Singles, 1,000 metres, Men (Olympic)",,3.0,Bronze,,
2,,"Kayak Doubles, 500 metres, Men (Olympic)",Kay Bluhm,7.0,,,
3,,"Kayak Fours, 1,000 metres, Men (Olympic)",East Germany,3.0,Bronze,,
4,1992 Summer Olympics,Canoe Sprint (Canoeing),GER,,,André Wohllebe,
5,,"Kayak Fours, 1,000 metres, Men (Olympic)",Germany,1.0,Gold,,


In [103]:
# URL of the website to scrape
base_athlete_url = "https://www.olympedia.org/athletes"

SIZE = 10
columns = ['Roles', 'Sex', 'Full name', 'Used name', 'Born', 'Died', 'NOC']
output = pd.DataFrame(columns=columns)
results = pd.DataFrame()
for i in range(1,SIZE):
    print(i)
    time.sleep(random.random()*2)
    # Send a GET request to the website
    athlete_url = f"{base_athlete_url}/{i}"
    response = requests.get(athlete_url, timeout=60)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        # TODO: Write your scraping logic here
        df = get_athlete_dict(soup)
        output = pd.concat([output,df])

        result = get_athlete_results(soup)
        results = pd.concat([results, result])

    else:
        print("Failed to retrieve the webpage. Status code:", response.status_code)

1


  output = pd.concat([output,df])
  .apply(lambda x: x.ffill().bfill())


2


  .apply(lambda x: x.ffill().bfill())


3


  .apply(lambda x: x.ffill().bfill())


4


  .apply(lambda x: x.ffill().bfill())


5


  .apply(lambda x: x.ffill().bfill())


6


  .apply(lambda x: x.ffill().bfill())


7


  .apply(lambda x: x.ffill().bfill())


8


  .apply(lambda x: x.ffill().bfill())


9


In [105]:
results.head()

Unnamed: 0,Games,NOC / Team,Pos,Medal,As,Discipline (Sport),Event
0,1912 Summer Olympics,FRA,=17,,Jean-François Blanchy,Tennis,"Singles, Men (Olympic)"
1,1920 Summer Olympics,Jean Montariol,DNS,,Jean-François Blanchy,"Doubles, Men (Olympic)",Tennis
2,,Jeanne Vaussard,=32,,,"Singles, Men (Olympic)","Doubles, Mixed (Olympic)"
3,,Jacques Brugnon,4,,,"Doubles, Men (Olympic)",
0,1996 Summer Olympics,FRA,=17,,Arnaud Boetsch,Tennis,"Singles, Men (Olympic)"


In [65]:
output.to_csv('sample_data.csv')

In [67]:
df = pd.read_csv('sample_data.csv', dtype_backend='pyarrow')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237 entries, 0 to 236
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype          
---  ------         --------------  -----          
 0   Unnamed: 0     237 non-null    int64[pyarrow] 
 1   Roles          237 non-null    string[pyarrow]
 2   Sex            237 non-null    string[pyarrow]
 3   Full name      237 non-null    string[pyarrow]
 4   Used name      237 non-null    string[pyarrow]
 5   Born           228 non-null    string[pyarrow]
 6   Died           96 non-null     string[pyarrow]
 7   NOC            237 non-null    string[pyarrow]
 8   Measurements   124 non-null    string[pyarrow]
 9   Affiliations   132 non-null    string[pyarrow]
 10  Nick/petnames  20 non-null     string[pyarrow]
 11  Title(s)       4 non-null      string[pyarrow]
 12  Other names    20 non-null     string[pyarrow]
 13  Nationality    4 non-null      string[pyarrow]
 14  Original name  31 non-null     string[pyarrow]
 15  Name o