In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import math

#### Problems:
- Not all the althletes that are present in Results are registered in Athletes -> remove their names from Results
- Some atheletes that have had results are not in registered in Results -> no big deal
- Some atheltes that win Medals are not registered in Athletes -> `???`

In [2]:
athletes = pd.read_csv("data/olympic_athletes.csv", sep = ';')
hosts = pd.read_csv("data/olympic_hosts.csv", sep = ',')
medals = pd.read_csv("data/olympic_medals.csv", sep = ';')
results = pd.read_csv("data/olympic_results.csv", sep = ';')

In [3]:
def country_unifier(row):
    if row == "People's Republic of China":
        return 'China'
    if row == "Hong Kong, China":
        return 'Hong Kong'
    if row == 'United States':
        return 'United States of America'
    if row == 'Australia, Sweden':
        return 'Australia'
    if row == 'USSR':
        return 'Soviet Union'
    else:
        return row

## Nodes

### 1) Hosts

In [4]:
# Unify country names.
hosts.game_location = hosts.game_location.apply(lambda x: country_unifier(x))

# Save data in a csv file.
hosts.to_csv('csv/Hosts.csv', index = False, header=['slug', 'end_date', 'start_date', 'location', 'name', 'season', 'year'])

### 2) Countries

In [5]:
# Get only list of names without url from results.athletes.
def find_tuple_athl_url(row):
    if row is not np.nan:
        split_row = row.split("'")[1::2]
        tuple_list = []
        for name in split_row[::2]:
            tuple_list.append(name)
    else:
        return np.nan
    
    return tuple_list

# Remove urls and get list of names.
results.athletes = results.athletes.apply(lambda x: find_tuple_athl_url(x))

# Unify country names.
results.country_name = results.country_name.apply(lambda x: country_unifier(x))

In [6]:
# Create country instances.
countries = results[['country_name', 'country_code']].dropna().drop_duplicates()

# Save data in a csv file.
countries.to_csv('csv/Countries.csv', index = False, header=['name', 'code'])

### 3) Athletes

In [7]:
athletes.head(2)

Unnamed: 0,athlete_url,athlete_full_name,games_participations,first_game,athlete_year_birth,G,S,B
0,https://olympics.com/en/athletes/cooper-woods-...,Cooper WOODS-TOPALOVIC,1,Beijing 2022,2000.0,,,
1,https://olympics.com/en/athletes/elofsson,Felix ELOFSSON,2,PyeongChang 2018,1995.0,,,


In [8]:
def nan_to_0(row):
    if math.isnan(row):
        return 0
    else:
        return int(row)
    
def nan_to_NA(row):
    if type(row) == str:
        return row
    elif math.isnan(row):
        return 'NA'

def nan_to_NA_or_int(row):
    if math.isnan(row):
        return 'NA'
    elif isinstance(row, float):
        return int(row)

# Set nan of number of medals as 0.
athletes.G = athletes.G.apply(lambda x: nan_to_0(x))
athletes.S = athletes.S.apply(lambda x: nan_to_0(x))
athletes.B = athletes.B.apply(lambda x: nan_to_0(x))

# Replace nan with 'NA' and code as int athlete_year_birth.
athletes.first_game = athletes.first_game.apply(lambda x: nan_to_NA(x)) 
athletes.athlete_year_birth = athletes.athlete_year_birth.apply(lambda x: nan_to_NA_or_int(x)) 

In [9]:
# Save data in a csv file.
athletes.to_csv('csv/Athletes.csv', index = False, header=['url', 'name', 'partecipations', 'first_game', 'birth', 'G', 'S', 'B'])

## Edges

### 4) TAKE_PLACE_IN

In [10]:
# Take column to create edges.
TAKE_PLACE_IN = hosts[['game_slug', 'game_location']]

# Save in a csv file.
TAKE_PLACE_IN.to_csv('csv/TAKE_PLACE_IN.csv', index = False, header=['slug', 'location'])

### 5) NATIONALITY

We consider only athlete that are in the athlete table for the ones in `Results` and `Medals`

In [72]:
results

Unnamed: 0,discipline_title,event_title,slug_game,participant_type,medal_type,athletes,rank_position,country_name,country_code
0,Curling,Mixed Doubles,beijing-2022,GameTeam,GOLD,"[Stefania CONSTANTINI, Amos MOSANER]",1,Italy,ITA
1,Curling,Mixed Doubles,beijing-2022,GameTeam,SILVER,"[Kristin SKASLIEN, Magnus NEDREGOTTEN]",2,Norway,NOR
2,Curling,Mixed Doubles,beijing-2022,GameTeam,BRONZE,"[Almida DE VAL, Oskar ERIKSSON]",3,Sweden,SWE
3,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[Jennifer DODDS, Bruce MOUAT]",4,Great Britain,GBR
4,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[Rachel HOMAN, John MORRIS]",5,Canada,CAN
...,...,...,...,...,...,...,...,...,...
162799,Ski Jumping,Normal Hill Individual men,chamonix-1924,Athlete,,[Franciszek BUJAK],,Poland,POL
162800,Ski Jumping,Normal Hill Individual men,chamonix-1924,Athlete,,[Henryk Mückenbrunn],,Poland,POL
162801,Ski Jumping,Normal Hill Individual men,chamonix-1924,Athlete,,[Milda Prokopec],,Czechoslovakia,TCH
162802,Ski Jumping,Normal Hill Individual men,chamonix-1924,Athlete,,[Sigurd Overby],,United States of America,USA


In [17]:
# Get the nationality of the athletes in the Athletes table.
nationality_df = results[['athletes', 'country_name']]
registered_athletes = set(athletes.athlete_full_name)

all_names = []
all_countries = []

# Split the athlete that partecipate at olympic game in a team.
for row in tqdm(nationality_df.iterrows()):
    country = row[1].country_name
    if isinstance(row[1].athletes, list):
        for name in row[1].athletes:
            if name in registered_athletes:
                all_names.append(name)
                all_countries.append(country)
    else:
        continue

dict_df = {'athlete':all_names, 'country':all_countries}
NATIONALITY = pd.DataFrame(dict_df)
NATIONALITY = NATIONALITY.drop_duplicates(ignore_index=True)

# Save in a csv file.
NATIONALITY.to_csv('csv/NATIONALITY.csv', index = False)

162804it [00:10, 14949.96it/s]


In [71]:
for row in nationality_df.iterrows():
    if type(row[1][0]) == list:
        if row[1][0][0] == 'Amy PIETERS':
            print(row)

(6248, athletes        [Amy PIETERS, Kirsten WILD]
country_name                    Netherlands
Name: 6248, dtype: object)
(6249, athletes        [Amy PIETERS, Kirsten WILD]
country_name                         France
Name: 6249, dtype: object)
(6250, athletes        [Amy PIETERS, Kirsten WILD]
country_name                         Poland
Name: 6250, dtype: object)
(6251, athletes        [Amy PIETERS, Kirsten WILD]
country_name                      Australia
Name: 6251, dtype: object)
(6252, athletes        [Amy PIETERS, Kirsten WILD]
country_name                          Italy
Name: 6252, dtype: object)
(6253, athletes        [Amy PIETERS, Kirsten WILD]
country_name       United States of America
Name: 6253, dtype: object)
(6254, athletes        [Amy PIETERS, Kirsten WILD]
country_name                        Belgium
Name: 6254, dtype: object)
(6255, athletes        [Amy PIETERS, Kirsten WILD]
country_name                    New Zealand
Name: 6255, dtype: object)
(6256, athletes        [

In [48]:
results.athletes[0]
a = results
a.athletes = list(map(a.athletes, str))

TypeError: 'type' object is not iterable

In [49]:
results[results.athletes=='Amy PIETERS']

Unnamed: 0,discipline_title,event_title,slug_game,participant_type,medal_type,athletes,rank_position,country_name,country_code


In [19]:
athletes

Unnamed: 0,athlete_url,athlete_full_name,games_participations,first_game,athlete_year_birth,G,S,B
0,https://olympics.com/en/athletes/cooper-woods-...,Cooper WOODS-TOPALOVIC,1,Beijing 2022,2000,0,0,0
1,https://olympics.com/en/athletes/elofsson,Felix ELOFSSON,2,PyeongChang 2018,1995,0,0,0
2,https://olympics.com/en/athletes/dylan-walczyk,Dylan WALCZYK,1,Beijing 2022,1993,0,0,0
3,https://olympics.com/en/athletes/olli-penttala,Olli PENTTALA,1,Beijing 2022,1995,0,0,0
4,https://olympics.com/en/athletes/reikherd,Dmitriy REIKHERD,1,Beijing 2022,1989,0,0,0
...,...,...,...,...,...,...,...,...
75899,https://olympics.com/en/athletes/douglas-weigle,Douglas WEIGLE,1,Innsbruck 1976,1955,0,0,0
75900,https://olympics.com/en/athletes/stefania-bertele,Stefania BERTELE,1,Innsbruck 1976,1957,0,0,0
75901,https://olympics.com/en/athletes/walter-cecconi,Walter CECCONI,1,Innsbruck 1976,1957,0,0,0
75902,https://olympics.com/en/athletes/susan-kelley,Susan KELLEY,1,Innsbruck 1976,1954,0,0,0


In [34]:
a = NATIONALITY.groupby('athlete').count()

In [39]:
NATIONALITY[NATIONALITY.athlete=='Amy PIETERS']

Unnamed: 0,athlete,country
4069,Amy PIETERS,Netherlands
4071,Amy PIETERS,France
4073,Amy PIETERS,Poland
4075,Amy PIETERS,Australia
4077,Amy PIETERS,Italy
4079,Amy PIETERS,United States of America
4081,Amy PIETERS,Belgium
4083,Amy PIETERS,New Zealand
4085,Amy PIETERS,Germany
4087,Amy PIETERS,Ireland


In [38]:
a[a.country==12]

Unnamed: 0_level_0,country
athlete,Unnamed: 1_level_1
Amy PIETERS,12
Kirsten WILD,12


### 6) WIN_MEDAL_IN

In [12]:
# Unify country names.
medals.country_name = medals.country_name.apply(lambda x: country_unifier(x))

# Remove duplicate column (check country_name).
medals = medals.drop('participant_title', axis=1)

all_discipline, all_slug, all_event, all_event_gender, all_medal, all_partecipant_type, all_athlete = [], [], [], [], [], [], []

# Read all the rows.
for row in tqdm(medals.iterrows()):
    discipline, slug, event, event_gender, medal, partecipant_type, athlete, _, _ = row[1]
    all_discipline.append(discipline)
    all_slug.append(slug)
    all_event.append(event)
    all_event_gender.append(event_gender)
    all_medal.append(medal)
    all_partecipant_type.append(partecipant_type)
    all_athlete.append(athlete)

# Create new dataframe.
dict_df = {'athlete': all_athlete, 'discipline':all_discipline, 'slug':all_slug, 'event':all_event, 'event_gender':all_event_gender, 'medal':all_medal, 'partecipant_type':all_partecipant_type}

WIN_MEDAL_IN = pd.DataFrame(dict_df)
WIN_MEDAL_IN = WIN_MEDAL_IN.dropna()

# Save in a csv file.
WIN_MEDAL_IN.to_csv('csv/WIN_MEDAL_IN.csv', index = False)

21697it [00:02, 8793.92it/s]


### 7) PARTECIPATE

In [13]:
def clear_rank(row):
    if row == 'DNS':
        return 'NA'
    else:
        return row
    
new_results = results[results['athletes'].notna()]
new_results = new_results.drop('medal_type', axis=1)

results.rank_position = results.rank_position.apply(lambda x: clear_rank(x))


In [14]:
all_discipline, all_event, all_slug, all_partecipant_type, all_athlete, all_rank = [], [], [], [], [], []
registered_athletes = set(athletes.athlete_full_name)

# Read all the rows.
for row in tqdm(new_results.iterrows()):
    discipline, event_title, slug, partecipant_type, athlete, rank, _, _ = row[1]
    for name in athlete:
        if name in registered_athletes:
            all_discipline.append(discipline)
            all_event.append(event_title)
            all_slug.append(slug)
            all_partecipant_type.append(partecipant_type)
            all_athlete.append(name)
            all_rank.append(rank)
        else:
            continue

dict_df = {'athlete': all_athlete, 'discipline':all_discipline, 'slug':all_slug, 'event':all_event, 'partecipant_type':all_partecipant_type, 'rank':all_rank}

PARTECIPATE = pd.DataFrame(dict_df)

# Save in a csv file.
PARTECIPATE.to_csv('csv/PARTECIPATE.csv', index = False)

149622it [00:19, 7511.62it/s]


### 8) IN_TEAM_WITH

In [15]:
team_df = results[['discipline_title', 'slug_game', 'athletes', 'participant_type']]
team_df = team_df[team_df.participant_type == 'GameTeam']
team_df = team_df[team_df['athletes'].notna()]

In [16]:
all_discipline, all_slug, all_athlete1, all_athlete2 = [], [], [], []
registered_athletes = set(athletes.athlete_full_name)

# Read all the rows.
for row in tqdm(team_df.iterrows()):
    discipline, slug, athlete, _ = row[1]
    all_names_value =sum([name in registered_athletes for name in athlete])
    if all_names_value !=0:
        all_discipline.append(discipline)
        all_slug.append(slug)
        all_athlete1.append(athlete[0])
        all_athlete2.append(athlete[1])


    else:
        continue

dict_df = {'athlete_1': all_athlete1, 'athlete_2': all_athlete2, 'discipline':all_discipline, 'slug':all_slug}
IN_TEAM_WITH = pd.DataFrame(dict_df)

# Save in a csv file.
IN_TEAM_WITH.to_csv('csv/IN_TEAM_WITH.csv', index = False)

7976it [00:01, 7285.03it/s]
