In [2]:
import ast
import math
from itertools import combinations

import numpy as np
import pandas as pd
from tqdm import tqdm

#### Problems:
- Not all the athletes that are present in Results are registered in Athletes -> remove their names from Results
- Some athletes that have had results are not registered in Results -> no big deal
- Some athletes that win Medals are not registered in Athletes -> `???`

In [63]:
# Load the four raw tables. Only the hosts file is comma-separated (the
# pandas default separator); the other three use ';'.
athletes = pd.read_csv("data/olympic_athletes.csv", sep=';')
hosts = pd.read_csv("data/olympic_hosts.csv")
medals = pd.read_csv("data/olympic_medals.csv", sep=';')
results = pd.read_csv("data/olympic_results.csv", sep=';')

In [64]:
def country_unifier(row):
    """Map a known country-name variant to one unified spelling.

    Parameters
    ----------
    row : object
        A single country-name value.

    Returns
    -------
    object
        The unified name when ``row`` equals one of the known variants,
        otherwise ``row`` itself, unchanged.
    """
    renames = (
        ("People's Republic of China", 'China'),
        ("Hong Kong, China", 'Hong Kong'),
        ('United States', 'United States of America'),
        ('Australia, Sweden', 'Australia'),
        ('USSR', 'Soviet Union'),
    )
    for variant, unified in renames:
        if row == variant:
            return unified
    return row

## Nodes

### 1) Hosts

In [65]:
# Normalise the spelling of the host location with the shared name mapping.
hosts['game_location'] = hosts['game_location'].apply(country_unifier)

# Write the host nodes to csv, renaming the columns on export.
hosts.to_csv(
    'csv/Hosts.csv',
    index=False,
    header=['slug', 'end_date', 'start_date', 'location', 'name', 'season', 'year'],
)

### 2) Countries

In [66]:
# Get only list of names without url from results.athletes.
def find_tuple_athl_url(row):
    """Extract the athlete names from one ``results.athletes`` cell.

    Parameters
    ----------
    row : str, list or missing value
        Normally the text form of a list of ``(name, url)`` tuples, e.g.
        ``"[('A B', 'https://...'), ('C D', 'https://...')]"``.

    Returns
    -------
    list of str or float
        The names in their original order. A value that is already a list is
        returned unchanged, so the parsing cell can safely be re-run. Any other
        non-string value (NaN, ``None``, ...) yields ``np.nan``.
    """
    # Already parsed: keep the names as they are.
    if isinstance(row, list):
        return row
    # The former `row is not np.nan` identity test only recognised the np.nan
    # singleton; other NaN objects or None crashed on `.split`.
    if not isinstance(row, str):
        return np.nan

    # Parse the literal properly so that names containing an apostrophe
    # (written with double quotes in the literal) are not cut in pieces.
    try:
        parsed = ast.literal_eval(row)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, (list, tuple)) and all(
        isinstance(item, (list, tuple)) and len(item) == 2 and isinstance(item[0], str)
        for item in parsed
    ):
        return [item[0] for item in parsed]

    # Fallback for text that is not a literal list of pairs: the single-quoted
    # chunks alternate name, url, name, url, ... so keep every other one.
    quoted_chunks = row.split("'")[1::2]
    return quoted_chunks[::2]

# Turn each athletes cell into a plain list of names (urls are dropped).
results['athletes'] = results['athletes'].apply(find_tuple_athl_url)

# Normalise the country spelling with the shared name mapping.
results['country_name'] = results['country_name'].apply(country_unifier)

In [67]:
# One country node per distinct (name, code) pair; rows missing either
# field are dropped first.
countries = (
    results.loc[:, ['country_name', 'country_code']]
    .dropna()
    .drop_duplicates()
)

# Write the country nodes to csv with the exported column names.
countries.to_csv('csv/Countries.csv', header=['name', 'code'], index=False)

### 3) Athletes

In [68]:
# Preview the first two rows of the athletes table to check its columns.
athletes.head(2)

Unnamed: 0,athlete_url,athlete_full_name,games_participations,first_game,athlete_year_birth,G,S,B
0,https://olympics.com/en/athletes/cooper-woods-...,Cooper WOODS-TOPALOVIC,1,Beijing 2022,2000.0,,,
1,https://olympics.com/en/athletes/elofsson,Felix ELOFSSON,2,PyeongChang 2018,1995.0,,,


In [69]:
def nan_to_0(row):
    """Return ``0`` for a NaN value, otherwise the value converted to ``int``."""
    return 0 if math.isnan(row) else int(row)
    
def nan_to_NA(row):
    """Replace a missing value with the string ``'NA'``.

    Parameters
    ----------
    row : object
        A single cell value: a string, a number, ``None`` or NaN.

    Returns
    -------
    object
        ``'NA'`` when ``row`` is ``None`` or NaN, otherwise ``row`` unchanged.

    Raises
    ------
    TypeError
        If ``row`` is not a string, not ``None`` and not accepted by
        ``math.isnan``.
    """
    if isinstance(row, str):
        return row
    if row is None or math.isnan(row):
        return 'NA'
    # A valid non-string value used to fall through and silently become None;
    # keep it instead.
    return row

def nan_to_NA_or_int(row):
    """Convert a numeric value to ``int``, or to ``'NA'`` when it is missing.

    Parameters
    ----------
    row : object
        A single cell value: a number, NaN, ``None`` or a string.

    Returns
    -------
    int or str
        ``'NA'`` for ``None`` or NaN, ``int(row)`` for any other number.
        A string is returned unchanged, so values that were already
        converted (e.g. ``'NA'`` after a re-run of the cell) pass through.
    """
    if row is None:
        return 'NA'
    if isinstance(row, str):
        return row
    if math.isnan(row):
        return 'NA'
    # Any non-missing number is converted; previously only `float` instances
    # were, and other numeric types silently became None.
    return int(row)

# Replace NaN medal counts with 0 and store the counts as integers.
for medal_column in ('G', 'S', 'B'):
    athletes[medal_column] = athletes[medal_column].apply(nan_to_0)

# Missing first game -> 'NA'; birth year -> int, or 'NA' when missing.
athletes['first_game'] = athletes['first_game'].apply(nan_to_NA)
athletes['athlete_year_birth'] = athletes['athlete_year_birth'].apply(nan_to_NA_or_int)

In [70]:
# Write the athlete nodes to csv, renaming the columns on export.
athletes.to_csv(
    'csv/Athletes.csv',
    index=False,
    header=['url', 'name', 'partecipations', 'first_game', 'birth', 'G', 'S', 'B'],
)

## Edges

### 4) TAKE_PLACE_IN

In [71]:
# Each edge pairs a game slug with the location of that game.
TAKE_PLACE_IN = hosts.loc[:, ['game_slug', 'game_location']]

# Write the edges to csv with the exported column names.
TAKE_PLACE_IN.to_csv(
    'csv/TAKE_PLACE_IN.csv',
    header=['slug', 'location'],
    index=False,
)

### 5) NATIONALITY

For the athletes that appear in `Results` and `Medals`, we consider only those that are also present in the `Athletes` table.

In [72]:
# Get the nationality of the athletes in the Athletes table.
nationality_df = results[['athletes', 'country_name']]
registered_athletes = set(athletes.athlete_full_name)

# A result row may list several athletes (team events): emit one
# (athlete, country) pair per listed name that is registered in Athletes.
# Rows whose athletes value is not a list are skipped.
athlete_country_pairs = [
    (member, nation)
    for team, nation in tqdm(zip(nationality_df['athletes'], nationality_df['country_name']))
    if isinstance(team, list)
    for member in team
    if member in registered_athletes
]
all_names = [member for member, _nation in athlete_country_pairs]
all_countries = [nation for _member, nation in athlete_country_pairs]

# Build the edge table and keep each (athlete, country) pair only once.
dict_df = {'athlete': all_names, 'country': all_countries}
NATIONALITY = pd.DataFrame(dict_df).drop_duplicates(ignore_index=True)

# Save in a csv file.
NATIONALITY.to_csv('csv/NATIONALITY.csv', index=False)

162804it [00:20, 7916.57it/s]


### 6) WIN_MEDAL_IN

In [73]:
# Normalise the country spelling with the shared name mapping.
medals['country_name'] = medals['country_name'].apply(country_unifier)

# Remove duplicate column (check country_name).
medals = medals.drop(columns='participant_title')

all_discipline, all_slug, all_event, all_event_gender = [], [], [], []
all_medal, all_partecipant_type, all_athlete = [], [], []

# Copy the first seven fields of every medal row into per-column lists;
# the last two fields of each row are not used.
for _idx, medal_row in tqdm(medals.iterrows()):
    (discipline, slug, event, event_gender, medal,
     partecipant_type, athlete, _, _) = medal_row
    all_athlete.append(athlete)
    all_discipline.append(discipline)
    all_slug.append(slug)
    all_event.append(event)
    all_event_gender.append(event_gender)
    all_medal.append(medal)
    all_partecipant_type.append(partecipant_type)

# Assemble the edge table (athlete column first).
dict_df = {
    'athlete': all_athlete,
    'discipline': all_discipline,
    'slug': all_slug,
    'event': all_event,
    'event_gender': all_event_gender,
    'medal': all_medal,
    'partecipant_type': all_partecipant_type,
}

# Rows with any missing field are discarded before the export.
WIN_MEDAL_IN = pd.DataFrame(dict_df).dropna()

# Save in a csv file.
WIN_MEDAL_IN.to_csv('csv/WIN_MEDAL_IN.csv', index=False)

21697it [00:01, 11128.41it/s]


### 7) PARTECIPATE

In [74]:
def clear_rank(row):
    """Replace the ``'DNS'`` rank marker with ``'NA'``; return any other value unchanged."""
    return 'NA' if row == 'DNS' else row
    
# Clean the rank column BEFORE deriving `new_results`. The boolean filter
# below builds a new DataFrame, so cleaning `results` afterwards (as was done
# previously) never reached `new_results`, and 'DNS' ranks were exported as-is.
results.rank_position = results.rank_position.apply(clear_rank)

# Keep only the rows that list athletes and drop the medal column.
new_results = results[results['athletes'].notna()]
new_results = new_results.drop('medal_type', axis=1)


In [75]:
all_discipline, all_event, all_slug = [], [], []
all_partecipant_type, all_athlete, all_rank = [], [], []
registered_athletes = set(athletes.athlete_full_name)

# Emit one participation record per listed athlete that is registered in
# Athletes; the last two fields of each result row are not used.
for _idx, result_row in tqdm(new_results.iterrows()):
    (discipline, event_title, slug, partecipant_type,
     team, rank, _, _) = result_row
    for member in team:
        if member not in registered_athletes:
            continue
        all_athlete.append(member)
        all_discipline.append(discipline)
        all_slug.append(slug)
        all_event.append(event_title)
        all_partecipant_type.append(partecipant_type)
        all_rank.append(rank)

# Assemble the edge table (athlete column first).
dict_df = {
    'athlete': all_athlete,
    'discipline': all_discipline,
    'slug': all_slug,
    'event': all_event,
    'partecipant_type': all_partecipant_type,
    'rank': all_rank,
}
PARTECIPATE = pd.DataFrame(dict_df)

# Save in a csv file.
PARTECIPATE.to_csv('csv/PARTECIPATE.csv', index=False)

149622it [00:12, 12082.26it/s]


### 8) IN_TEAM_WITH

In [173]:
# Keep the columns needed for the team edges, restricted to 'GameTeam' rows
# that actually list their athletes.
team_df = results.loc[:, ['discipline_title', 'slug_game', 'athletes', 'participant_type']]
team_df = team_df[(team_df['participant_type'] == 'GameTeam') & team_df['athletes'].notna()]

In [174]:
all_discipline, all_slug, all_athlete1, all_athlete2 = [], [], [], []
registered_athletes = set(athletes.athlete_full_name)

# Read all the rows.
for _idx, team_row in tqdm(team_df.iterrows()):
    discipline, slug, athlete, _ = team_row

    # Keep only the team members that are registered in Athletes (each name
    # once, original order kept), so no edge points to a missing athlete.
    members = list(dict.fromkeys(
        name for name in athlete if name in registered_athletes
    ))

    # Link every pair of registered team-mates. Previously only athlete[0]
    # and athlete[1] were linked: larger teams lost all their other edges,
    # unregistered names could be exported, and a one-name list raised
    # IndexError. Teams with fewer than two registered members yield no edge.
    for athlete_1, athlete_2 in combinations(members, 2):
        all_discipline.append(discipline)
        all_slug.append(slug)
        all_athlete1.append(athlete_1)
        all_athlete2.append(athlete_2)

dict_df = {'athlete_1': all_athlete1, 'athlete_2': all_athlete2, 'discipline':all_discipline, 'slug':all_slug}
IN_TEAM_WITH = pd.DataFrame(dict_df)

# Save in a csv file.
IN_TEAM_WITH.to_csv('csv/IN_TEAM_WITH.csv', index = False)

7976it [00:00, 29235.47it/s]
