In [1]:
import ast
import math

import numpy as np
import pandas as pd

## Athletes dataframe

In [4]:
# Load the athletes table; this file uses ';' as its field separator.
athletes = pd.read_csv(
    "data/olympic_athletes.csv",
    sep=";",
)

In [13]:
def nan_to_0(row):
    """Map a NaN value to 0 and pass every other value through untouched.

    Parameters
    ----------
    row : float or int
        A single cell value (despite the parameter name, not a whole row).
        It must be a type that ``math.isnan`` accepts.

    Returns
    -------
    ``0`` when ``row`` is NaN, otherwise ``row`` itself.
    """
    return 0 if math.isnan(row) else row
    
def nan_to_NA(row):
    """Replace a missing value with the ``'NA'`` placeholder.

    Parameters
    ----------
    row : object
        A single cell value (despite the parameter name, not a whole row).

    Returns
    -------
    ``'NA'`` when ``row`` is ``None`` or NaN; otherwise ``row`` unchanged.
    Previously any value that was neither a string nor NaN fell off the end of
    the function and was silently turned into ``None``.
    """
    if row is None:
        return 'NA'
    if isinstance(row, str):
        return row
    try:
        missing = math.isnan(row)
    except TypeError:
        # Not a real number, so it cannot be NaN; keep the value as it is.
        missing = False
    return 'NA' if missing else row

def nan_to_NA_or_int(row):
    """Replace a missing value with ``'NA'`` and convert numbers to ``int``.

    Parameters
    ----------
    row : object
        A single cell value (despite the parameter name, not a whole row).

    Returns
    -------
    * ``'NA'`` when ``row`` is ``None`` or NaN;
    * ``row`` unchanged when it is already a string (this includes the
      ``'NA'`` placeholder, so applying the function twice is harmless);
    * ``int(row)`` for every other value.

    Previously an ``int`` input returned ``None`` and a string input raised
    ``TypeError`` inside ``math.isnan``.
    """
    if row is None:
        return 'NA'
    if isinstance(row, str):
        return row
    if math.isnan(row):
        return 'NA'
    return int(row)


# Medal-count columns: a missing count becomes 0.
for medal_col in ("G", "S", "B"):
    athletes[medal_col] = athletes[medal_col].apply(nan_to_0)

# Missing first game / birth year become the 'NA' placeholder that the
# export loop checks for; present birth years are converted to int.
athletes["first_game"] = athletes["first_game"].apply(nan_to_NA)
athletes["athlete_year_birth"] = athletes["athlete_year_birth"].apply(nan_to_NA_or_int)

In [17]:
# Write one Cypher CREATE statement per athlete to insert_athletes.txt.
with open('insert_athletes.txt', 'w', encoding="utf-8") as f:
    for _, record in athletes.iterrows():
        # NOTE(review): positional unpacking requires exactly these eight
        # columns in this order; confirm against the CSV header.
        url, name, participations, first_game, birth, G, S, B = record

        # Escape backslashes and the delimiting quote character so a value that
        # contains them cannot end the Cypher string literal early.
        name = str(name).replace('\\', '\\\\').replace('"', '\\"')
        url = str(url).replace('\\', '\\\\').replace("'", "\\'")

        attrs = f'''name:"{name}", url:'{url}', games_participations:{participations}'''
        # Optional properties are left out for the 'NA' placeholder or a zero count.
        if first_game != 'NA':
            first_game = str(first_game).replace('\\', '\\\\').replace("'", "\\'")
            attrs += f", first_game:'{first_game}'"
        if birth != 'NA':
            attrs += f", birth_year:{birth}"
        if G != 0:
            attrs += f", gold:{int(G)}"
        if S != 0:
            attrs += f", silver:{int(S)}"
        if B != 0:
            attrs += f", bronze:{int(B)}"

        # The doubled braces are the literal { and } around the property map.
        f.write(f'CREATE (:Athlete {{{attrs}}})\n')

## Hosts dataframe

In [18]:
# Load the hosts table; this file uses ',' as its field separator.
hosts = pd.read_csv(
    "data/olympic_hosts.csv",
    sep=",",
)

In [60]:
# Write one Cypher CREATE statement per host row to insert_hosts.txt.
with open('insert_hosts.txt', 'w', encoding="utf-8") as f:
    for _, record in hosts.iterrows():
        # NOTE(review): positional unpacking requires exactly these seven
        # columns in this order; confirm against the CSV header.
        # The location value is read but not written to the output.
        slug, end_date, start_date, _location, name, season, year = record

        # Escape backslashes and single quotes so a value that contains them
        # cannot end the single-quoted Cypher string literal early.
        slug, name, season = (
            str(value).replace('\\', '\\\\').replace("'", "\\'")
            for value in (slug, name, season)
        )

        # NOTE(review): end_date and start_date are emitted without quotes, as
        # in the original; confirm the CSV values are valid bare Cypher literals.
        attrs = (
            f"game_slug:'{slug}', end_date:{end_date}, start_date:{start_date}, "
            f"name:'{name}', season:'{season}', year:{year}"
        )
        # The doubled braces are the literal { and } around the property map.
        f.write(f'CREATE (:Host {{{attrs}}})\n')

## Medals dataframe

In [22]:
# Load the medals table; this file uses ';' as its field separator.
medals = pd.read_csv(
    "data/olympic_medals.csv",
    sep=";",
)

In [62]:
# Display the medals table.
medals

Unnamed: 0,discipline_title,slug_game,event_title,event_gender,medal_type,participant_type,participant_title,athlete,country_name,country_code
0,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,GameTeam,Italy,Stefania CONSTANTINI,Italy,ITA
1,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,GameTeam,Italy,Amos MOSANER,Italy,ITA
2,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,GameTeam,Norway,Kristin SKASLIEN,Norway,NOR
3,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,GameTeam,Norway,Magnus NEDREGOTTEN,Norway,NOR
4,Curling,beijing-2022,Mixed Doubles,Mixed,BRONZE,GameTeam,Sweden,Almida DE VAL,Sweden,SWE
...,...,...,...,...,...,...,...,...,...,...
21692,Weightlifting,athens-1896,heavyweight - one hand lift men,Men,SILVER,Athlete,,Viggo JENSEN,Denmark,DEN
21693,Weightlifting,athens-1896,heavyweight - one hand lift men,Men,BRONZE,Athlete,,Alexandros Nikolopoulos,Greece,GRE
21694,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,GOLD,Athlete,,Viggo JENSEN,Denmark,DEN
21695,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,SILVER,Athlete,,Launceston ELLIOT,Great Britain,GBR


In [23]:
# Report, for each column, whether it contains at least one missing value.
medals.isna().any()

discipline_title     False
slug_game            False
event_title          False
event_gender         False
medal_type           False
participant_type     False
participant_title     True
athlete               True
country_name          True
country_code          True
dtype: bool

In [56]:
# Create country instances: one node per distinct (name, code) pair.
# Rows where either field is missing are dropped first.
countries = medals[['country_name', 'country_code']].dropna().drop_duplicates()

with open('insert_countries.txt', 'w', encoding="utf-8") as f:
    for _, record in countries.iterrows():
        # Escape backslashes and double quotes so a value that contains them
        # cannot end the double-quoted Cypher string literal early.
        name, code = (
            str(value).replace('\\', '\\\\').replace('"', '\\"')
            for value in record
        )
        # The doubled braces are the literal { and } around the property map.
        f.write(f'CREATE (:Country {{name:"{name}", code:"{code}"}})\n')

In [None]:
# Create event_title instances.
# The assignment previously had no right-hand side, which is a SyntaxError
# and stops a full "Run All" of the notebook.
# NOTE(review): selecting only event_title mirrors the countries cell and is
# an assumption; confirm which columns identify an event.
events = medals[['event_title']].drop_duplicates()

In [23]:

# Planned steps (this cell does not implement them; it only displays the table):
#   - create discipline_title instances
#   - create event_title instances

medals

Unnamed: 0,discipline_title,slug_game,event_title,event_gender,medal_type,participant_type,participant_title,athlete,country_name,country_code
0,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,GameTeam,Italy,Stefania CONSTANTINI,Italy,ITA
1,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,GameTeam,Italy,Amos MOSANER,Italy,ITA
2,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,GameTeam,Norway,Kristin SKASLIEN,Norway,NOR
3,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,GameTeam,Norway,Magnus NEDREGOTTEN,Norway,NOR
4,Curling,beijing-2022,Mixed Doubles,Mixed,BRONZE,GameTeam,Sweden,Almida DE VAL,Sweden,SWE
...,...,...,...,...,...,...,...,...,...,...
21692,Weightlifting,athens-1896,heavyweight - one hand lift men,Men,SILVER,Athlete,,Viggo JENSEN,Denmark,DEN
21693,Weightlifting,athens-1896,heavyweight - one hand lift men,Men,BRONZE,Athlete,,Alexandros Nikolopoulos,Greece,GRE
21694,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,GOLD,Athlete,,Viggo JENSEN,Denmark,DEN
21695,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,SILVER,Athlete,,Launceston ELLIOT,Great Britain,GBR


## Results dataframe

In [20]:
# Load the results table; this file uses ';' as its field separator.
results = pd.read_csv(
    "data/olympic_results.csv",
    sep=";",
)

In [57]:
# Display the results table.
results

Unnamed: 0,discipline_title,event_title,slug_game,participant_type,medal_type,athletes,rank_position,country_name,country_code
0,Curling,Mixed Doubles,beijing-2022,GameTeam,GOLD,"[('Stefania CONSTANTINI', 'https://olympics.co...",1,Italy,ITA
1,Curling,Mixed Doubles,beijing-2022,GameTeam,SILVER,"[('Kristin SKASLIEN', 'https://olympics.com/en...",2,Norway,NOR
2,Curling,Mixed Doubles,beijing-2022,GameTeam,BRONZE,"[('Almida DE VAL', 'https://olympics.com/en/at...",3,Sweden,SWE
3,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Jennifer DODDS', 'https://olympics.com/en/a...",4,Great Britain,GBR
4,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Rachel HOMAN', 'https://olympics.com/en/ath...",5,Canada,CAN
...,...,...,...,...,...,...,...,...,...
162799,Ski Jumping,Normal Hill Individual men,chamonix-1924,Athlete,,"[('Franciszek BUJAK','https://olympics.com/en/...",DNS,Poland,POL
162800,Ski Jumping,Normal Hill Individual men,chamonix-1924,Athlete,,"[('Henryk Mückenbrunn','nan')]",DNS,Poland,POL
162801,Ski Jumping,Normal Hill Individual men,chamonix-1924,Athlete,,"[('Milda Prokopec','nan')]",DNS,Czechoslovakia,TCH
162802,Ski Jumping,Normal Hill Individual men,chamonix-1924,Athlete,,"[('Sigurd Overby','nan')]",DNS,United States of America,USA


In [3]:
def find_tuple_athl_url(row):
    """Parse one ``results.athletes`` cell into a list of ``(name, url)`` tuples.

    Parameters
    ----------
    row : str, list or NaN
        A single cell value (despite the parameter name, not a whole row).
        The text form is a Python-literal list of 2-tuples such as
        ``"[('Jane DOE', 'https://...'), ...]"``.

    Returns
    -------
    list of (name, url) tuples, or ``np.nan`` when the cell holds no text.

    Notes
    -----
    * A list input is returned unchanged, so re-running the conversion on an
      already-parsed column no longer raises ``AttributeError``.
    * Any non-string, non-list value is treated as missing. The former
      ``row is not np.nan`` identity test only recognised the ``np.nan``
      object itself and crashed on any other NaN.
    * The text is parsed with ``ast.literal_eval`` (literals only, no code
      execution), so names containing an apostrophe are kept intact. Text
      that is not a valid literal list of pairs falls back to the original
      quote-splitting logic.
    """
    if isinstance(row, list):
        return row
    if not isinstance(row, str):
        return np.nan

    try:
        parsed = ast.literal_eval(row)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)) and all(
        isinstance(item, (list, tuple)) and len(item) == 2 for item in parsed
    ):
        return [(name, url) for name, url in parsed]

    # Fallback: take every single-quoted substring and pair them up in order.
    quoted = row.split("'")[1::2]
    return list(zip(quoted[::2], quoted[1::2]))


# Replace each athletes cell with its parsed list of (name, url) tuples.
results["athletes"] = results["athletes"].apply(find_tuple_athl_url)