# Predicción del resultado de los eventos de 100 metros libres masculino y femenino en los Juegos Olímpicos París 2024

In [1]:
import numpy as np
import pandas as pd
from utils import *
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import LeaveOneOut, GridSearchCV

df = pd.read_csv('data.csv')
df

Unnamed: 0,meet_name,swim_time,swim_date,full_desc,team_code,team_short_name,full_name_computed,gender,birth_date,event_id,standard_name,RANK,Rank_Order,fina_points,meet_city,country_code
0,Berlin Swim Open 2023,14:34.89,21/04/2023,Men 1500 Freestyle LCM Male,GER,Germany,"WELLBROCK, Florian",M,19/08/1997,,,1,1,986,Berlin,GER
1,Malmsten Swim Open Stockholm 2023,14:34.91,14/04/2023,Men 1500 Freestyle LCM Male,IRL,Ireland,"WIFFEN, Daniel",M,14/07/2001,,,2,2,986,Stockholm,SWE
2,32nd International Gothaer & Friends Meet,14:40.18,31/03/2023,Men 1500 Freestyle LCM Male,GER,Germany,"WELLBROCK, Florian",M,19/08/1997,,,3,3,969,Magedburg,GER
3,Malmsten Swim Open Stockholm 2023,14:40.21,14/04/2023,Men 1500 Freestyle LCM Male,UKR,Ukraine,"ROMANCHUK, Mykhailo",M,07/08/1996,,,4,4,969,Stockholm,SWE
4,Berlin Swim Open 2023,14:40.85,21/04/2023,Men 1500 Freestyle LCM Male,GER,Germany,"MARTENS, Lukas",M,27/12/2001,,,5,5,966,Berlin,GER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27995,Spanish Winter Open Swimming Championships,04:23.26,22/02/2024,Men 400 Medley LCM Male,ESP,Spain,"SANCHEZ SERRANO, Marcos",M,22/07/2006,,,196,196,781,Sabadell,ESP
27996,National Spring Age Group Swimming Championshi...,04:23.33,29/02/2024,Men 400 Medley LCM Male,CLB,Club,"HUNG, Pen Han",M,03/08/2002,,,197,197,780,Taichung,TPE
27997,Russian National Championships (50m),04:23.36,18/04/2024,Men 400 Medley LCM Male,CLB,Club,"GERKE, Sergey",M,14/07/2001,,,198,198,780,Kazan,RUS
27998,TYR Pro Swim Series San Antonio 2024,04:23.37,12/04/2024,Men 400 Medley LCM Male,CLB,Club,"BRIED, Tommy",M,05/06/2003,,,199,199,780,San Antonio,USA


In [31]:
# Se escogen todos los eventos del dataframe
events = df['full_desc'].unique()    

# Se convierten todas las fechas a formato dia/mes/año
df['swim_date'] = pd.to_datetime(df['swim_date'], format='%d/%m/%Y')    

# Se guardan los eventos de distancias largas (no tienen semifinales)
long_distance_events = ["Women 400 Freestyle LCM Female","Women 400 Medley LCM Female","Women 800 Freestyle LCM Female", "Men 400 Freestyle LCM Male","Men 400 Medley LCM Male","Men 1500 Freestyle LCM Male"]

# Se definen las columnas del dataframe con las que se trabajara
columns = ['event', 'place', 'name', 'country']

prediction = []

# Se definen 200 anchos de banda espaciados uniformemente entre 0.1 y 10.0
bandwidths = np.linspace(0.1, 10.0, 200)

def simulate(number_of_simulations, event):
    """"
    Simulamos <number_of_simulations> de veces el evento <event>
    """
    # Se escogen todas las marcas del evento especificado
    event_data = df[df['full_desc'] == event]
    
    # Se guardan los nombres de los atletas que tengan alguna marca en ese evento
    names = event_data['full_name_computed'].unique()
    competitors = []
    results = {}
    for name in names:
        # Se guarda toda la info del atleta en el evento
        athlete_data = event_data[event_data['full_name_computed'] == name]

        # Se crea el array de pesos del atleta con las fechas de cada marca
        weights = []
        for fecha in athlete_data['swim_date']:
            weights.append(date_to_value(fecha))

        # Se gurdan y estandarizan los tiempos del atleta
        athlete_times = athlete_data['swim_time'].apply(parse_time_to_seconds)
        athlete_times = athlete_times.apply(lambda mark: mark * (1 + 0.1/len(athlete_times)))
        athlete_times = athlete_times.values.reshape(-1, 1)
        
        # Se crea un diccionario con la cantidad de simulaciones en las que quedo en cada posicion
        results[name] = { 1:0,
                    2:0,
                    3:0,
                    4:0,
                    5:0,
                    6:0,
                    7:0,
                    8:0
                }
        
        # Se establece el cross-validation segun el numero de marcas registradas en el evento
        # y con eso se obtiene el mejor bandwidth para el kde
        if len(athlete_times) < 5:
            cv = LeaveOneOut()
        else:
            cv = 5

        if len(athlete_times) != 1:
            grid = GridSearchCV(KernelDensity(),
                        {'bandwidth': bandwidths},
                        cv=cv)  
            grid.fit(athlete_times)  
            best_bandwidth = grid.best_estimator_.bandwidth
        else:
            best_bandwidth = 0.2
        #! print(f"Mejor ancho de banda encontrado para el atleta {name} en el evento {event} es: {best_bandwidth}")
        
        # Se aplica kde con el bandwidth obtenido y el kenrel de tipo tophat
        kde = KernelDensity(bandwidth=best_bandwidth, kernel='tophat')
        
        # Se entrena el modelo con los tiempos de los atletas y la funcion de peso
        kde.fit(athlete_times, weights)

        # Se guarda cada atleta con su funcion de kde
        competitors.append((name,kde))

    for i in range(number_of_simulations):
        # Se simula la primera ronda del evento
        heats_results = race_simulation_kde(competitors)

        # Si el evento no es de distancia larga se simula la semifinal con los primeros 16 de la ronda anterior
        if event not in long_distance_events:
            semifinalists = heats_results[:16]
            semifinals_results = race_simulation_kde(semifinalists)
            finalists = semifinals_results[:8]
        else:
            finalists = heats_results[:8]

        # Se simula la final con los 8 primeros de la ronda anterior
        final_results = race_simulation_kde(finalists)

        # Se establece que atleta quedo en cada posicion y se guarda en su respectivo diccionario de posiciones
        for j in range(8):
            athlete = final_results[j][0]
            results[athlete][j + 1] +=1

    # Se guardan los resultados ordenados
    sorted_results = dict(sorted(results.items(), key=sort_key))
    
    # Se transforma a lista
    sorted_results_list = [(name, scores) for name, scores in sorted_results.items()]

    # Se imprimen los resultados
    print(f"En el evento {event}:")
    print(f"{str('Nombre del atleta').center(40)} | {str('1ro').center(5)} | {str('2do').center(5)} | {str('3ro').center(5)} | {str('4to').center(5)} | {str('5to').center(5)} | {str('6to').center(5)} | {str('7mo').center(5)} | {str('8vo').center(5)}")
    for athlete in sorted_results_list:
        if athlete[1][1] == 0:
            break
        print(f"{str(athlete[0]).center(40)} | {str(athlete[1][1]).center(5)} | {str(athlete[1][2]).center(5)} | {str(athlete[1][3]).center(5)} | {str(athlete[1][4]).center(5)} | {str(athlete[1][5]).center(5)} | {str(athlete[1][6]).center(5)} | {str(athlete[1][7]).center(5)} | {str(athlete[1][8]).center(5)}")
    print('\n')

    # Se guardan los resultados
    for i in range(8):
        prediction.append([event, f"{i+1}", sorted_results_list[i][0]])




In [32]:
# Se simula el evento de 100 metros libres masculino y femenino 1000 veces cada uno
for event in ('Men 100 Freestyle LCM Male', 'Women 100 Freestyle LCM Female'):
     simulate(1000, event)

En el evento Men 100 Freestyle LCM Male:
           Nombre del atleta             |  1ro  |  2do  |  3ro  |  4to  |  5to  |  6to  |  7mo  |  8vo 
              PAN, Zhanle                |  414  |  222  |  113  |   65  |   41  |   20  |   20  |   8  
            POPOVICI, David              |  299  |  193  |   96  |   68  |   41  |   35  |   20  |   24 
             CHALMERS, Kyle              |   90  |  115  |   90  |   73  |   62  |   45  |   48  |   35 
          MIRESSI, Alessandro            |   47  |   89  |   98  |   76  |   65  |   61  |   53  |   30 
           RICHARDS, Matthew             |   44  |   61  |   71  |   40  |   38  |   42  |   34  |   72 
            GROUSSET, Maxime             |   32  |   50  |   41  |   37  |   38  |   44  |   44  |   38 
             NEMETH, Nandor              |   30  |   72  |  104  |   96  |   66  |   64  |   52  |   34 
              ALEXY, Jack                |   16  |   27  |   25  |   31  |   32  |   38  |   33  |   57 
              