In [1]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from alive_progress import alive_bar
from bs4 import BeautifulSoup as bs

In [2]:
# Search/tracking page of the mikatiming results site (2024 edition, per the URL).
# get_results_by_bib_number() POSTs a bib number to this address.
url = "https://paris-mpt.r.mikatiming.de/2024/?pid=search&pidp=tracking"

*Récupération des informations pour un participant, à partir de son numéro de dossard*

In [3]:
def _require(node, description):
    """Return ``node`` unchanged, or raise ValueError naming the element that is missing."""
    if node is None:
        raise ValueError(f"results page has no {description}")
    return node


def get_results_by_bib_number(bib_number, timeout=30):
    """Fetch one participant's results from the tracking page, given a bib number.

    Retrieved fields: the participant's name, bib number, times for the first
    and second half, total times (net and official), and the intermediate
    times every 5 km.

    Args:
        bib_number: bib number to look up; sent as the ``search[start_no]``
            form field.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the caller forever.

    Returns:
        dict mapping ``name``, ``bib_number``, ``half_1st``, ``half_second``,
        ``time_total_netto``, ``time_total_brutto`` and ``5km`` ... ``40km``
        to the stripped text of the corresponding table cell.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
        ValueError: if an expected cell or row is absent from the returned
            page (for instance when the search finds no participant).
    """
    res = requests.post(url, {'search[start_no]': bib_number}, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were results.
    res.raise_for_status()
    soup = bs(res.content, 'html.parser')

    # Result key -> class attribute of the <td> holding the value.
    summary_cells = {
        'name': 'f-__fullname last',
        'bib_number': 'f-start_no_text last',
        'half_1st': 'f-time_06 last',
        'half_second': 'f-time_19 last',
        'time_total_netto': 'f-time_finish_netto last',
        'time_total_brutto': 'f-time_finish_brutto last',
    }
    # Result key -> class attribute of the <tr> whose 'time' cell holds the split.
    split_rows = {
        '5km': 'f-time_02',
        '10km': 'list-highlight f-time_03',
        '15km': 'f-time_04',
        '20km': 'list-highlight f-time_05',
        '25km': 'list-highlight f-time_07',
        '30km': 'f-time_08',
        '35km': 'list-highlight f-time_09',
        '40km': 'f-time_10',
    }

    scr_data = {}
    for key, css_class in summary_cells.items():
        cell = _require(soup.find('td', class_=css_class), f"<td class={css_class!r}>")
        scr_data[key] = cell.text.strip()

    for key, css_class in split_rows.items():
        row = _require(soup.find('tr', class_=css_class), f"<tr class={css_class!r}>")
        cell = _require(row.find('td', class_='time'), f"'time' cell in <tr class={css_class!r}>")
        scr_data[key] = cell.text.strip()

    return scr_data

In [4]:
get_results_by_bib_number(15212)

{'name': 'GIACRI, Laurent',
 'bib_number': '15212',
 'half_1st': '02:24:36',
 'half_second': '02:58:18',
 'time_total_netto': '05:22:54',
 'time_total_brutto': '06:33:43',
 '5km': '00:32:30',
 '10km': '01:05:36',
 '15km': '01:39:24',
 '20km': '02:16:10',
 '25km': '02:51:50',
 '30km': '03:30:00',
 '35km': '04:08:07',
 '40km': '05:03:24'}

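*Récupération des informations pour tous les participants (20024 participants) : les numéros de dossard de 1 à 20999 sont interrogés un par un, et ceux pour lesquels la récupération échoue sont marqués « NC ».*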

In [5]:
# Query every bib number from 1 to total_nb - 1, one request per bib.
total_nb = 21000

# Keys of a result row, in the order get_results_by_bib_number() returns them;
# used to build the placeholder row for bibs that could not be retrieved.
result_keys = ['name', 'bib_number', 'half_1st', 'half_second', 'time_total_netto', 'time_total_brutto',
               '5km', '10km', '15km', '20km', '25km', '30km', '35km', '40km']

tot_results = []
with alive_bar(total_nb - 1, force_tty=True) as bar:
    # range() yields plain Python ints, so placeholder rows do not carry numpy scalars.
    for bib_number in range(1, total_nb):
        try:
            tot_results.append(get_results_by_bib_number(bib_number))
        except Exception:
            # `except Exception` rather than a bare `except:` so that interrupting
            # the kernel (KeyboardInterrupt) still stops this multi-hour loop.
            # Any failure for this bib is recorded as an 'NC' row so that the
            # output keeps one row per queried bib number.
            placeholder = dict.fromkeys(result_keys, 'NC')
            placeholder['bib_number'] = bib_number
            tot_results.append(placeholder)
        bar()

|████████████████████████████████████████| 20999/20999 [100%] in 2:56:01.1 (1.99 ▂▂▄ 11/20999 [0%] in 7s (~3:53:29, 1. ▁▃▅ 291/20999 [1%] in 2:38 (~3:07:41, ▂▂▄ 308/20999 [1%] in 2:47 (~3:07:31,


In [7]:
# Turn the list of per-bib result dictionaries into a dataframe (one row per bib)
df = pd.DataFrame(data=tot_results)
df.head(n=20)

Unnamed: 0,name,bib_number,half_1st,half_second,time_total_netto,time_total_brutto,5km,10km,15km,20km,25km,30km,35km,40km
0,"DURAND, Yohan",1,01:37:38,02:02:11,03:39:49,04:00:19,00:23:03,00:45:41,01:08:12,01:32:26,01:59:49,02:31:33,03:01:20,03:28:04
1,"LUIS, Vincent",2,–,–,–,–,–,–,–,–,–,–,–,–
2,NC,3,NC,NC,NC,NC,NC,NC,NC,NC,NC,NC,NC,NC
3,NC,4,NC,NC,NC,NC,NC,NC,NC,NC,NC,NC,NC,NC
4,NC,5,NC,NC,NC,NC,NC,NC,NC,NC,NC,NC,NC,NC
5,NC,6,NC,NC,NC,NC,NC,NC,NC,NC,NC,NC,NC,NC
6,"DURIEZ, Claude-Henri",7,02:06:12,02:40:57,04:47:08,05:07:38,00:26:36,00:53:22,01:24:31,01:57:51,02:34:24,03:14:10,03:49:16,04:31:21
7,"WARD, Jared",8,01:11:17,01:13:25,02:24:41,02:24:43,00:16:35,00:33:02,00:49:34,01:07:26,01:24:27,01:42:37,01:59:26,02:16:58
8,"HORN, Mike",9,02:17:07,02:56:18,05:13:24,05:33:56,00:28:02,00:58:55,01:30:55,02:08:44,02:46:06,03:28:06,04:10:21,04:55:04
9,NC,10,NC,NC,NC,NC,NC,NC,NC,NC,NC,NC,NC,NC


In [8]:
# Save the dataframe
output_path = Path('./data/raw_data_marathon.csv')
# to_csv() does not create missing directories; make sure ./data exists so the
# save cannot fail after the long scraping run.
output_path.parent.mkdir(parents=True, exist_ok=True)
# The row index is written as an unnamed first column (unchanged from before),
# so read the file back with index_col=0 to avoid an 'Unnamed: 0' column.
df.to_csv(output_path)