In [14]:
import os
import pandas as pd
import csv
from io import StringIO
import gdown

In [15]:
def get_event_type(event):
    event = event.strip('()').upper()

    if event == 'GP':
        return 'Grand Prix'
    elif event == 'JO':
        return 'Olympics'
    elif event == 'SA':
        return 'Satellite'
    elif event == 'A':
        return 'World Cup'
    elif event == 'CHZ':
        return 'Zonal Championship'
    elif event == 'CHM':
        return 'World Championship'
    else:
        raise ValueError(f'Unknown event type: {event}')


def get_tournament_data(tournament_data):
    tournament_split = tournament_data.split()
    date = tournament_split[0]
    event_name = ' '.join(tournament_split[1:-1])
    event_type = get_event_type(tournament_split[-1])

    return date, event_name, event_type

### Iterate through each data file

In [16]:
current_dir = os.getcwd()
results_dir = os.path.join(current_dir, 'results')

if not os.listdir(results_dir):
    # Download from Google Drive
    gdown.download_folder(
        url='https://drive.google.com/drive/folders/1rkwkF7DOcJhgIw_KXJSV9w4qJN2rq9GT',
        output=results_dir,
        quiet=False,
        use_cookies=False
    )
else:
    data = []
    for results in os.listdir(results_dir):
        results_path = os.path.join(results_dir, results)
        if results_path.endswith('csv'):
            with open(results_path, 'r') as file:
                lines = file.readlines()

                tournaments = lines[3].strip()
                reader = csv.reader(StringIO(tournaments))
                tournaments = list(reader)[0][2:-1]

                for line in lines[4:504]:
                    line = line.strip()
                    columns = line.split(',')
                    current_rank = columns[0]
                    fencer_name = columns[1]
                    nationality = columns[2]
                
                    for i, tournament in enumerate(tournaments):
                        points_index = 3 + i # points start at column 3
                        points = columns[points_index]

                        if points:
                            points = abs(float(points))
                            date, event_name, event_type = get_tournament_data(tournament)
                            data.append([current_rank, fencer_name, nationality, event_name, event_type, points, date])

Retrieving folder contents


Processing file 1Tcswef41fcXjbH6Rm7oLMW9YER_HwmqY Results-2008.csv
Processing file 1rDqvrEGm4TTg6bcmwfPd2VUUUGbT7Ltx Results-2009.csv
Processing file 1XQw7BC8ijU5ekZx98TKfGCq3zphZmisp Results-2010.csv
Processing file 1VBWvM3JITrLIkFcPopaMPDU8dg_usVmX Results-2011.csv
Processing file 1r62MNqR18L6fkpwRFbb0DQelWx41UVKG Results-2012.csv
Processing file 1LohLS7E8-uBnnp7Gb7fkKUWhjKvVIdCk Results-2013.csv
Processing file 11QfnOueUust-k2DyquM6VO81WzN9Mp4N Results-2014.csv
Processing file 1HhLiLXRNTe9Fd9Kkw4CY4cmOpu-_CWKS Results-2015.csv
Processing file 1SatFM8NRaHtlZWgv4HBQfsN6aRY5ZzLG Results-2016.csv
Processing file 1_2Md_SnSwSXnI-78oRj4cb7vxHwIBanH Results-2017.csv
Processing file 1GzUVCyMswo7Nd1f_27ZJ2t9WafrZJ62U Results-2018.csv
Processing file 1DAbK6wEnciTNpu5_7kfbX3tzvoR4Bvov Results-2019.csv
Processing file 1tpIG8AOrhZuxBUmxlmp-8W1AyAB0ALcv Results-2020.csv
Processing file 15OdjarsqgRnEDSpurMO5hmmxfhLcRUZd Results-2021.csv
Processing file 1PjYQgdnDlNd72SAv5rdeJ7G49mzMjhbk Results-2022

Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1Tcswef41fcXjbH6Rm7oLMW9YER_HwmqY
To: /Users/joe/projects/Fencing-Score-Predictor/results/Results-2008.csv
100%|██████████| 31.7k/31.7k [00:00<00:00, 29.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1rDqvrEGm4TTg6bcmwfPd2VUUUGbT7Ltx
To: /Users/joe/projects/Fencing-Score-Predictor/results/Results-2009.csv
100%|██████████| 30.2k/30.2k [00:00<00:00, 1.44MB/s]
Downloading...
From: https://drive.google.com/uc?id=1XQw7BC8ijU5ekZx98TKfGCq3zphZmisp
To: /Users/joe/projects/Fencing-Score-Predictor/results/Results-2010.csv
100%|██████████| 34.6k/34.6k [00:00<00:00, 3.23MB/s]
Downloading...
From: https://drive.google.com/uc?id=1VBWvM3JITrLIkFcPopaMPDU8dg_usVmX
To: /Users/joe/projects/Fencing-Score-Predictor/results/Results-2011.csv
100%|██████████| 27.1k/27.1k [00:00<00:00, 1.45MB/s]
Downloading...
From: https://drive.google.com/u

### Store data to Panda dataframe

In [13]:
df = pd.DataFrame(data, columns=['current_rank', 'fencer_name', 'fencer_nationality', 'event_name', 'event_type', 'points', 'date'])
df = df.drop_duplicates(subset=['fencer_name', 'event_name', 'date'], keep='last')
df['date'] = pd.to_datetime(df['date'])
df.to_csv('data.csv', index=False)
df

Unnamed: 0,current_rank,fencer_name,fencer_nationality,event_name,event_type,points,date
0,1,MASSIALAS Alexander,USA,Bonn,World Cup,2.0,2022-11-11
1,1,MASSIALAS Alexander,USA,Tokyo,World Cup,14.0,2022-09-12
2,1,MASSIALAS Alexander,USA,Paris,World Cup,32.0,2023-12-01
3,1,MASSIALAS Alexander,USA,Turin,Grand Prix,21.0,2023-11-02
4,1,MASSIALAS Alexander,USA,Cairo,World Cup,32.0,2023-02-23
...,...,...,...,...,...,...,...
33946,500,WILLIAMS Dean,AUS,Venise,Grand Prix,0.0,2012-03-03
33947,501,CHU David,GBR,Londres,Satellite,0.0,2011-04-12
33948,502,PARK Guy Meen,KOR,Séoul,World Cup,0.0,2012-05-18
33949,503,MANSON James,GBR,Londres,Satellite,0.0,2011-04-12
