In [152]:
import os
import pandas as pd
import csv
from io import StringIO
import gdown
from bs4 import BeautifulSoup

In [153]:
# Convert event abbreviation to full text for readability
def get_event_type(event):
    event = event.strip('()').upper()

    EVENT_TYPES = {
        'GP': 'Grand Prix',
        'JO': 'Olympics',
        'SA': 'Satellite',
        'A': 'World Cup',
        'CHZ': 'Zonal Championship',
        'CHM': 'World Championship'
    }

    if event in EVENT_TYPES:
        return EVENT_TYPES[event]
    else:
        raise ValueError(f'Unknown event type: {event}')
    
# Get date, event host city and type of event
def get_tournament_data(tournament_data):
    tournament_split = tournament_data.split()
    date = tournament_split[0]
    event_name = ' '.join(tournament_split[1:-1])
    event_type = get_event_type(tournament_split[-1])

    return date, event_name, event_type

### Iterate through each data file

In [None]:
'''Iterate through a list of CSVs containing results of each fencer '''
current_dir = os.getcwd()
results_dir = os.path.join(current_dir, 'results')

if not os.listdir(results_dir):
    # Download from Google Drive
    gdown.download_folder(
        url='https://drive.google.com/drive/folders/1rkwkF7DOcJhgIw_KXJSV9w4qJN2rq9GT',
        output=results_dir,
        quiet=False,
        use_cookies=False
    )
else:
    data = []
    for results in os.listdir(results_dir):
        results_path = os.path.join(results_dir, results)
        if results_path.endswith('csv'):
            with open(results_path, 'r') as file:
                lines = file.readlines()

                tournaments = lines[3].strip()
                reader = csv.reader(StringIO(tournaments))
                tournaments = list(reader)[0][2:-1]

                for line in lines[4:504]:
                    line = line.strip()
                    columns = line.split(',')
                    current_rank = columns[0]
                    fencer_name = columns[1]
                    nationality = columns[2]
                
                    for i, tournament in enumerate(tournaments):
                        points_index = 3 + i # points start at column 3
                        points = columns[points_index]

                        if points:
                            points = abs(float(points))
                            date, event_name, event_type = get_tournament_data(tournament)
                            data.append([fencer_name, current_rank, nationality, event_name, event_type, points, date])

TypeError: list.append() takes exactly one argument (7 given)

### Store data to Panda dataframe

In [None]:
df = pd.DataFrame(data, columns=['fencer_name', 'current_rank', 'fencer_nationality', 'event_name', 'event_type', 'points', 'date'])
df = df.drop_duplicates(subset=['fencer_name', 'event_name', 'date'], keep='last')
df['date'] = pd.to_datetime(df['date'])
df.to_csv('data.csv', index=False)
df.head()

  df['date'] = pd.to_datetime(df['date'])


Unnamed: 0,fencer_name,current_rank,fencer_nationality,event_name,event_type,points,date
0,1,MASSIALAS Alexander,USA,Bonn,World Cup,2.0,2022-11-11
1,1,MASSIALAS Alexander,USA,Tokyo,World Cup,14.0,2022-09-12
2,1,MASSIALAS Alexander,USA,Paris,World Cup,32.0,2023-12-01
3,1,MASSIALAS Alexander,USA,Turin,Grand Prix,21.0,2023-11-02
4,1,MASSIALAS Alexander,USA,Cairo,World Cup,32.0,2023-02-23


Get additioanl data of each fencer using JSONS

In [None]:
import requests
import time
import fitz
import re


def Get_Fencer_JSON(name):
  url = "https://fie.org/athletes/search"
  payload = {"name": name}
  
  resp = requests.post(url, json=payload, timeout=10)
  resp.raise_for_status()
  return resp.json()

'''This code is actually mostly redundant as the true issue was with retrieving the wrong ID for fencers with the same target name'''
def fetch_dob(id, name, fencer_dict):
  dob = None
  url = f"https://fie.org/athletes/{id}/profile"
  headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
  }
  resp = requests.get(url, headers=headers, timeout=10)
  resp.raise_for_status()

  # This code was working, then stopped unknownly
  pdf = fitz.open(stream=resp.content, filetype="pdf")
  text = ""
  for page in pdf:
    text += page.get_text()
  
  print(text)
  pdf.close()

  # Use regex to find the date of birth in the text
  pattern = r"Date of birth:\s*\n?\s*(\d{4}-\d{2}-\d{2})"
  match = re.search(pattern, text)
  if match:
    dob = match.group(1)
    print(f"DOB found for {name}")
    print()
  else:
    print(f"Date of birth not found for {name} {id} {url}")
    print(fencer_dict)
    print()
  return dob

def retry_fetch(name, retries=3):
   # Try to fetch data with different variations of the name
      variations = [
        name.split()[0].capitalize() + ' ' + name.split()[1],
        name.split()[0] + ' ' + name.split()[1].lower(),
        # Add more variants if needed
      ]

      for variant in variations:
        print(f"Trying {variant}")
        results = Get_Fencer_JSON(variant)
        fencer_dict = list(results.values())[0][0]
        id = fencer_dict.get('id')
        dob = fencer_dict.get('date')
        if id and dob:
          print(f"Fetched data for {variant}")
          return id, dob

      # Try retrying with original name
      counter = 0
      while(not id or not dob) and counter < retries:
        time.sleep(2)
        counter += 1
        results = Get_Fencer_JSON(name)
        fencer_dict = list(results.values())[0][0]
        id = fencer_dict.get('id')
        dob = fencer_dict.get('date')
      if not id or not dob:
        print(f"Failed to fetch data for {name} after {retries} attempts")
        print()
      else:
        print(f"Fetched data for {name} after {counter} attempts")
        print()
      return None, None

  

In [None]:
names = df['fencer_name'].unique()
df['fie_ID'] = None
df['dob'] = None

''' Fetch FIE IDs and dates of birth for each fencer '''
# There are multiple irrelevant athlete profiles that have the same names of fencers, they often have incomplete JSON data
# We can check that we have the correct fencer by checking the level and weapon in the JSON data
for name in names:
  try:
    results = Get_Fencer_JSON(name)
    results = list(results.values())[0]
    for result in results:
      if result['level'] == 'S':
        fencer_dict = result
        break
      elif result['weapon'] == 'F':
        fencer_dict = result
        break
    if not fencer_dict:
      raise ValueError(f"Could not find correct data for {name}")

    # Ensure that id and dae of birth are present
    try:
      id = fencer_dict['id']
      if not id:
        raise ValueError(f"ID is empty for {name}")    
    except KeyError:
        raise KeyError(f"No id found for {name}")

    try:
      dob = fencer_dict['date']
      if not dob:
        raise ValueError(f"Date of birth is empty for {name}")
    except KeyError:
      raise KeyError(f"No date of birth found for {name}")

    df.loc[df['fencer_name'] == name, 'fie_ID'] = id
    df.loc[df['fencer_name'] == name, 'dob'] = dob

  except requests.exceptions.RequestException as e:
    raise(f"Request error for {name}")
  except IndexError as e:
    print(f"Index error for {name}: {e}")
    print(fencer_dict)
    print()
    continue


In [None]:
df.head()

Unnamed: 0,fencer_name,current_rank,fencer_nationality,event_name,event_type,points,date,fie_ID,dob
0,1,MASSIALAS Alexander,USA,Bonn,World Cup,2.0,2022-11-11,21760,1990-06-08
1,1,MASSIALAS Alexander,USA,Tokyo,World Cup,14.0,2022-09-12,21760,1990-06-08
2,1,MASSIALAS Alexander,USA,Paris,World Cup,32.0,2023-12-01,21760,1990-06-08
3,1,MASSIALAS Alexander,USA,Turin,Grand Prix,21.0,2023-11-02,21760,1990-06-08
4,1,MASSIALAS Alexander,USA,Cairo,World Cup,32.0,2023-02-23,21760,1990-06-08
