<a href="https://colab.research.google.com/github/JoeL-8883/Fencing-Score-Predictor/blob/main/data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import csv
from io import StringIO
import gdown
from bs4 import BeautifulSoup

In [None]:
# Expand an event abbreviation into its full name for readability
def get_event_type(event):
    """Return the full event name for an abbreviation such as '(GP)' or 'a'.

    Surrounding parentheses are stripped and the lookup is case-insensitive.

    Raises:
        ValueError: if the normalised abbreviation is not a known event type.
    """
    code = event.strip('()').upper()

    full_names = {
        'GP': 'Grand Prix',
        'JO': 'Olympics',
        'SA': 'Satellite',
        'A': 'World Cup',
        'CHZ': 'Zonal Championship',
        'CHM': 'World Championship'
    }

    full_name = full_names.get(code)
    if full_name is None:
        # The message reports the normalised code, not the raw input
        raise ValueError(f'Unknown event type: {code}')
    return full_name

# Split a tournament description into date, host city and type of event
def get_tournament_data(tournament_data):
    """Return (date, host_city, event_type) parsed from a tournament string.

    The string is split on whitespace: the first token is the date, the last
    token is the event abbreviation (expanded via get_event_type) and every
    token in between is joined with single spaces to form the host city.
    """
    tokens = tournament_data.split()
    first_token, last_token = tokens[0], tokens[-1]
    city = ' '.join(tokens[1:-1])
    return first_token, city, get_event_type(last_token)

### Download/format raw data



In [None]:
# Iterate through a directory of CSVs containing the results of each fencer
current_dir = os.getcwd()
results_dir = os.path.join(current_dir, 'results')


def format_data():
    """Flatten every results CSV in ``results_dir`` into one list of rows.

    Each file is read as follows:
      * the fourth line is a header row; its cells from index 2 up to (but not
        including) the last cell are tournament descriptions,
      * lines 5 to 504 are data rows laid out as
        ``rank, fencer name, nationality, points per tournament...``
        with the points starting at column 3.

    Returns:
        A list of ``[fencer_name, rank, nationality, host_city, event_type,
        points, date]`` lists, one per fencer/tournament pair that has a
        points value. Points are stored as non-negative floats.

    Raises:
        ValueError: if a points cell is not numeric, or (from
            get_tournament_data) a tournament has an unknown event type.
    """
    data = []
    # os.listdir returns names in arbitrary order; sort them so the row order
    # (which later decides which duplicate row is kept) is reproducible.
    for results in sorted(os.listdir(results_dir)):
        results_path = os.path.join(results_dir, results)
        if not results_path.endswith('csv'):
            continue

        with open(results_path, 'r') as file:
            lines = file.readlines()

        # Without a fourth line there is no tournament header to parse
        if len(lines) < 4:
            print(f"Skipping {results_path}: no tournament header row")
            continue

        # The fourth line is a row of competition details
        header = next(csv.reader(StringIO(lines[3].strip())), [])
        tournaments = header[2:-1]
        print(results_path)

        # The 5th row has actual data, and we only want to include top athletes
        for line in lines[4:504]:
            line = line.strip()
            if not line:
                continue  # blank lines (e.g. at the end of the file) carry no data

            # Parse with the csv module, like the header, so quoted cells that
            # contain commas do not shift the columns
            columns = next(csv.reader(StringIO(line)), [])
            if len(columns) < 3:
                continue  # not a complete rank/name/nationality row
            rank, fencer_name, nationality = columns[:3]

            for i, tournament in enumerate(tournaments):
                points_index = 3 + i  # points start at column 3
                if points_index >= len(columns):
                    break  # row is shorter than the header; nothing more to read

                points = columns[points_index].strip()
                if not points:
                    continue

                points = abs(float(points))
                date, host_city, event_type = get_tournament_data(tournament)
                data.append([fencer_name, rank, nationality, host_city, event_type, points, date])
    return data

# Create the results directory if it doesn't exist
os.makedirs(results_dir, exist_ok=True)

# Download when the directory holds no CSVs yet. Checking only whether the
# directory exists would skip the download forever if an earlier attempt
# created the directory and then failed before any file arrived.
if not any(entry.endswith('csv') for entry in os.listdir(results_dir)):
    # Download from Google Drive
    gdown.download_folder(
        url='https://drive.google.com/drive/folders/1rkwkF7DOcJhgIw_KXJSV9w4qJN2rq9GT',
        output=results_dir,
        quiet=False,
        use_cookies=False
    )

# Parse every downloaded CSV into a flat list of result rows
data = format_data()

### Store data in a pandas DataFrame

In [None]:
# Build the results table, keep only the last-read row for each
# fencer/host city/date combination, and turn the dd.mm.yy date strings into
# real datetimes.
df = (
    pd.DataFrame(
        data,
        columns=['fencer_name', 'rank', 'nationality', 'host_city', 'event_type', 'points_earned', 'date'],
    )
    .drop_duplicates(subset=['fencer_name', 'host_city', 'date'], keep='last')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%d.%m.%y'))
)

# Persist the table and preview the first rows
df.to_csv('data.csv', index=False)
df.head()

Get the date of birth and FIE ID of each fencer from the JSON data

In [None]:
import requests
import time
import fitz
import re

# !!! This function has now been deprecated: it seems that the search function on the FIE site
# does not return any results when searching the full name.
# Furthermore, the JSON data now appears to be more incomplete, which could be due to rate limiting.
# Thankfully, the dataset has already been constructed, and can be found here
# https://www.kaggle.com/datasets/foillee/fencing-points
# However, this means that without a fix, the dataset cannot be updated.
def Get_Fencer_JSON(name):
  """Search the FIE athlete endpoint for `name` and return the decoded JSON.

  The callers in this notebook read each fencer's FIE ID and date of birth
  from the returned data. The request times out after 10 seconds, and an
  unsuccessful HTTP status is raised as an exception by raise_for_status().
  """
  response = requests.post(
      "https://fie.org/athletes/search",
      json={"name": name},
      timeout=10,
  )
  response.raise_for_status()
  return response.json()

# NOTE: this code is actually mostly redundant, as the true issue was with
# retrieving the wrong ID for fencers with the same target name.
def fetch_dob(id, name, fencer_dict):
  """Fetch a fencer's FIE profile and extract the date of birth from it.

  Args:
    id: FIE athlete ID used to build the profile URL. (The parameter name
      shadows the builtin but is kept so existing callers keep working.)
    name: Fencer name, used only in the printed messages.
    fencer_dict: The fencer's JSON record, printed for debugging when no
      date of birth is found.

  Returns:
    The date of birth as a 'YYYY-MM-DD' string, or None when the profile
    text contains no matching "Date of birth:" entry.

  Raises:
    Whatever requests raises for a failed request or an unsuccessful HTTP
    status (via raise_for_status()).
  """
  dob = None
  url = f"https://fie.org/athletes/{id}/profile"
  headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
  }
  resp = requests.get(url, headers=headers, timeout=10)
  resp.raise_for_status()

  # This code used to work and then stopped for an unknown reason
  pdf = fitz.open(stream=resp.content, filetype="pdf")
  try:
    text = "".join(page.get_text() for page in pdf)
  finally:
    # Always release the document, even if text extraction fails
    pdf.close()

  print(text)

  # Use regex to find the date of birth in the text
  pattern = r"Date of birth:\s*\n?\s*(\d{4}-\d{2}-\d{2})"
  match = re.search(pattern, text)
  if match:
    dob = match.group(1)
    print(f"DOB found for {name}")
    print()
  else:
    print(f"Date of birth not found for {name} {id} {url}")
    print(fencer_dict)
    print()
  return dob

def _first_id_and_dob(query):
  # Return (id, date) of the first search hit for `query`, or (None, None) when the search is empty.
  results = Get_Fencer_JSON(query)
  groups = list(results.values())
  if not groups or not groups[0]:
    return None, None
  first_hit = groups[0][0]
  return first_hit.get('id'), first_hit.get('date')


def retry_fetch(name, retries=3):
  """Look up a fencer's FIE ID and date of birth, trying harder than one call.

  First tries two re-cased variations of the first two words of `name`, then
  repeats the search with the original name up to `retries` times, waiting
  two seconds before each attempt.

  Args:
    name: Fencer name as it appears in the results data.
    retries: Maximum number of attempts with the original name.

  Returns:
    (id, dob) of the first search hit as soon as both are present, otherwise
    (None, None).
  """
  parts = name.split()

  # Try to fetch data with different variations of the name.
  # A single-word name has no second part to recombine, so it gets no variations.
  variations = []
  if len(parts) >= 2:
    variations = [
      parts[0].capitalize() + ' ' + parts[1],
      parts[0] + ' ' + parts[1].lower(),
      # Add more variants if needed
    ]

  for variant in variations:
    print(f"Trying {variant}")
    fencer_id, dob = _first_id_and_dob(variant)
    if fencer_id and dob:
      print(f"Fetched data for {variant}")
      return fencer_id, dob

  # Retry with the original name
  fencer_id = dob = None
  counter = 0
  while not (fencer_id and dob) and counter < retries:
    time.sleep(2)
    counter += 1
    fencer_id, dob = _first_id_and_dob(name)

  if not fencer_id or not dob:
    print(f"Failed to fetch data for {name} after {retries} attempts")
    print()
    return None, None

  print(f"Fetched data for {name} after {counter} attempts")
  print()
  # Hand the values back instead of discarding a successful retry
  return fencer_id, dob



In [None]:
# Note: this code takes a long time to run, as it fetches data for each fencer
# individually from the FIE website.
names = df['fencer_name'].unique()
df['fie_ID'] = None
df['dob'] = None

# Fetch FIE IDs and dates of birth for each fencer.
# There are multiple irrelevant athlete profiles that have the same names as fencers; they often
# have incomplete JSON data. We check that we have the correct fencer by checking the level and
# weapon in the JSON data.
max_retries = 3
for name in names:
  # Reset for every fencer so a match from a previous iteration can never be
  # written to this fencer by mistake.
  fencer_dict = None
  try:
    surname = name.split()[0]
    results = Get_Fencer_JSON(surname)
    json_data = list(results.values())
    # The list of profiles is nested in an extra list
    candidates = json_data[0] if json_data else []

    # Handle empty JSON data: retry with the full name
    if not candidates:
      for _ in range(max_retries):
        try:
          results = Get_Fencer_JSON(name)
          json_data = list(results.values())
          candidates = json_data[0] if json_data else []
          if candidates:
            break
        except requests.exceptions.RequestException:
          # Wait a little longer after a failed request
          time.sleep(2)
        time.sleep(2)

      if not candidates:
        print(f"No data found for {name} after {max_retries} retries")
        print("Warning - fencer will be removed from the dataset\n")
        df = df[df['fencer_name'] != name]
        continue

    # Validate that we have the correct fencer
    # Note: this assumes that senior male foil fencers have unique names
    for fencer in candidates:
      # .get() because the irrelevant profiles may lack some of these keys
      if fencer.get('level') == 'S' and fencer.get('weapon') == 'F' and fencer.get('name') == name:
        fencer_dict = fencer
        break

    if not fencer_dict:
      raise ValueError(f"Could not find correct data for {name}")

    # Ensure that id and date of birth are present
    try:
      fencer_id = fencer_dict['id']
    except KeyError:
      raise KeyError(f"No id found for {name}") from None
    if not fencer_id:
      raise ValueError(f"ID is empty for {name}")

    try:
      dob = fencer_dict['date']
    except KeyError:
      raise KeyError(f"No date of birth found for {name}") from None
    if not dob:
      raise ValueError(f"Date of birth is empty for {name}")

    fencer_rows = df['fencer_name'] == name
    df.loc[fencer_rows, 'fie_ID'] = fencer_id
    df.loc[fencer_rows, 'dob'] = dob

  except requests.exceptions.RequestException as e:
    # Raise a real exception (raising a plain string is itself a TypeError)
    # and keep the original request error attached as the cause.
    raise RuntimeError(f"Request error for {name}") from e
  # There are consistent index errors for a few fencers and their JSON data seems inconsistent
  # So it is fine to just remove them from the dataset as they are not key fencers
  except IndexError as e:
    print(f"Index error for {name}: {e}")
    print(fencer_dict)
    print("Warning - fencer will be removed from the dataset")
    df = df[df['fencer_name'] != name]
    print()
    continue


Determine ages of fencers at time of competition

In [None]:
# Verify that all fencers in df have a dob.
# Values that are missing or not in YYYY-MM-DD form become NaT.
df['dob'] = pd.to_datetime(df['dob'], format='%Y-%m-%d', errors='coerce')

# Some FIE profiles have an incorrect date of birth format, we just remove these fencers from the dataset
# i.e. see https://fie.org/athletes/38249/profile
missing_age_rows = df[df['dob'].isna()]
missing_fencer_names = missing_age_rows['fencer_name'].unique().tolist()

for name in missing_fencer_names:
    print(f"Removing fencer {name} due to missing or incorrect date of birth")

# Drop all affected fencers in one pass; copy so the new column below is
# assigned to an independent frame rather than a slice of the old one.
df = df[~df['fencer_name'].isin(missing_fencer_names)].copy()

# Age in completed years on the competition date. Dividing the day count by
# 365.25 is off by one on or near birthdays (365 days after birth floors to 0),
# so compare calendar fields instead.
birthday_pending = (
    (df['date'].dt.month < df['dob'].dt.month)
    | ((df['date'].dt.month == df['dob'].dt.month) & (df['date'].dt.day < df['dob'].dt.day))
)
df['age'] = (df['date'].dt.year - df['dob'].dt.year - birthday_pending.astype(int)).astype(int)


In [None]:
# Reorganise columns for better readability
new_column_order = [
    'fie_ID',
    'fencer_name',
    'age',
    'dob',
    'nationality',
    'rank',
    'host_city',
    'event_type',
    'points_earned',  # was misspelled 'points_earne', which is not a column and raised KeyError
    'date'
]

# Select the columns in the new order, then save and preview the final dataset
df = df[new_column_order]
df.to_csv('data.csv', index=False)
df.head()