In [None]:
import pandas as pd
import numpy as np
import re
import glob
import os
import datetime
import matplotlib.pyplot as plt

### env variables

In [None]:
# Root of the scraped data; every other location below is derived from it.
BASE_PATH = '../data/pcs-scraping'

# Per-rider tables (each of these has one sub-entry per rider).
RESULTS_PATH = f'{BASE_PATH}/results/rider'
RANKINGS_PATH = f'{BASE_PATH}/pcs-rankings/rider'
TEAMS_PATH = f'{BASE_PATH}/teams/rider'

# Per-season / per-race tables.
CALENDARS_PATH = f'{BASE_PATH}/calendars'
STARTLISTS_PATH = f'{BASE_PATH}/startlists'
RACERESULTS_PATH = f'{BASE_PATH}/race_results'

### Load data

In [None]:
# The rider identifiers are taken from the column labels (header row) of the CSV.
# NOTE(review): presumably the file stores all rider slugs in its header row - confirm.
rider_names = pd.read_csv('../data/pcs-scraping/rider_names.csv').columns.tolist()

In [None]:
rider_name = rider_names[0]

# Nested mapping: rider -> {season year (as a 4-digit string) -> results table}.
df_rider = {rider_name: {}}

for file in glob.glob(os.path.join(RESULTS_PATH, rider_name, '*.csv')):

    # Search the file name only, so digits elsewhere in the path cannot be
    # mistaken for the season. The raw string avoids the invalid-escape
    # warning that a plain '\d' literal triggers on recent Python versions.
    year_match = re.search(r'\d{4}', os.path.basename(file))
    if year_match is None:
        # Not a per-season file; ignore it rather than crash on `.group`.
        continue

    year = year_match.group(0)
    df_rider[rider_name][year] = pd.read_csv(file)



In [None]:
# Display the 2021 results table of the selected rider.
# Raises KeyError when no file with '2021' in its name was found for that rider.
df_rider[rider_name]['2021']

# Expert model

- The competitor with the most victories / top-X finishes in the last N races is predicted as the race winner.

### Data Normalization

In [None]:
year = '2021'
df_season = {year: {}}   # raw season tables, keyed by year then rider
df_res = {}              # cleaned (Date, Result, Race) tables, keyed by rider

for rider_name in rider_names[:100]:
    # glob returns an empty list when the rider has no file for this season,
    # so such riders are silently skipped and never appear in df_res.
    for file in glob.glob(os.path.join(RESULTS_PATH, rider_name, f'{year}.csv')):

        df_season[year][rider_name] = pd.read_csv(file)

        # Keep only rows that have a date, a result and a race name.
        df_results = df_season[year][rider_name][['Date', 'Result', 'Race']].dropna(axis=0)

        # Coerce results to numbers: DNF / DNS / DSQ / OTL and any other
        # non-numeric status become NaN and are dropped, instead of crashing
        # int() on a status that was not explicitly listed. The chain builds a
        # new frame, so no value is ever written into a filtered view.
        df_results = (
            df_results
            .assign(Result=pd.to_numeric(df_results.Result, errors='coerce'))
            .dropna(subset=['Result'])
            .astype({'Result': int})
        )

        df_res[rider_name] = df_results

### Compute Model Features (Win Statistics)

In [None]:
def GetStats(df):
    """Count wins and top-N finishes in the ``Result`` column of ``df``.

    Returns a dict with the keys 'wins', 'top_3', 'top_5', 'top_10' and
    'top_20' (in that order); each value is the number of rows whose result
    equals 1, or is less than or equal to the respective cut-off.
    """
    placings = df.Result

    stats = {'wins': (placings == 1).sum()}
    for key, cutoff in (('top_3', 3), ('top_5', 5), ('top_10', 10), ('top_20', 20)):
        stats[key] = (placings <= cutoff).sum()

    return stats

In [None]:
year = '2021'
stats = {year: {}}   # season -> rider -> win / top-N counts

for rider_name in rider_names[:10]:

    # df_res only contains riders whose season file was found; skip the
    # others instead of aborting the whole cell with a KeyError.
    if rider_name not in df_res:
        continue

    stats[year][rider_name] = GetStats(df_res[rider_name])

In [None]:
# Spot-check the computed counts for one rider (KeyError if that rider has no entry).
stats['2021']['adam-yates']

### Rename result column with rider name

In [None]:
# NOTE(review): this renames the column inside df_res itself. GetStats reads
# `df.Result`, so any of these riders passed to GetStats / compute_features
# after this cell has run will no longer have that column - confirm intent.
for rider_name in rider_names[:10]:

    # Riders without a loaded season table are absent from df_res; skip them
    # rather than raising KeyError.
    if rider_name not in df_res:
        continue

    df_res[rider_name].rename(columns={'Result': f'Result-{rider_name}'}, inplace=True)

### Merge dataframes

In [None]:
# Inner-join the two riders' tables on the race name, keeping only races both appear in.
df_res['wout-van-aert'].merge(df_res['mathieu-van-der-poel'], on='Race')

# Compute features for each participant in the race

### Get race startlist

In [None]:
# Load the start list stored under <STARTLISTS_PATH>/giro-d-italia/2021/startlist.csv
# and preview its first rows.
startlist = pd.read_csv(os.path.join(STARTLISTS_PATH, 'giro-d-italia', '2021', 'startlist.csv'))
startlist.head()

### Normalize startlist names

In [None]:
def normalize_name(name):
    """Turn a start-list name into a lowercase, dash-separated rider slug.

    The name is split on spaces and dashes. A leading run of fully upper-case
    tokens is treated as the surname and moved behind the remaining tokens,
    keeping the word order inside each part, e.g.
    'VAN AERT Wout' -> 'wout-van-aert'. Reversing every token, as was done
    before, scrambles multi-word surnames ('wout-aert-van').

    When no such upper-case prefix can be identified (no leading upper-case
    token, or every token is upper-case) the tokens are simply reversed,
    which is the previous behaviour and is unchanged for two-token names.

    NOTE(review): assumes start lists write the surname first and in capitals
    - verify against the scraped startlist.csv files.
    """
    tokens = name.replace(' ', '-').split('-')

    # Count the leading upper-case tokens that make up the surname.
    surname_len = 0
    for token in tokens:
        if not token.isupper():
            break
        surname_len += 1

    if 0 < surname_len < len(tokens):
        ordered = tokens[surname_len:] + tokens[:surname_len]
    else:
        ordered = tokens[::-1]

    norm_name = '-'.join(ordered).lower()

    return norm_name

In [None]:
# Slug form of every name in the start list's Ridername column, in start-list order.
startlist_norm = list(map(normalize_name, startlist.Ridername))
startlist_norm

### Get competing riders

In [None]:
# Riders (among the first 100 known ones) that appear on the normalized start list.
riders_competing = []

for rider_name in rider_names[:100]:
    if rider_name not in startlist_norm:
        continue
    riders_competing.append(rider_name)

riders_competing

### Normalize dates to datetime

In [None]:
def convert_to_datetime(date_string, year):
    """Parse a 'DD.MM' string into a ``datetime.datetime`` in the given year.

    The year is parsed together with the day and month. Parsing '%d.%m' alone
    makes ``strptime`` assume the non-leap year 1900, so '29.02' was rejected
    before the year could be substituted.

    Args:
        date_string: day and month in 'DD.MM' form.
        year: the calendar year (an int, or anything ``int()`` accepts).

    Raises:
        ValueError: if the string does not form a valid date in that year.
    """
    return datetime.datetime.strptime(f'{date_string}.{int(year):04d}', '%d.%m.%Y')

def normalize_date_data(riders_competing, year=None):
    """Return per-rider result tables whose ``Date`` column holds datetimes.

    The tables are taken from the notebook-level ``df_res`` and copied, so
    ``df_res`` keeps its original date strings and this function can be run
    repeatedly (converting in place made a second run fail on dates that were
    already converted).

    Args:
        riders_competing: rider keys to look up in ``df_res``.
        year: season used to complete the day/month strings. When omitted, the
            notebook-level ``year`` variable is used, which is what this
            function always did before the parameter existed.

    Returns:
        dict mapping each rider to a new DataFrame with a converted ``Date``.

    Raises:
        KeyError: if a rider has no entry in ``df_res``.
    """
    if year is None:
        # Backward-compatible fallback to the notebook-level season.
        year = globals()['year']
    season = int(year)

    df = {}
    for rider in riders_competing:
        datetimes = [convert_to_datetime(date, season) for date in df_res[rider].Date]
        # assign() returns a new frame; the one stored in df_res is untouched.
        df[rider] = df_res[rider].assign(Date=datetimes)

    return df

def create_date_mask(df, event, year, timedelta, riders_competing):
    """Restrict each rider's table to the ``timedelta`` weeks before an event.

    The event date is read from ``<CALENDARS_PATH>/<year>.csv``: the first
    calendar row whose ``Race`` contains ``event`` is used, and the text after
    the last '- ' in its ``Date`` cell is parsed as 'DD.MM'.

    NOTE(review): if the ``Date`` cell is a 'start - end' range this picks the
    end, and the window is inclusive of that day, so the event's own results
    may be counted - confirm this is intended.

    Args:
        df: dict rider -> DataFrame with a datetime ``Date`` column. The dict
            is updated in place and also returned.
        event: text to look for in the calendar's ``Race`` column.
        year: season, used for both the calendar file name and the date.
        timedelta: window length in weeks.
        riders_competing: rider keys of ``df`` to filter.

    Raises:
        IndexError: if no calendar row matches ``event``.
    """
    calendar = pd.read_csv(os.path.join(CALENDARS_PATH, f'{year}.csv'))

    # Match the event name literally (no regex interpretation of characters
    # such as parentheses) and treat missing race names as non-matching.
    matches = calendar[calendar.Race.str.contains(event, regex=False, na=False)]
    if matches.empty:
        raise IndexError(f'no race matching {event!r} in the {year} calendar')

    event_date = matches.Date.values[0].split('- ')[-1]

    event_date = datetime.datetime.strptime(f'{event_date}.{year}', '%d.%m.%Y')
    start_date = event_date - datetime.timedelta(weeks=timedelta)

    for rider in riders_competing:
        # Half-open window: strictly after the start, up to and including the event date.
        mask = (df[rider].Date > start_date) & (df[rider].Date <= event_date)
        df[rider] = df[rider].loc[mask]

    return df

def compute_features(riders_competing, event, year, timedelta):
    """Compute win / top-N counts per rider over the weeks preceding an event.

    Args:
        riders_competing: riders to compute features for.
        event: text identifying the race in the season calendar.
        year: season of the event; now also used when converting the result
            dates, instead of whatever the notebook-level ``year`` held.
        timedelta: look-back window in weeks.

    Returns:
        dict mapping each rider to the dict returned by ``GetStats``.
    """
    df = normalize_date_data(riders_competing, year)

    df_mask = create_date_mask(df, event, year, timedelta, riders_competing)

    stats = {}
    for rider in riders_competing:
        stats[rider] = GetStats(df_mask[rider])

    return stats

In [None]:
# Win / top-N counts of every competing rider over the 12 weeks before the 2021 Giro d'Italia.
features = compute_features(
    riders_competing,
    event="Giro d'Italia",
    year='2021',
    timedelta=12,
)

In [None]:
# One weight per feature, applied in the order the feature dict yields its values.
weights = np.array([10, 8, 6, 4, 2])

# Each score is wrapped in a one-element list so that the dict can later be
# turned into a single-row DataFrame with one column per rider.
score = {}
for rider in riders_competing:
    score[rider] = [np.average(list(features[rider].values()), weights=weights)]

#score = pd.DataFrame({'Score': score}).sort_values(by='Score', ascending=False)

In [None]:
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(14, 14))

# Top panel: raw feature counts, one bar per rider for each feature.
pd.DataFrame(features).plot(kind='bar', ax=ax1, colormap='tab20c')
ax1.set_ylabel('Count')
ax1.legend(prop={'size': 7})

# Bottom panel: the weighted score, one bar per rider.
pd.DataFrame(score).plot(kind='bar', ax=ax2, colormap='tab20c')
ax2.set_ylabel('Score')
ax2.set_xticklabels(["Giro d'Italia 2021"], rotation=0)
ax2.legend(prop={'size': 8})

plt.show()

# Model Hyperparams

- Timeframe to consider when computing win features (e.g. 2 weeks, 2 months).
- Should the look-back window be defined by a timeframe, or by the last N races before the event?
- How many (and which) win features to include?
- How to weight the features when building the score (linear, exponential, etc.)?