# Notebook To Make The Features Used for XGBoost

The current features are:
* For every profile_icon (p1, p2, p3, p4, p5), the number of placings in positions 1-4, 5-8 and 9-12 during the season
* Age
* Nationality
* Individual PCS points
* Team PCS points

I think the model can be improved by also including the weather, combined with the placing.

## Plan

1. Loop over rider_info csv per year.
1. For every rider, collect each race with a top-12 result. Store these as key -> value pairs in a dict.
1. Look up the race information for each of those races and count the result in a dict per profile class:
    1. 0.0 = flat
    1. 1.0 = hills (flat finish)
    1. 2.0 = hills (uphill finish)
    1. 3.0 = mountain (flat finish)
    1. 4.0 = mountain (uphill finish)

In [21]:
# imports
import os
import pandas as pd

In [22]:
# Global settings shared by the cells below

# All input files share this extension
csv_extension = '.csv'

# Rider info files: folder and filename prefix (year and extension are appended later)
riders_folder, riders_base_filename = 'riders/', 'rider_info_'

# Results files: folder and filename prefix (year and extension are appended later)
results_folder, results_base_filename = 'results/', 'comp_'

# Single file holding the race information
race_information_filename = 'complete_and_predicted_race_information_v1.csv'

In [23]:
# Read the semicolon-separated race information file into a dataframe
race_information_df = pd.read_csv(race_information_filename, sep=';')
# Preview the first rows (head() defaults to 5 rows)
race_information_df.head()

Unnamed: 0.1,Unnamed: 0,race,profile_score,point_scale,img_name,profile_icon,location
0,0,race/dauphine/2020/stage-1,107,2.WT.Stage,imgs/107race-dauphine-2020-stage-1.jpg,4.0,Grenoble
1,1,race/dauphine/2020/stage-2,0,,imgs/0race-dauphine-2020-stage-2.jpg,4.0,Grenoble
2,2,race/dauphine/2020/stage-3,0,,imgs/0race-dauphine-2020-stage-3.jpg,4.0,Grenoble
3,3,race/dauphine/2020/stage-4,0,,imgs/0race-dauphine-2020-stage-4.jpg,4.0,Grenoble
4,4,race/dauphine/2020/stage-5,0,,imgs/0race-dauphine-2020-stage-5.jpg,4.0,Grenoble


### Loop over the years and make csv files for the features

In [24]:
# Maps each key of the per-rider placement counts to the name used in the feature columns.
# Profile classes '0'..'4' follow the order of the labels below; 'gc' and 'wielerpoule' map to themselves.
features_dict = {
    **{str(code): label
       for code, label in enumerate(('flat',
                                     'hills(flat)',
                                     'hills(uphill)',
                                     'mountains(flat)',
                                     'mountains(uphill)'))},
    'gc': 'gc',
    'wielerpoule': 'wielerpoule',
}

In [25]:
# Loop over each year, to create a feature file for each year
years = [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]

# Placement buckets and profile classes, in the order of the output columns.
# 0 is flat, 1 is hills (flat finish), 2 is hills (uphill finish),
# 3 is mountains (flat finish), 4 is mountains (uphill finish)
placement_keys = ('1-4', '5-8', '9-12')
profile_keys = ('0', '1', '2', '3', '4', 'gc')


def _placement_bucket(place):
    """Return the placement bucket for a finishing place.

    Returns '1-4', '5-8' or '9-12' for places 1 through 12 and None for
    every other value (0, negative numbers and places beyond 12), so that
    only real top-12 results are counted.
    """
    if 1 <= place <= 4:
        return '1-4'
    if 5 <= place <= 8:
        return '5-8'
    if 9 <= place <= 12:
        return '9-12'
    return None


# Build the race -> profile class lookup once instead of filtering the race
# information dataframe for every rider and every race.
# Rows without a profile are skipped (int() cannot convert NaN) and, when a
# race is listed more than once, the first row is used.
races_with_profile = race_information_df.dropna(subset=['profile_icon'])
races_with_profile = races_with_profile.drop_duplicates(subset='race')
profile_by_race = {
    race: str(int(profile))
    for race, profile in zip(races_with_profile['race'], races_with_profile['profile_icon'])
}

# Feature columns that are appended to the rider information
columns = [features_dict[profile] + placement
           for profile in profile_keys
           for placement in placement_keys]
columns.append('wielerpoule')

for year in years:

    # Riders file
    riders_filename = riders_base_filename + str(year) + csv_extension
    riders_file_location = os.path.join(riders_folder, riders_filename)
    riders_df = pd.read_csv(riders_file_location, delimiter=';')

    # Results file
    results_filename = results_base_filename + str(year) + csv_extension
    results_file_location = os.path.join(results_folder, results_filename)
    results_df = pd.read_csv(results_file_location, delimiter=',')
    results_df = results_df.loc[:, ~results_df.columns.str.contains('^Unnamed')]

    # initiate dictionary for rider results
    riders = {}

    # loop over riders and create a dictionary with placements
    for rider in riders_df['rider']:
        counts = {profile: dict.fromkeys(placement_keys, 0) for profile in profile_keys}
        counts['wielerpoule'] = 0
        riders[rider] = counts

        # Obtain results for the rider
        results = results_df.loc[results_df['Rider'] == rider]

        # Walk over the rider's own result rows. The row labels come from the
        # results file itself, so the riders file and the results file do not
        # have to list the riders in the same order.
        for race, places_by_row in results.to_dict().items():
            for place in places_by_row.values():

                # Only genuine integers are placings; the rider name and other
                # non-integer cells are skipped.
                if not isinstance(place, int) or isinstance(place, bool):
                    continue

                # For the tour de france the value is added to the wielerpoule points
                if 'tour-de-france' in race:
                    counts['wielerpoule'] += place
                    continue

                place_key = _placement_bucket(place)
                if place_key is None:
                    continue

                if race.endswith('gc'):
                    # GC results don't have profile information
                    counts['gc'][place_key] += 1
                elif race in profile_by_race:
                    # Races without race information are ignored
                    counts[profile_by_race[race]][place_key] += 1

    # Copy riders information
    feature_df = riders_df.copy()
    # remove unnamed column
    feature_df = feature_df.loc[:, ~feature_df.columns.str.contains('^Unnamed')]
    # add columns
    feature_df = feature_df.reindex(feature_df.columns.tolist() + columns, axis=1)

    # Paste the rider results in the features
    for rider, features in riders.items():
        # The row selection is the same for every feature of this rider
        rider_mask = feature_df['rider'] == rider
        for profile, placements in features.items():
            if profile == 'wielerpoule':
                feature_df.loc[rider_mask, 'wielerpoule'] = placements
                continue
            for placement, number_of_times in placements.items():
                feature_name = features_dict[profile] + placement
                feature_df.loc[rider_mask, feature_name] = number_of_times

    # Save information as csv file
    output_filename = 'features_' + str(year) + '.csv'
    feature_df.to_csv(output_filename, sep=';')