In [6]:
import xgboost as xgb
import pandas as pd
import json
from pathlib import Path
import plotly.express as px
from typing import Callable, List

# XGBoost Model Maken en Gebruiken

In [3]:
# Als eerste openen we de features

features_file = Path(r"data/features.txt")
with open(features_file, "r") as f:
    features = json.load(f)

In [4]:
df = pd.DataFrame(features)
df.head(5)

Unnamed: 0,rider_name,age,pcs_score,pcs_ratio,pcs_team,top_5s_category_0,top_10s_category_0,top_15s_category_0,top_20s_category_0,top_5s_category_1,...,top_20s_category_3,top_5s_category_4,top_10s_category_4,top_15s_category_4,top_20s_category_4,top_5s_category_gc,top_10s_category_gc,top_15s_category_gc,top_20s_category_gc,wielerpoule_score
0,ARMSTRONG Lance,30,656,0.1288,5094,0,0,0,0,0,...,0,2,0,1,0,0,0,0,0,109
1,HERAS Roberto,27,657,0.129,5094,0,0,0,0,0,...,0,7,1,3,0,2,0,0,0,17
2,EKIMOV Viatcheslav,35,376,0.0738,5094,0,0,0,0,2,...,0,2,0,1,0,2,1,0,0,0
3,HAMILTON Tyler,30,214,0.042,5094,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
4,HINCAPIE George,28,863,0.1694,5094,1,0,0,0,1,...,0,3,3,1,0,1,1,0,0,0


## Data opschonen 

Als eerste vullen we de missende waarden voor de kolommen age, pcs_score, pcs_ratio en pcs_team aan met het gemiddelde.

In [49]:
def fill_missing_values(df: pd.DataFrame, 
                        f: Callable, columns: 
                        List[str]) -> pd.DataFrame:
    df_copy = df.copy(deep=True)

    for column in columns:
        df_copy[column] = f(df[column])

    return df_copy

In [50]:
def fill_with_mean(column: pd.Series) -> pd.Series:
    column_copy = column.copy(deep=True)
    
    mean = column.loc[column > 0].mean()
    column_copy.loc[column == 0] = mean 

    return column_copy

In [51]:
df = fill_missing_values(df, fill_with_mean, ["age", "pcs_score", "pcs_ratio", "pcs_team"])


In [38]:
# Assert that all values are filled
assert len(df["age"][df["age"] == 0]) == 0
assert len(df["pcs_score"][df["pcs_score"] == 0]) == 0
assert len(df["pcs_ratio"][df["pcs_ratio"] == 0]) == 0
assert len(df["pcs_team"][df["pcs_team"] == 0]) == 0

Nu gaan we alle features normaliseren, dit schijnt je modellen beter en sneller te trainen.

In [39]:
# Wil de wielerpoulescore niet normalizen
columns_to_normalize = list(df.columns)[1:-1]
columns_to_normalize

['age',
 'pcs_score',
 'pcs_ratio',
 'pcs_team',
 'top_5s_category_0',
 'top_10s_category_0',
 'top_15s_category_0',
 'top_20s_category_0',
 'top_5s_category_1',
 'top_10s_category_1',
 'top_15s_category_1',
 'top_20s_category_1',
 'top_5s_category_2',
 'top_10s_category_2',
 'top_15s_category_2',
 'top_20s_category_2',
 'top_5s_category_3',
 'top_10s_category_3',
 'top_15s_category_3',
 'top_20s_category_3',
 'top_5s_category_4',
 'top_10s_category_4',
 'top_15s_category_4',
 'top_20s_category_4',
 'top_5s_category_gc',
 'top_10s_category_gc',
 'top_15s_category_gc',
 'top_20s_category_gc']

In [40]:
# min-max normalisatie
for column in columns_to_normalize:
    df[column] = (df[column]-df[column].min())/(df[column].max()-df[column].min())

In [41]:
df.head(5)

Unnamed: 0,rider_name,age,pcs_score,pcs_ratio,pcs_team,top_5s_category_0,top_10s_category_0,top_15s_category_0,top_20s_category_0,top_5s_category_1,...,top_20s_category_3,top_5s_category_4,top_10s_category_4,top_15s_category_4,top_20s_category_4,top_5s_category_gc,top_10s_category_gc,top_15s_category_gc,top_20s_category_gc,wielerpoule_score
0,ARMSTRONG Lance,0.434783,0.196919,0.128277,0.341678,0.0,0.0,0.0,,0.0,...,,0.08,0.0,0.052632,,0.0,0.0,0.0,,109
1,HERAS Roberto,0.304348,0.197221,0.128477,0.341678,0.0,0.0,0.0,,0.0,...,,0.28,0.066667,0.157895,,0.285714,0.0,0.0,,17
2,EKIMOV Viatcheslav,0.652174,0.112353,0.073244,0.341678,0.0,0.0,0.0,,0.1,...,,0.08,0.0,0.052632,,0.285714,0.2,0.0,,0
3,HAMILTON Tyler,0.434783,0.063425,0.041425,0.341678,0.0,0.0,0.0,,0.0,...,,0.0,0.066667,0.0,,0.142857,0.0,0.0,,0
4,HINCAPIE George,0.347826,0.259438,0.168901,0.341678,0.125,0.0,0.0,,0.05,...,,0.12,0.2,0.052632,,0.142857,0.2,0.0,,0


In [42]:
# Drop NaN columns
# Waarschijnlijk zijn die allemaal 0 en daarom NaN geworden
df = df.dropna(axis=1)
df.head(5)

Unnamed: 0,rider_name,age,pcs_score,pcs_ratio,pcs_team,top_5s_category_0,top_10s_category_0,top_15s_category_0,top_5s_category_1,top_10s_category_1,...,top_5s_category_3,top_10s_category_3,top_15s_category_3,top_5s_category_4,top_10s_category_4,top_15s_category_4,top_5s_category_gc,top_10s_category_gc,top_15s_category_gc,wielerpoule_score
0,ARMSTRONG Lance,0.434783,0.196919,0.128277,0.341678,0.0,0.0,0.0,0.0,0.0,...,0.095238,0.0,0.0,0.08,0.0,0.052632,0.0,0.0,0.0,109
1,HERAS Roberto,0.304348,0.197221,0.128477,0.341678,0.0,0.0,0.0,0.0,0.0,...,0.047619,0.0,0.0,0.28,0.066667,0.157895,0.285714,0.0,0.0,17
2,EKIMOV Viatcheslav,0.652174,0.112353,0.073244,0.341678,0.0,0.0,0.0,0.1,0.0,...,0.095238,0.083333,0.1,0.08,0.0,0.052632,0.285714,0.2,0.0,0
3,HAMILTON Tyler,0.434783,0.063425,0.041425,0.341678,0.0,0.0,0.0,0.0,0.0,...,0.142857,0.083333,0.0,0.0,0.066667,0.0,0.142857,0.0,0.0,0
4,HINCAPIE George,0.347826,0.259438,0.168901,0.341678,0.125,0.0,0.0,0.05,0.0,...,0.190476,0.333333,0.0,0.12,0.2,0.052632,0.142857,0.2,0.0,0


Sla een lijst op van de kolommen die je features zijn

In [43]:
column_features = list(df.columns)[1:-1]
column_features

['age',
 'pcs_score',
 'pcs_ratio',
 'pcs_team',
 'top_5s_category_0',
 'top_10s_category_0',
 'top_15s_category_0',
 'top_5s_category_1',
 'top_10s_category_1',
 'top_15s_category_1',
 'top_5s_category_2',
 'top_10s_category_2',
 'top_15s_category_2',
 'top_5s_category_3',
 'top_10s_category_3',
 'top_15s_category_3',
 'top_5s_category_4',
 'top_10s_category_4',
 'top_15s_category_4',
 'top_5s_category_gc',
 'top_10s_category_gc',
 'top_15s_category_gc']

Sla de kolom op wat je target is (hetgeen wat je wilt voorspellen)

In [44]:
column_label = "wielerpoule_score"

## Data Splitsen in Train en Test data

We maken gebruik van de Sklearn package, deze heeft handige functies zoals de train_test_split om je te helpen.

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
train_df, test_df = train_test_split(df, test_size=0.2)

In [47]:
train_target = train_df["wielerpoule_score"]

In [48]:
# De rider name en wielerpoule_score willen wij niet als features hebben!
train_df = train_df.drop(columns=["rider_name", "wielerpoule_score"], axis=1)
train_df.head(5)

Unnamed: 0,age,pcs_score,pcs_ratio,pcs_team,top_5s_category_0,top_10s_category_0,top_15s_category_0,top_5s_category_1,top_10s_category_1,top_15s_category_1,...,top_15s_category_2,top_5s_category_3,top_10s_category_3,top_15s_category_3,top_5s_category_4,top_10s_category_4,top_15s_category_4,top_5s_category_gc,top_10s_category_gc,top_15s_category_gc
3683,0.478261,0.04923,0.055733,0.194662,0.125,0.0,0.0,0.15,0.125,0.0,...,0.0,0.142857,0.0,0.0,0.0,0.2,0.0,0.285714,0.0,0.0
109,0.521739,0.057384,0.061737,0.204896,0.0,0.0,0.0,0.05,0.125,0.142857,...,0.1,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.2
347,0.304348,0.162187,0.106164,0.340156,0.0,0.0,0.0,0.1,0.0,0.285714,...,0.2,0.047619,0.0,0.4,0.12,0.0,0.105263,0.142857,0.0,0.0
3323,0.565217,0.048022,0.017511,0.611438,0.0,0.0,0.0,0.0,0.0,0.0,...,0.1,0.190476,0.25,0.2,0.04,0.2,0.263158,0.0,0.0,0.0
1260,0.347826,0.059801,0.020913,0.639997,0.0,0.0,0.25,0.0,0.125,0.0,...,0.1,0.0,0.166667,0.1,0.0,0.0,0.0,0.0,0.0,0.2


## Model Trainen

We trainen een XGBoos Regressor.

Om de hyperparameters te tunen gebruiken we van Sklearn de GridSearchCV class.

In [145]:
model = xgb.XGBRegressor()

In [146]:
from sklearn.model_selection import GridSearchCV

In [147]:
# Van deze parameters met deze waardes willen wij de optimale combinatie weten
parameters = {
    "n_estimators": [500, 1000, 2000, 4000],
    "learning_rate": (0.003, 0.01, 0.03, 0.10, 0.3),
    "max_depth": [1, 2, 3, 4, 5, 6, 8],
    "min_child_weight": [ 1, 3, 5, 7],
    "gamma":[0.00, 0.01, 0.03, 0.1, 0.3],
    "colsample_bytree":[ 0.3, 0.4],
    "subsample": [0.6, 0.7, 0.8]
}

In [148]:
# Belangrijk hierbij is de cv parameter
# Deze geeft in dit geval aan 3-voudige cross validation aan
gsc = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    cv=3,
    scoring="neg_mean_squared_error",
    verbose=0,
    n_jobs=-1,
)

In [149]:
# Start trainen
grid_result = gsc.fit(train_df, train_target)


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



In [150]:
# Zo krijg je de beste parameters
best_params = grid_result.best_params_
best_params

{'colsample_bytree': 0.4,
 'gamma': 0.0,
 'learning_rate': 0.003,
 'max_depth': 5,
 'min_child_weight': 7,
 'n_estimators': 2000,
 'subsample': 0.7}

Nog 1x trainen op complete dataset met de beste parameters van hierboven.

In [151]:
final_model = xgb.XGBRegressor(**best_params)

In [152]:
complete_dataset_features = df.drop(columns=["rider_name", "wielerpoule_score"], axis=1)

In [153]:
complete_dataset_target = df["wielerpoule_score"]

In [154]:
final_model.fit(complete_dataset_features, complete_dataset_target)

Hier slaan we het model op

In [174]:
final_model_file = Path(r"models/v1_xgboost.pkl")

In [175]:
final_model.save_model(final_model_file)

## De Voorspelling Doen!

En dan nu, het moment waarop we hebben gewacht! Het voorspellen :D

In [53]:
last_tour_features_file = Path(r"data/last_tour_features.txt")

In [54]:
with open(last_tour_features_file, "r") as f:
    tour_features = json.load(f)

In [55]:
tour_df = pd.DataFrame(tour_features)
tour_df.head(5)

Unnamed: 0,rider_name,age,pcs_score,pcs_ratio,pcs_team,top_5s_category_0,top_10s_category_0,top_15s_category_0,top_20s_category_0,top_5s_category_1,...,top_20s_category_3,top_5s_category_4,top_10s_category_4,top_15s_category_4,top_20s_category_4,top_5s_category_gc,top_10s_category_gc,top_15s_category_gc,top_20s_category_gc,wielerpoule_score
0,POGAČAR Tadej,24,1499,0.475,3156,0,0,0,0,0,...,0,8,1,1,0,3,0,0,0,0
1,BENNETT George,32,45,0.0143,3156,1,1,1,0,0,...,0,0,0,0,0,1,0,3,0,0
2,BJERG Mikkel,24,31,0.0098,3156,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,HIRSCHI Marc,24,425,0.1347,3156,1,1,0,0,0,...,0,1,5,1,0,2,0,1,0,0
4,LAENGEN Vegard Stake,33,44,0.0139,3156,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Net als bij het vorige deel van het data processen moeten we hier hetzelfde doen!

In [56]:
tour_df = fill_missing_values(tour_df, fill_with_mean, ["age", "pcs_score", "pcs_ratio", "pcs_team"])

In [57]:
# Drop NaN columns
# Waarschijnlijk zijn die allemaal 0 en daarom NaN geworden
df = df.dropna(axis=1)
df.head(5)

Unnamed: 0,rider_name,age,pcs_score,pcs_ratio,pcs_team,top_5s_category_0,top_10s_category_0,top_15s_category_0,top_5s_category_1,top_10s_category_1,...,top_5s_category_3,top_10s_category_3,top_15s_category_3,top_5s_category_4,top_10s_category_4,top_15s_category_4,top_5s_category_gc,top_10s_category_gc,top_15s_category_gc,wielerpoule_score
0,ARMSTRONG Lance,0.434783,0.196919,0.128277,0.341678,0.0,0.0,0.0,0.0,0.0,...,0.095238,0.0,0.0,0.08,0.0,0.052632,0.0,0.0,0.0,109
1,HERAS Roberto,0.304348,0.197221,0.128477,0.341678,0.0,0.0,0.0,0.0,0.0,...,0.047619,0.0,0.0,0.28,0.066667,0.157895,0.285714,0.0,0.0,17
2,EKIMOV Viatcheslav,0.652174,0.112353,0.073244,0.341678,0.0,0.0,0.0,0.1,0.0,...,0.095238,0.083333,0.1,0.08,0.0,0.052632,0.285714,0.2,0.0,0
3,HAMILTON Tyler,0.434783,0.063425,0.041425,0.341678,0.0,0.0,0.0,0.0,0.0,...,0.142857,0.083333,0.0,0.0,0.066667,0.0,0.142857,0.0,0.0,0
4,HINCAPIE George,0.347826,0.259438,0.168901,0.341678,0.125,0.0,0.0,0.05,0.0,...,0.190476,0.333333,0.0,0.12,0.2,0.052632,0.142857,0.2,0.0,0


En dan moeten wij ook weer de features normaliseren

In [58]:
# Wil de wielerpoulescore niet normalizen
columns_to_normalize = list(tour_df.columns)[1:-1]
columns_to_normalize

['age',
 'pcs_score',
 'pcs_ratio',
 'pcs_team',
 'top_5s_category_0',
 'top_10s_category_0',
 'top_15s_category_0',
 'top_20s_category_0',
 'top_5s_category_1',
 'top_10s_category_1',
 'top_15s_category_1',
 'top_20s_category_1',
 'top_5s_category_2',
 'top_10s_category_2',
 'top_15s_category_2',
 'top_20s_category_2',
 'top_5s_category_3',
 'top_10s_category_3',
 'top_15s_category_3',
 'top_20s_category_3',
 'top_5s_category_4',
 'top_10s_category_4',
 'top_15s_category_4',
 'top_20s_category_4',
 'top_5s_category_gc',
 'top_10s_category_gc',
 'top_15s_category_gc',
 'top_20s_category_gc']

In [59]:
for column in columns_to_normalize:
    tour_df[column] = (tour_df[column]-tour_df[column].min())/(tour_df[column].max()-tour_df[column].min())

In de train DataFrame hadden we een aantal kolommen met NaN en deze kolommen hebben wij verwijderd. Maar in deze tour DataFrame zijn er geen NaN waarden, toch moeten wij de features verwijderen die niet gebruikt zijn tijdens het trainen. Want het model weet niet wat die met deze features moet. Daarom hebben we hier een lijst met alle features die gebruikt zijn voor het trainen zodat wij uit de Tour DataFrame alleen deze kolommen pakken.

In [61]:
trained_features = ['age',
 'pcs_score',
 'pcs_ratio',
 'pcs_team',
 'top_5s_category_0',
 'top_10s_category_0',
 'top_15s_category_0',
 'top_5s_category_1',
 'top_10s_category_1',
 'top_15s_category_1',
 'top_5s_category_2',
 'top_10s_category_2',
 'top_15s_category_2',
 'top_5s_category_3',
 'top_10s_category_3',
 'top_15s_category_3',
 'top_5s_category_4',
 'top_10s_category_4',
 'top_15s_category_4',
 'top_5s_category_gc',
 'top_10s_category_gc',
 'top_15s_category_gc']

In [62]:
df_for_prediction = tour_df[trained_features]
df_for_prediction.head()

Unnamed: 0,age,pcs_score,pcs_ratio,pcs_team,top_5s_category_0,top_10s_category_0,top_15s_category_0,top_5s_category_1,top_10s_category_1,top_15s_category_1,...,top_15s_category_2,top_5s_category_3,top_10s_category_3,top_15s_category_3,top_5s_category_4,top_10s_category_4,top_15s_category_4,top_5s_category_gc,top_10s_category_gc,top_15s_category_gc
0,0.111111,1.0,0.766645,0.737887,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.142857,0.571429,0.1,0.2,0.75,0.0,0.0
1,0.555556,0.026774,0.018513,0.737887,0.25,0.25,0.166667,0.0,0.0,0.333333,...,0.428571,0.0,0.0,0.142857,0.0,0.0,0.0,0.25,0.0,0.75
2,0.111111,0.017403,0.011205,0.737887,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.142857,0.166667,0.0,0.071429,0.0,0.0,0.0,0.0,0.0
3,0.111111,0.281124,0.214031,0.737887,0.25,0.25,0.0,0.0,0.0,0.0,...,0.142857,0.571429,0.5,0.0,0.071429,0.5,0.2,0.5,0.0,0.25
4,0.611111,0.026104,0.017863,0.737887,0.0,0.0,0.0,0.0,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [172]:
# Sla de voorspellingen op in de prediction kolom
tour_df["prediction"] = final_model.predict(df_for_prediction)

In [173]:
tour_df.head(5)

Unnamed: 0,rider_name,age,pcs_score,pcs_ratio,pcs_team,top_5s_category_0,top_10s_category_0,top_15s_category_0,top_20s_category_0,top_5s_category_1,...,top_5s_category_4,top_10s_category_4,top_15s_category_4,top_20s_category_4,top_5s_category_gc,top_10s_category_gc,top_15s_category_gc,top_20s_category_gc,wielerpoule_score,prediction
0,POGAČAR Tadej,0.111111,1.0,0.766645,0.761216,0.0,0.0,0.0,,0.0,...,0.571429,0.1,0.2,,0.75,0.0,0.0,,0,70.501328
1,BENNETT George,0.555556,0.026774,0.018513,0.761216,0.25,0.25,0.166667,,0.0,...,0.0,0.0,0.0,,0.25,0.0,0.75,,0,0.817355
2,BJERG Mikkel,0.111111,0.017403,0.011205,0.761216,0.0,0.0,0.0,,0.0,...,0.071429,0.0,0.0,,0.0,0.0,0.0,,0,5.446099
3,HIRSCHI Marc,0.111111,0.281124,0.214031,0.761216,0.25,0.25,0.0,,0.0,...,0.071429,0.5,0.2,,0.5,0.0,0.25,,0,24.23419
4,LAENGEN Vegard Stake,0.611111,0.026104,0.017863,0.761216,0.0,0.0,0.0,,0.0,...,0.0,0.0,0.0,,0.0,0.0,0.0,,0,0.960067


Sla de voorspellingen op!

In [182]:
predictions_file = Path(r"data/predictions.txt")

In [183]:
tour_df.to_json(predictions_file)