In [1]:
import pandas as pd
import numpy as np
import re
import glob
import os
import datetime
import matplotlib.pyplot as plt
import itertools

### Paths and constants

In [None]:
# Root of the scraped PCS data. Every other location is derived from it so the
# data directory only has to be changed in one place.
BASE_PATH = '../data/pcs-scraping'

# Per-rider data sets.
RESULTS_PATH = BASE_PATH + '/results/rider'
RANKINGS_PATH = BASE_PATH + '/pcs-rankings/rider'
TEAMS_PATH = BASE_PATH + '/teams/rider'
IMG_PATH = BASE_PATH + '/img/rider'
RIDERSTATS_PATH = BASE_PATH + '/rider_stats/rider'

# Race-level data sets.
CALENDARS_PATH = BASE_PATH + '/calendars'
STARTLISTS_PATH = BASE_PATH + '/startlists'
RACERESULTS_PATH = BASE_PATH + '/race_results'

### Load data

In [None]:
# Build the location from BASE_PATH instead of repeating the directory literal.
rider_names_csv = os.path.join(BASE_PATH, 'rider_names.csv')

# NOTE(review): converting a DataFrame to a list yields its column labels, not
# its rows, so `rider_names` holds the CSV header. That is only right if the
# file stores the names as a single header line - TODO confirm the file layout.
rider_names = list(pd.read_csv(rider_names_csv).columns)

In [None]:
# Race edition whose start-list KPIs are analysed in the rest of the notebook.
race, year = 'milano-sanremo', '2022'

# The KPI table lives under <start lists>/<race>/<year>/stats-kpis.csv.
stats_csv = os.path.join(STARTLISTS_PATH, race, year, 'stats-kpis.csv')
stats = pd.read_csv(stats_csv)

### Clean data

In [None]:
# Discard riders with any missing value, then those whose height or weight is
# recorded as 0.
stats = stats.dropna()
has_height = stats['Height▲▼'] != 0
has_weight = stats['Weight▲▼'] != 0
stats = stats[has_height & has_weight]

### Plot data

In [None]:
# List the available column labels before choosing what to plot.
stats.columns

In [None]:
# Five riders with the most career points, with their height and weight.
career_cols = ['Rider', 'Career points▲▼', 'Height▲▼', 'Weight▲▼']
by_career_points = stats[career_cols].sort_values(by='Career points▲▼', ascending=False)
by_career_points.head()

In [None]:
# Height against weight for every rider left after cleaning.
height_weight = stats[['Rider', 'Height▲▼', 'Weight▲▼']]
height_weight.plot(x='Height▲▼', y='Weight▲▼', kind='scatter')
plt.show()

In [None]:
# `df_sorted` is not assigned anywhere above this cell, so on a fresh kernel
# the lookup failed with NameError. Build the top-10 career-points ranking here
# so the notebook runs top to bottom.
df_sorted = stats[['Rider', 'Career points▲▼', 'Height▲▼', 'Weight▲▼']].sort_values(by='Career points▲▼', ascending=False).iloc[:10]
df_sorted['Rider']

In [None]:
# Peek at the Series that iterrows() yields for the top-ranked rider.
# The ranking is built here because no cell above this one assigns `df_sorted`;
# relying on leftover kernel state made this cell fail on a fresh run.
df_sorted = stats[['Rider', 'Career points▲▼', 'Height▲▼', 'Weight▲▼']].sort_values(by='Career points▲▼', ascending=False).iloc[:10]

# head(1) limits the iteration to the first row, replacing the loop-and-break.
for _, first_row in df_sorted.head(1).iterrows():
    print(first_row)

In [None]:
# Height/weight scatter of the ten riders with the most career points, each
# point labelled with the rider's name.
ranking_cols = ['Rider', 'Career points▲▼', 'Height▲▼', 'Weight▲▼']
ranked = stats[ranking_cols].sort_values(by='Career points▲▼', ascending=False)
df_sorted = ranked.iloc[:10]

# Keep the Axes returned by pandas so the labels land on that same plot.
ax = df_sorted.plot(x='Height▲▼', y='Weight▲▼', kind='scatter')

for height, weight, rider in zip(df_sorted['Height▲▼'], df_sorted['Weight▲▼'], df_sorted['Rider']):
    ax.text(x=height, y=weight, s=rider)

plt.show()

In [None]:
# Large height/weight scatter of every rider, ordered by career points and
# labelled with the rider's name.
fig, ax = plt.subplots(figsize=(14, 14))

ranking_cols = ['Rider', 'Career points▲▼', 'Height▲▼', 'Weight▲▼']
df_sorted = stats[ranking_cols].sort_values(by='Career points▲▼', ascending=False)
df_sorted.plot(x='Height▲▼', y='Weight▲▼', kind='scatter', ax=ax)

# Write each rider's name at that rider's (height, weight) position.
for height, weight, rider in zip(df_sorted['Height▲▼'], df_sorted['Weight▲▼'], df_sorted['Rider']):
    ax.text(x=height, y=weight, s=rider)

plt.show()

In [None]:
# Large height/weight scatter of every rider without name labels.
fig, ax = plt.subplots(figsize=(14, 14))

ranking_cols = ['Rider', 'Career points▲▼', 'Height▲▼', 'Weight▲▼']
df_sorted = stats[ranking_cols].sort_values(by='Career points▲▼', ascending=False)
df_sorted.plot(x='Height▲▼', y='Weight▲▼', kind='scatter', ax=ax)

plt.show()

# K-Means clustering

In [None]:
import seaborn as sns

In [None]:
from sklearn.cluster import KMeans

### Height and Weight

In [None]:
# Fit five clusters on the two body-measurement columns. random_state is fixed
# so the labels are reproducible.
# NOTE(review): the columns are clustered on their raw scale; if height and
# weight differ a lot in magnitude the larger one drives the result - confirm
# that is intended.
body_cols = ['Height▲▼', 'Weight▲▼']
X = stats[body_cols]
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans = kmeans.fit(X)

In [None]:
# Rider, height and weight together with the cluster label assigned to each row.
clusters = stats[['Rider', 'Height▲▼', 'Weight▲▼']].assign(cluster=kmeans.labels_)

In [None]:
# Cluster membership in the height/weight plane.
# Both axes are continuous measurements, so draw a relational scatter plot.
# A categorical plot turns every distinct height into its own evenly spaced
# tick, which distorts distances along the x axis.
height_weight_clusters = clusters[['Height▲▼', 'Weight▲▼', 'cluster']].copy()

# A categorical dtype keeps one discrete colour and legend entry per cluster
# instead of a continuous colour ramp over the integer labels.
height_weight_clusters['cluster'] = height_weight_clusters['cluster'].astype('category')

sns.relplot(x='Height▲▼',
            y='Weight▲▼',
            data=height_weight_clusters,
            hue='cluster',
            kind='scatter')

plt.show()

In [None]:
# Show the centroids of the fitted K-means model.
kmeans.cluster_centers_

### Top 20 vs bottom 20 - PCS career points

In [None]:
# Sort once in ascending order of career points: the first 20 rows are the
# lowest-ranked riders and the last 20 rows the highest-ranked ones.
ranked_by_points = stats[['Career points▲▼', 'Weight▲▼']].sort_values(by='Career points▲▼')
X_1 = ranked_by_points.iloc[:20]
X_2 = ranked_by_points.iloc[-20:]

# Cluster the two extremes together into two groups.
# NOTE(review): the columns are clustered on their raw scale - confirm that
# letting the larger-magnitude column drive the split is intended.
X = pd.concat([X_1, X_2])
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans = kmeans.fit(X)

In [None]:
# Attach the cluster label, then look the rider names up again through the
# row index that X inherited from `stats`.
clusters = X.assign(cluster=kmeans.labels_)
clusters['Rider'] = stats.loc[X.index, 'Rider']
clusters.head()

In [None]:
# Cluster membership in the career-points/weight plane.
# Both axes are continuous, so draw a relational scatter plot. A categorical
# plot gives every distinct career-points value its own evenly spaced tick,
# which hides the gap between the low and high groups.
points_weight_clusters = clusters[['Career points▲▼', 'Weight▲▼', 'cluster']].copy()

# A categorical dtype keeps one discrete colour and legend entry per cluster
# instead of a continuous colour ramp over the integer labels.
points_weight_clusters['cluster'] = points_weight_clusters['cluster'].astype('category')

sns.relplot(x='Career points▲▼',
            y='Weight▲▼',
            data=points_weight_clusters,
            hue='cluster',
            kind='scatter')

plt.show()

### Model with all features (KPIs) and 3 clusters (low, intermediate, high)

In [None]:
# Cluster the riders into three groups using every KPI column.
kpi_columns = ['Career points▲▼', 'GC points▲▼', 'Classic points▲▼',
               'TT points▲▼', 'Climbers points▲▼', 'Sprinters points▲▼',
               'Height▲▼', 'Weight▲▼']

X = stats[kpi_columns]

# K-means compares rows by Euclidean distance, so columns measured in
# different quantities (points, height, weight) must be on a common scale.
# Otherwise the column with the largest numeric spread decides the clusters.
# Standardise each column to zero mean and unit standard deviation.
# A constant column has zero spread; dividing by 1 instead leaves it at 0
# rather than producing NaN.
column_spread = X.std().replace(0, 1)
X_scaled = (X - X.mean()) / column_spread

# n_init is given explicitly because its default differs between scikit-learn
# releases; random_state keeps the labels reproducible.
kmeans = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X_scaled)

# Keep the raw (unscaled) KPI values next to the rider name and cluster label.
clusters = stats[['Rider'] + kpi_columns].copy()
clusters['cluster'] = kmeans.labels_

In [None]:
from sklearn.manifold import TSNE
import plotly.express as px

In [None]:
# Two-dimensional t-SNE view of the riders, coloured by their cluster label.
kpi_columns = ['Career points▲▼', 'GC points▲▼', 'Classic points▲▼',
               'TT points▲▼', 'Climbers points▲▼', 'Sprinters points▲▼',
               'Height▲▼', 'Weight▲▼']
features = clusters[kpi_columns + ['cluster']]

tsne = TSNE(n_components=2, random_state=0)

# Project the KPI columns only. The cluster label is an output of the model;
# feeding it in as an input would push same-label points together and make
# the clusters look better separated than the KPIs justify.
# NOTE(review): t-SNE is distance-based, so projecting raw KPI values lets
# large-magnitude columns dominate; consider projecting on the same scale the
# clustering used.
projections = tsne.fit_transform(features[kpi_columns])

fig = px.scatter(
    projections, x=0, y=1,
    color=features.cluster, labels={'color': 'cluster'}
)
fig.show()

In [None]:
# Show the first four riders of each cluster.
clusters.groupby('cluster').head(4)

# Predict rider potential based on model

# Radar Chart