# Checks of races data

In [60]:
# Autoreload allows the notebook to dynamically load code: if we update some helper functions *outside* of the notebook, we do not need to reload the notebook.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [61]:
import pandas as pd
import numpy as np
import re
import unicodedata

We load the dataset from a CSV file and display the first few rows to get an initial understanding of the data. This helps us verify that the data has been loaded correctly and gives us a glimpse of its structure and contents.

In [62]:
csv_file = "../data/races.csv"
dataset = pd.read_csv(csv_file)
dataset.head() 

Unnamed: 0,_url,name,points,uci_points,length,climb_total,profile,startlist_quality,average_temperature,date,position,cyclist,cyclist_age,is_tarmac,is_cobbled,is_gravel,cyclist_team,delta
0,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,0,sean-kelly,22.0,True,False,False,vini-ricordi-pinarello-sidermec-1986,0.0
1,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,1,gerrie-knetemann,27.0,True,False,False,norway-1987,0.0
2,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,2,rene-bittinger,24.0,True,False,False,,0.0
3,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,3,joseph-bruyere,30.0,True,False,False,navigare-blue-storm-1993,0.0
4,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,4,sven-ake-nilsson,27.0,True,False,False,spain-1991,0.0


Create a dataset without the personal information of the cyclists, taking only one row per race

In [63]:
# Delete 'position', 'cyclist', 'cyclist_age', 'cyclist_team' and 'delta' columns
races_info = dataset.drop(columns=['position', 'cyclist', 'cyclist_age', 'cyclist_team', 'delta'])

# For each row in 'races_info', take only the year-month-day part of 'date' (delete the time)
races_info['date'] = races_info['date'].str.split(' ').str[0]

# Eliminate duplicates
races_info = races_info.drop_duplicates()

# Display the first rows of the dataset
races_info.head()

Unnamed: 0,_url,name,points,uci_points,length,climb_total,profile,startlist_quality,average_temperature,date,is_tarmac,is_cobbled,is_gravel
0,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05,True,False,False
106,vuelta-a-espana/2016/stage-14,Vuelta a España,80.0,100.0,196000.0,5575.0,5.0,821,,2016-09-03,True,False,False
271,tour-de-france/2019/stage-21,Tour de France,100.0,120.0,128000.0,781.0,1.0,1699,,2019-07-28,True,False,False
426,volta-a-catalunya/1999/prologue,Volta Ciclista a Catalunya,50.0,,8100.0,,,804,,1999-06-17,True,False,False
545,tour-de-france/2022/stage-9,Tour de France,100.0,120.0,192900.0,3743.0,3.0,1551,24.0,2022-07-10,True,False,False


Create dataset from the union of the cyclists and the races data 

In [64]:
# Create union of two datasets, merging them considering the url of the cyclist
dataset_cyclists = pd.read_csv("../data/cyclists.csv")
dataset_races = pd.read_csv("../data/races.csv")
merged_dataset = pd.merge(dataset_cyclists, dataset_races, left_on='_url', right_on='cyclist', how='inner')

# Modify name column of the cyclist url in '_url_cyclist', and name column of the race url in '_url_race'
merged_dataset = merged_dataset.rename(columns={'_url_x': '_url_cyclist', '_url_y': '_url_race'})
# Modify name column of the cyclist name in 'name_cyclist', and name column of the race name in 'name_race'
merged_dataset = merged_dataset.rename(columns={'name_x': 'name_cyclist', 'name_y': 'name_race'})
# Take only the year-month-day part of 'date' (delete the time)
merged_dataset['date'] = merged_dataset['date'].str.split(' ').str[0]

merged_dataset.head()


Unnamed: 0,_url_cyclist,name_cyclist,birth_year,weight,height,nationality,_url_race,name_race,points,uci_points,...,average_temperature,date,position,cyclist,cyclist_age,is_tarmac,is_cobbled,is_gravel,cyclist_team,delta
0,bruno-surra,Bruno Surra,1964.0,,,Italy,vuelta-a-espana/1989/stage-1,Vuelta a España,80.0,,...,,1989-04-24,110,bruno-surra,25.0,True,False,False,,15.0
1,gerard-rue,Gérard Rué,1965.0,74.0,182.0,France,tour-de-france/1997/stage-2,Tour de France,100.0,,...,,1997-07-07,132,gerard-rue,32.0,True,False,False,denmark-1991,0.0
2,gerard-rue,Gérard Rué,1965.0,74.0,182.0,France,tour-de-france/1990/stage-1,Tour de France,100.0,,...,,1990-07-01,66,gerard-rue,25.0,True,False,False,france-1978,635.0
3,gerard-rue,Gérard Rué,1965.0,74.0,182.0,France,tour-de-france/1992/stage-7,Tour de France,100.0,,...,,1992-07-11,35,gerard-rue,27.0,True,False,False,france-1978,65.0
4,gerard-rue,Gérard Rué,1965.0,74.0,182.0,France,tour-de-france/1990/stage-9,Tour de France,100.0,,...,,1990-07-09,41,gerard-rue,25.0,True,False,False,france-1978,37.0


## Initial Info

Now we provide a concise summary of the DataFrame, including the number of non-null entries, data types of each column, and memory usage. It helps us quickly identify missing values and understand the overall structure of the dataset.

In [65]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589865 entries, 0 to 589864
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   _url                 589865 non-null  object 
 1   name                 589865 non-null  object 
 2   points               589388 non-null  float64
 3   uci_points           251086 non-null  float64
 4   length               589865 non-null  float64
 5   climb_total          442820 non-null  float64
 6   profile              441671 non-null  float64
 7   startlist_quality    589865 non-null  int64  
 8   average_temperature  29933 non-null   float64
 9   date                 589865 non-null  object 
 10  position             589865 non-null  int64  
 11  cyclist              589865 non-null  object 
 12  cyclist_age          589752 non-null  float64
 13  is_tarmac            589865 non-null  bool   
 14  is_cobbled           589865 non-null  bool   
 15  is_gravel        

Also, we generates a descriptive statistics for numerical columns in the DataFrame. It includes metrics such as count, mean, standard deviation, minimum, and maximum values, as well as the 25th, 50th, and 75th percentiles. This summary helps us understand the distribution and central tendency of the data.

In [66]:
dataset.describe()

Unnamed: 0,points,uci_points,length,climb_total,profile,startlist_quality,average_temperature,position,cyclist_age,delta
count,589388.0,251086.0,589865.0,442820.0,441671.0,589865.0,29933.0,589865.0,589752.0,589865.0
mean,89.221635,74.601547,166776.180584,2330.469215,2.611611,1101.161178,21.731768,74.219491,28.486208,418.292794
std,54.43533,100.947962,64545.605664,1375.710722,1.491741,380.586928,5.884761,48.404023,3.855631,842.961596
min,18.0,6.0,1000.0,2.0,1.0,115.0,10.0,0.0,13.0,-6906.0
25%,50.0,16.0,152500.0,1309.0,1.0,844.0,17.0,32.0,26.0,10.0
50%,80.0,60.0,178200.0,2255.0,2.0,988.0,22.0,70.0,28.0,156.0
75%,100.0,100.0,203500.0,3273.0,4.0,1309.0,26.0,112.0,31.0,624.0
max,350.0,800.0,338000.0,6974.0,5.0,2047.0,36.0,209.0,56.0,61547.0


## Check on '_url' data

We start considering the `_url` column, and check the number of null values and the count the occurrences of each unique value

In [67]:
print('Total number of null values in _url column: ' + str(dataset['_url'].isnull().sum())
      + ' (' + str(round(dataset['_url'].isnull().sum() / len(dataset) * 100, 2)) + '%)')

print('\nCount occurrences of each value in _url column:')
url_counts = dataset['_url'].value_counts()
print(url_counts)

Total number of null values in _url column: 0 (0.0%)

Count occurrences of each value in _url column:
_url
tour-de-france/1986/stage-1     210
tour-de-france/1986/prologue    210
tour-de-france/1987/prologue    207
giro-d-italia/2011/stage-2      206
tour-de-france/1987/stage-1     206
                               ... 
il-lombardia/1972/result          1
giro-d-italia/2001/stage-14       1
tour-de-suisse/1972/stage-2       1
tour-de-suisse/1972/stage-7       1
tour-de-suisse/1972/stage-6b      1
Name: count, Length: 5281, dtype: int64


We have lots of different values, but no null values

In this block we check if there are `_url` values that are not in the form name/year/stage

In [68]:
# Split url by / in name, year and stage
url_split = dataset['_url'].str.split('/', expand=True)
# Check null elements in url_split[0], url_split[1] and url_split[2], and if url_split[1] contains only digits
invalid_rows = dataset[url_split[0].isnull() | url_split[1].isnull() | url_split[2].isnull() | ~url_split[1].str.isdigit()]
print('Number of invalid URLs: ' + str(len(invalid_rows)))

Number of invalid URLs: 0


In [69]:
#TODO: Chiedere sulle gare dove ci sono un solo o pochi partecipanti

url_counts = dataset['_url'].value_counts()

# From url_counts, get the urls where the number of occurrence is less than 2
print(url_counts[url_counts < 6])

_url
tour-de-romandie/1992/stage-6     5
dauphine/1983/prologue            5
tour-de-romandie/1989/stage-1     5
tirreno-adriatico/1996/stage-7    5
volta-a-catalunya/1992/stage-2    5
                                 ..
il-lombardia/1972/result          1
giro-d-italia/2001/stage-14       1
tour-de-suisse/1972/stage-2       1
tour-de-suisse/1972/stage-7       1
tour-de-suisse/1972/stage-6b      1
Name: count, Length: 146, dtype: int64


## Check on 'name' data

Now we consider the `name` column, and check the number of null values and the count the occurrences of each unique value

In [70]:
print('Total number of null values in name column: ' + str(dataset['name'].isnull().sum())
      + ' (' + str(round(dataset['name'].isnull().sum() / len(dataset) * 100, 2)) + '%)')

print('\n\nCount occurrences of each value in name column:')
name_counts = dataset['name'].value_counts()
print(name_counts)

Total number of null values in name column: 0 (0.0%)


Count occurrences of each value in name column:
name
Tour de France                     145500
Giro d'Italia                       95581
Vuelta a España                     89222
Tour de Suisse                      33682
Paris - Nice                        32362
                                    ...  
E3 Saxo Classic                       101
E3 BinckBank Classic                   99
E3 Prijs Vlaanderen - Harelbeke        98
Clásica Ciclista San Sebastian         84
Clásica San Sebastián                  52
Name: count, Length: 61, dtype: int64


We have different values, but no null values

Since it's small, we print all the values

In [71]:
# Print all the names that appear, alphabetically ordered
print(name_counts.index.sort_values())

Index(['Amstel Gold Race', 'Clasica Ciclista San Sebastian',
       'Clásica Ciclista San Sebastian', 'Clásica Ciclista San Sebastián',
       'Clásica San Sebastián', 'Criterium du Dauphiné',
       'Criterium du Dauphiné Libére', 'Critérium du Dauphiné',
       'Critérium du Dauphiné Libéré', 'Donostia San Sebastian Klasikoa',
       'Dwars door België / À travers la Belgique', 'Dwars door Vlaanderen',
       'Dwars door Vlaanderen - A travers la Flandre ME',
       'Dwars door Vlaanderen / A travers la Flandre',
       'Dwars door Vlaanderen / A travers la Flandre ME',
       'E3 BinckBank Classic', 'E3 Harelbeke', 'E3 Prijs Vlaanderen',
       'E3 Prijs Vlaanderen - Harelbeke', 'E3 Saxo Bank Classic',
       'E3 Saxo Classic', 'E3-Prijs Harelbeke', 'Giro d'Italia',
       'Giro di Lombardia', 'Gran Camiño', 'Grand Prix Cycliste de Montréal',
       'Grand Prix Cycliste de Quebec', 'Grand Prix Cycliste de Québec',
       'Il Lombardia', 'Itzulia Basque Country', 'La Flèche Wallonne'

For each url, check if all the `name` values are the same


In [72]:
# Group by '_url' and calculate the number of unique values in the 'name' column
name_uniques = dataset.groupby('_url')['name'].nunique()
# Filter the URLs with more than one unique name
multiple_names_urls = name_uniques[name_uniques > 1].index

print('Number of URLs with more than one unique name: ' + str(len(multiple_names_urls)))

Number of URLs with more than one unique name: 0


In this block we are checking if there are `name` values that contains any incorrect numbers

In [73]:
# Get rows wehere 'name' value contains any number, except for names containing 'E3' (there are some races with E3 in the name)
invalid_rows = dataset[dataset['name'].str.contains(r'\d') & ~dataset['name'].str.contains('E3')]
print('Number of invalid names: ' + str(len(invalid_rows)))

Number of invalid names: 0


## Check on 'points' data

Now we consider the `point` column, and check the number of null values and the count the occurrences of each unique value

In [74]:
print('Total number of null values in points column: ' + str(dataset['points'].isnull().sum())
      + ' (' + str(round(dataset['points'].isnull().sum() / len(dataset) * 100, 2)) + '%)')

print('\nCount occurrences of each value in points column:')
point_counts = dataset['points'].value_counts()
print(point_counts)

Total number of null values in points column: 477 (0.08%)

Count occurrences of each value in points column:
points
80.0     198878
50.0     186102
100.0    141706
275.0     22299
225.0     19536
125.0      5992
30.0       4313
350.0      3917
70.0       3299
75.0       1963
20.0        792
18.0        292
35.0        183
150.0       116
Name: count, dtype: int64


We have not a lot of values, and few null values. Also, we see that every value is sintatically correct

For each url, check if all the `name` values are the same

In [75]:
# Group by '_url' and calculate the number of unique values in the 'points' column
points_uniques = dataset.groupby('_url')['points'].nunique()
# Filter the URLs with more than one unique points
multiple_points_urls = points_uniques[points_uniques > 1].index

print('Number of URLs with more than one unique points: ' + str(len(multiple_points_urls)))

Number of URLs with more than one unique points: 0


We check the urls where `points` is null

In [76]:
# Get unique data based on '_url' and 'points'
unique_data = dataset.drop_duplicates(subset=['_url', 'points'])
# Get rows where 'points' is null
rows = unique_data[unique_data['points'].isnull()]

print('Number of rows with null points: ' + str(len(rows)))
print('\nURLs of the rows with null points:')
print(rows['_url'])

Number of rows with null points: 4

URLs of the rows with null points:
156755    vuelta-a-espana/1994/stage-5
461300    tour-de-france/1986/stage-19
517517    tour-de-france/1988/prologue
561313    tour-de-france/2019/stage-19
Name: _url, dtype: object


## Check on 'uci_points' data

Now we consider the `uci_points` column, and check the number of null values and the count the occurrences of each unique value

In [77]:
print('Total number of null values in uci_points column: ' + str(dataset['uci_points'].isnull().sum())
      + ' (' + str(round(dataset['uci_points'].isnull().sum() / len(dataset) * 100, 2)) + '%)')

print('\nCount occurrences of each value in uci_points column:')
uci_point_counts = dataset['uci_points'].value_counts()
print(uci_point_counts)

Total number of null values in uci_points column: 338779 (57.43%)

Count occurrences of each value in uci_points column:
uci_points
100.0    47640
6.0      43390
16.0     41103
60.0     39317
20.0     21303
120.0    20785
50.0     13266
500.0     6096
40.0      4102
210.0     3449
180.0     3103
400.0     2571
300.0     2057
14.0       792
600.0      675
800.0      514
200.0      338
80.0       328
10.0       148
402.0      109
Name: count, dtype: int64


We have different values, but a lot of null values. Also, we see that every value is sintatically correct

For each url, check if all the `uci_points` values are the same

In [78]:
# Group by '_url' and calculate the number of unique values in the 'uci_points' column
uci_points_uniques = dataset.groupby('_url')['uci_points'].nunique()
# Filter the URLs with more than one unique uci_points
multiple_uci_points_urls = uci_points_uniques[uci_points_uniques > 1].index

print('Number of URLs with more than one unique uci_points: ' + str(len(multiple_uci_points_urls)))

Number of URLs with more than one unique uci_points: 0


We check the urls where `uci_points` is null

In [79]:
# Get unique data based on '_url' and 'uci_points'
unique_data = dataset.drop_duplicates(subset=['_url', 'uci_points'])
# Get rows where 'uci_points' is null
rows = unique_data[unique_data['uci_points'].isnull()]

print('Number of rows with null uci_points: ' + str(len(rows)))
print('\nURLs of the rows with null uci_points:')
print(rows['_url'])

Number of rows with null uci_points: 3682

URLs of the rows with null uci_points:
0             tour-de-france/1978/stage-6
426       volta-a-catalunya/1999/prologue
866          tour-de-france/1978/stage-14
1075      volta-a-catalunya/1981/stage-2b
1084             paris-nice/1994/stage-8b
                       ...               
588250              gp-quebec/2010/result
588712       tour-de-france/1982/stage-21
588837     tirreno-adriatico/1993/stage-3
589397          paris-roubaix/2000/result
589463            paris-nice/1976/stage-2
Name: _url, Length: 3682, dtype: object


## Check on 'length' data

Now we consider the `length` column, and check the number of null values and the count the occurrences of each unique value

In [80]:
print('Total number of null values in length column: ' + str(dataset['length'].isnull().sum())
      + ' (' + str(round(dataset['length'].isnull().sum() / len(dataset) * 100, 2)) + '%)')

print('\nCount occurrences of each value in length column:')
length_counts = dataset['length'].value_counts()
print(length_counts)

Total number of null values in length column: 0 (0.0%)

Count occurrences of each value in length column:
length
177000.0    5039
170000.0    4717
195000.0    4413
200000.0    4401
178000.0    4286
            ... 
4500.0         5
107000.0       5
123100.0       5
142700.0       5
2000.0         3
Name: count, Length: 1280, dtype: int64


We have a lot of values, but no null values

For each url, check if all the `length` values are the same

In [81]:
# Group by '_url' and calculate the number of unique values in the 'length' column
length_uniques = dataset.groupby('_url')['length'].nunique()
# Filter the URLs with more than one unique length
multiple_length_urls = length_uniques[length_uniques > 1].index

print('Number of URLs with more than one unique length: ' + str(len(multiple_length_urls)))

Number of URLs with more than one unique length: 0


Since we have a lot of values, we check if every value is sintatically correct

In [82]:
# Get rows where 'length' is not a digit
invalid_rows = dataset[~dataset['length'].astype(str).str.replace('.', '').str.isdigit()]

print('Number of invalid lengths: ' + str(len(invalid_rows)))

Number of invalid lengths: 0


In [83]:
# Get rows where 'length' does not end with '.0'
invalid_rows = dataset[~dataset['length'].astype(str).str.endswith('.0')]
                                
print('Number of invalid lengths: ' + str(len(invalid_rows)))
for index, row in invalid_rows.iterrows():
    print(row['_url'], row['length'])

Number of invalid lengths: 954
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/20

Check the races where the `length` value is small or large, for possible outliers

In [84]:
# Dataset info, for 'length' column
dataset['length'].describe()

count    589865.000000
mean     166776.180584
std       64545.605664
min        1000.000000
25%      152500.000000
50%      178200.000000
75%      203500.000000
max      338000.000000
Name: length, dtype: float64

In [85]:
n = 2000
# Get rows where 'length' is smaller than n
filtered_data = races_info[races_info['length'] < n]

print('Rows where length is smaller than ' + str(n) + ':')
for index, row in filtered_data.iterrows():
    # Stampa '_url' e i corrispondenti 'length'
    print(index, row['_url'], row['length'])

Rows where length is smaller than 2000:
185494 tour-de-romandie/2008/prologue 1900.0
270557 tour-de-suisse/1991/prologue 1700.0
322394 paris-nice/1972/prologue 1700.0
488565 giro-d-italia/2005/prologue 1150.0
515667 paris-nice/1975/prologue 1700.0
517517 tour-de-france/1988/prologue 1000.0


In [86]:
n = 300000
# Get data where 'length' is greater than n
filtered_data = races_info[races_info['length'] > n]

print('Rows where length is greater than ' + str(n) + ':')
for index, row in filtered_data.iterrows():
    # Stampa '_url' e i corrispondenti 'length'
    print(index, row['_url'], row['length'])

Rows where length is greater than 300000:
34136 milano-sanremo/2020/result 305000.0
539045 tour-de-france/1984/stage-9 338000.0
583898 tour-de-france/1984/stage-21 320500.0
585665 tour-de-france/1990/stage-5 301000.0


## Check on 'climb_total' data

Now we consider the `climb_total` column, and check the number of null values and the count the occurrences of each unique value

In [87]:
print('Total number of null values in climb_total column: ' + str(dataset['climb_total'].isnull().sum())
      + ' (' + str(round(dataset['climb_total'].isnull().sum() / len(dataset) * 100, 2)) + '%)')

print('\nCount occurrences of each value in climb_total column:')
climb_total_counts = dataset['climb_total'].value_counts()
print(climb_total_counts)

Total number of null values in climb_total column: 147045 (24.93%)

Count occurrences of each value in climb_total column:
climb_total
3500.0    3762
2500.0    3261
4000.0    3029
3000.0    2938
5000.0    2592
          ... 
3742.0       9
2525.0       9
1176.0       8
1903.0       5
3128.0       1
Name: count, Length: 2117, dtype: int64


We have a lot of different values, and a lot of null values

For each url, check if all the `climb_total` values are the same


In [88]:
# Group by '_url' and calculate the number of unique values in the 'climb_total' column
climb_total_uniques = dataset.groupby('_url')['climb_total'].nunique()
# Filter the URLs with more than one unique climb_total
multiple_climb_total_urls = climb_total_uniques[climb_total_uniques > 1].index

print('Number of URLs with more than one unique climb_total: ' + str(len(multiple_climb_total_urls)))

Number of URLs with more than one unique climb_total: 0


Since we have a lot of values, we check if every value is sintatically correct

In [89]:
# Get rows where 'climb_total' is not a digit
invalid_rows = dataset[~dataset['climb_total'].astype(str).str.replace('.', '').str.isdigit()].dropna(subset=['climb_total'])

print('Number of invalid climb_total: ' + str(len(invalid_rows)))

Number of invalid climb_total: 0


In [90]:
# Get rows where 'climb_total' does not end with '.0'
invalid_rows = dataset[~dataset['climb_total'].astype(str).str.endswith('.0')].dropna(subset=['climb_total'])

print('Number of invalid climb_total: ' + str(len(invalid_rows)))

Number of invalid climb_total: 0


Check the races where the `climb_total` value is small or large, for possible outliers

In [91]:
# Dataset info, for 'climb_total' column
dataset['climb_total'].describe()

count    442820.000000
mean       2330.469215
std        1375.710722
min           2.000000
25%        1309.000000
50%        2255.000000
75%        3273.000000
max        6974.000000
Name: climb_total, dtype: float64

In [92]:
n = 5
# Get rows where 'climb_total' is smaller than n
filtered_data = races_info[races_info['climb_total'] < n]

print('Rows where climb_total is smaller than ' + str(n) + ':')
for index, row in filtered_data.iterrows():
    # Stampa '_url' e i corrispondenti 'climb_total'
    print(index, row['_url'], row['climb_total'])

Rows where climb_total is smaller than 5:
19745 giro-d-italia/2020/stage-21 3.0
139914 tirreno-adriatico/2023/stage-1 3.0
334667 tirreno-adriatico/2015/stage-1 2.0
338904 tirreno-adriatico/2022/stage-1 4.0


In [93]:
n = 6000
# Get data where 'climb_total' is greater than n
filtered_data = races_info[races_info['climb_total'] > n]

print('Rows where climb_total is greater than ' + str(n) + ':')
for index, row in filtered_data.iterrows():
    # Stampa '_url' e i corrispondenti 'climb_total'
    print(index, row['_url'], row['climb_total'])

Rows where climb_total is greater than 6000:
56822 giro-d-italia/2011/stage-15 6939.0
70884 tour-de-france/1983/stage-18 6589.0
76202 giro-d-italia/2016/stage-14 6001.0
162430 giro-d-italia/2012/stage-20 6068.0
185164 tour-de-france/2007/stage-16 6031.0
205119 tour-de-france/2000/stage-14 6425.0
284292 tour-de-france/1992/stage-13 6974.0
323636 tour-de-france/1978/stage-17 6500.0
337298 volta-a-catalunya/1995/stage-4 6687.0
520518 tour-de-france/1983/stage-17 6500.0
554148 volta-a-catalunya/1996/stage-4 6317.0


## Check on 'profile' data

Now we consider the `profile` column, and check the number of null values and the count the occurrences of each unique value

In [94]:
print('Total number of null values in profile column: ' + str(dataset['profile'].isnull().sum())
      + ' (' + str(round(dataset['profile'].isnull().sum() / len(dataset) * 100, 2)) + '%)')

print('\nCount occurrences of each value in profile column:')
profile_counts = dataset['profile'].value_counts()
print(profile_counts)

Total number of null values in profile column: 148194 (25.12%)

Count occurrences of each value in profile column:
profile
1.0    131344
2.0    128269
5.0     88203
3.0     50844
4.0     43011
Name: count, dtype: int64


We have few different values, but a lot of null values. Also, we see that every value is sintatically correct

For each url, check if all the `profile` values are the same

In [95]:
# Group by '_url' and calculate the number of unique values in the 'profile' column
profile_uniques = dataset.groupby('_url')['profile'].nunique()
# Filter the URLs with more than one unique profile
multiple_profile_urls = profile_uniques[profile_uniques > 1].index

print('Number of URLs with more than one unique profile: ' + str(len(multiple_profile_urls)))

Number of URLs with more than one unique profile: 0


## Check on 'startlist_quality' data

Now we consider the `startlist_quality` column, and check the number of null values and the count the occurrences of each unique value

In [96]:
print('Total number of null values in startlist_quality column: ' + str(dataset['startlist_quality'].isnull().sum())
      + ' (' + str(round(dataset['startlist_quality'].isnull().sum() / len(dataset) * 100, 2)) + '%)')

print('\nCount occurrences of each value in startlist_quality column:')
startlist_quality_counts = dataset['startlist_quality'].value_counts()
print(startlist_quality_counts)

Total number of null values in startlist_quality column: 0 (0.0%)

Count occurrences of each value in startlist_quality column:
startlist_quality
971     8279
1812    7807
1872    7317
1612    7255
920     6715
        ... 
455        5
438        5
394        5
228        3
544        3
Name: count, Length: 697, dtype: int64


We have different values, but no null values.

For each url, check if all the `startlist_quality` values are the same

In [97]:
# Group by '_url' and calculate the number of unique values in the 'startlist_quality' column
startlist_quality_uniques = dataset.groupby('_url')['startlist_quality'].nunique()
# Filter the URLs with more than one unique startlist_quality
multiple_startlist_quality_urls = startlist_quality_uniques[startlist_quality_uniques > 1].index

print('Number of URLs with more than one unique startlist_quality: ' + str(len(multiple_startlist_quality_urls)))

Number of URLs with more than one unique startlist_quality: 0


Since we have a lot of different values, we check if every value is sintatically correct

In [98]:
# Get rows where 'startlist_quality' is not a digit
invalid_rows = dataset[~dataset['startlist_quality'].astype(str).str.replace('.', '').str.isdigit()]

print('Number of invalid startlist_quality: ' + str(len(invalid_rows))) 

Number of invalid startlist_quality: 0


## Check on 'average_temperature' data

Now we consider the `average_temperature` column, and check the number of null values and the count the occurrences of each unique value

In [99]:
print('Total number of null values in average_temperature column: ' + str(dataset['average_temperature'].isnull().sum())
      + ' (' + str(round(dataset['average_temperature'].isnull().sum() / len(dataset) * 100, 2)) + '%)')

print('\nCount occurrences of each value in average_temperature column:')
average_temperature_counts = dataset['average_temperature'].value_counts()
print(average_temperature_counts)

Total number of null values in average_temperature column: 559932 (94.93%)

Count occurrences of each value in average_temperature column:
average_temperature
23.0    2343
22.0    2099
20.0    2055
24.0    1829
25.0    1789
14.0    1499
21.0    1463
26.0    1440
18.0    1435
13.0    1386
15.0    1295
16.0    1213
17.0    1170
28.0    1149
19.0    1080
29.0    1038
31.0    1026
27.0     990
30.0     769
11.0     731
32.0     730
12.0     548
36.0     265
35.0     167
33.0     165
34.0     133
10.0     126
Name: count, dtype: int64


We have different values, and almost all the values are null. Also, we cas see that every value is sintatically correct

For each url, check if all the `average_temperature` values are the same

In [100]:
# Group by '_url' and calculate the number of unique values in the 'average_temperature' column
average_temperature_uniques = dataset.groupby('_url')['average_temperature'].nunique()
# Filter the URLs with more than one unique average_temperature
multiple_average_temperature_urls = average_temperature_uniques[average_temperature_uniques > 1].index

print('Number of URLs with more than one unique average_temperature: ' + str(len(multiple_average_temperature_urls)))

Number of URLs with more than one unique average_temperature: 0


## Check on 'date' data

Now we consider the `date` column, and check the number of null values and the count the occurrences of each unique value. We do this considerig the merged dataset where we don't have time value of the date

In [101]:
print('Total number of null values in date column: ' + str(merged_dataset['date'].isnull().sum())
      + ' (' + str(round(merged_dataset['date'].isnull().sum() / len(merged_dataset) * 100, 2)) + '%)')

print('\nCount occurrences of each value in date column:')
date_counts = merged_dataset['date'].value_counts()
print(date_counts)

Total number of null values in date column: 0 (0.0%)

Count occurrences of each value in date column:
date
1998-06-18    407
1987-07-05    404
2006-05-15    379
2006-05-16    377
2023-02-25    376
             ... 
1972-06-23      1
1972-06-21      1
1972-06-17      1
1972-06-22      1
2001-06-02      1
Name: count, Length: 4708, dtype: int64


We have different values, but no null values.

For each url, check if all the `date` values are the same

In [102]:
# Group by '_url' and calculate the number of unique values in the 'date' column
date_uniques = dataset.groupby('_url')['date'].nunique()
# Filter the URLs with more than one unique date
multiple_date_urls = date_uniques[date_uniques > 1].index

print('Number of URLs with more than one unique date: ' + str(len(multiple_date_urls)))

Number of URLs with more than one unique date: 4777


Since we have a lot of different values, we check if every value is sintatically correct

In [103]:
# Check if there are any 'date' values in the format yyyy-mm-dd hh:mm:ss (in the races dataset)
invalid_rows = dataset[~dataset['date'].str.match(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')]
print('Number of invalid dates (format yyyy-mm-dd hh:mm:ss, in the races dataset): ' + str(len(invalid_rows)))

# Check if there are any 'date' values not in the format yyyy-mm-dd (in the merged dataset)
invalid_rows = merged_dataset[~merged_dataset['date'].str.match(r'\d{4}-\d{2}-\d{2}')]
print('Number of invalid dates (format yyyy-mm-dd, in the merged dataset): ' + str(len(invalid_rows)))

Number of invalid dates (format yyyy-mm-dd hh:mm:ss, in the races dataset): 0
Number of invalid dates (format yyyy-mm-dd, in the merged dataset): 0


Check if the year is the same in both the `_url` and the `date`

In [104]:
# Split _url by / into name, year, and stage
url_split = dataset['_url'].str.split('/', expand=True) # expand=True to return a DataFrame
# Extract the year from the date column (assuming format yyyy-mm-dd hh:mm:ss)
date_year = dataset['date'].str[:4]
# Compare the year in the _url (from the second part of the split) with the year in the date
mismatched_years = dataset[(url_split[1] != date_year)]

# Print the number of rows where the year does not match
print(f"Number of rows where the year in the url does not match the year in the date: {len(mismatched_years)}")


Number of rows where the year in the url does not match the year in the date: 0


## Check on 'position' data

Now we consider the `position` column, and check the number of null values and the count the occurrences of each unique value

In [105]:
print('Total number of null values in position column: ' + str(dataset['position'].isnull().sum())
      + ' (' + str(round(dataset['position'].isnull().sum() / len(dataset) * 100, 2)) + '%)')

print('\nCount occurrences of each value in position column:')
position_counts = dataset['position'].value_counts()
print(position_counts)

Total number of null values in position column: 0 (0.0%)

Count occurrences of each value in position column:
position
0      5281
1      5275
2      5273
3      5267
4      5255
       ... 
205       5
206       3
207       2
208       2
209       2
Name: count, Length: 210, dtype: int64


We have different values, and no null values.

Since we have different values, we check if every value is sintatically correct

In [106]:
# Check if there are any 'position' value that is not a digit
invalid_rows = dataset[~dataset['position'].astype(str).str.replace('.', '').str.isdigit()]

print('Number of invalid positions: ' + str(len(invalid_rows)))

Number of invalid positions: 0


For each url, check if there are all the `position` values

In [107]:
# Function to check if the positions are from 0 to the max one after the other
def check_positions(positions):
    return np.array_equal(np.sort(positions), np.arange(positions.max() + 1))

# Apply the function to the dataset
invalid_urls = dataset.groupby('_url')['position'].apply(lambda x: not check_positions(x))

# Stampa gli '_url' che non rispettano la condizione
print('Number of URLs with invalid positions: ' + str(len(invalid_urls[invalid_urls])))


Number of URLs with invalid positions: 0


## Check on 'cyclist' data

Now we consider the `cyclist` column, and check the number of null values and the count the occurrences of each unique value

In [108]:
print('Total number of null values in cyclist column: ' + str(dataset['cyclist'].isnull().sum())
      + ' (' + str(round(dataset['cyclist'].isnull().sum() / len(dataset) * 100, 2)) + '%)')

print('\nCount occurrences of each value in cyclist column:')
cyclist_counts = dataset['cyclist'].value_counts()
print(cyclist_counts)

Total number of null values in cyclist column: 0 (0.0%)

Count occurrences of each value in cyclist column:
cyclist
matteo-tosatto         959
alejandro-valverde     942
luis-leon-sanchez      899
imanol-erviti          883
haimar-zubeldia        883
                      ... 
timothy-vangheel         1
matthias-friedemann      1
stefan-rucker            1
john-brouwer             1
stijn-ennekens           1
Name: count, Length: 6095, dtype: int64


We have lots of different values, but no null values.

We check if all the cyclists are different in the same race

In [109]:
# For each url, check if the a cyclist appears more than once
url_cyclist_count = dataset.groupby('_url')['cyclist'].value_counts()
invalid_entries = url_cyclist_count[url_cyclist_count > 1]

print('Number of URLs with a cyclist appearing more than once: ' + str(len(invalid_entries.index.get_level_values(0))))

# Estrai gli _url e i ciclisti che compaiono più volte
for (url, cyclist), count in invalid_entries.items():
    print(f"URL: {url}, Cyclist: {cyclist}, Count: {count}")

Number of URLs with a cyclist appearing more than once: 123
URL: dauphine/2005/stage-1, Cyclist: ivan-gutierrez, Count: 2
URL: dauphine/2005/stage-2, Cyclist: ivan-gutierrez, Count: 2
URL: dauphine/2005/stage-3, Cyclist: ivan-gutierrez, Count: 2
URL: dauphine/2005/stage-4, Cyclist: ivan-gutierrez, Count: 2
URL: dauphine/2005/stage-5, Cyclist: ivan-gutierrez, Count: 2
URL: dauphine/2005/stage-6, Cyclist: ivan-gutierrez, Count: 2
URL: dauphine/2005/stage-7, Cyclist: ivan-gutierrez, Count: 2
URL: dauphine/2012/stage-3, Cyclist: david-moncoutie, Count: 2
URL: itzulia-basque-country/2000/stage-1, Cyclist: alberto-david-fernandez, Count: 2
URL: itzulia-basque-country/2000/stage-2, Cyclist: alberto-david-fernandez, Count: 2
URL: itzulia-basque-country/2000/stage-2, Cyclist: david-vazquez, Count: 2
URL: itzulia-basque-country/2000/stage-2, Cyclist: miguel-angel-pena, Count: 2
URL: itzulia-basque-country/2000/stage-3, Cyclist: miguel-angel-pena, Count: 2
URL: itzulia-basque-country/2000/stage-4

Considering the two datasets, we check if all the cyclists in cyclists.csv are in races.csv, and vice versa.

In [110]:
# Check if there are any 'cyclist' values in the races dataset that are not in the cyclists dataset
invalid_rows = dataset[~dataset['cyclist'].isin(dataset_cyclists['_url'])]

print('Number of cyclists with no info: ' + str(len(invalid_rows)))

Number of cyclists with no info: 0


In [111]:
# Check if there are any '_url' values in the cyclists dataset that are not in the races dataset
invalid_rows = dataset_cyclists[~dataset_cyclists['_url'].isin(dataset['cyclist'])]

print('Number of cyclists that are not in any race: ' + str(len(invalid_rows)))
for index, row in invalid_rows.iterrows():
    print(row['_url'], row['name'])

Number of cyclists that are not in any race: 39
jean-michel-thilloy Jean-Michel  Thilloy
gert-van-brabant Gert Van Brabant
roman-bronis Roman  Broniš
oleg-grishkine Oleg  Grishkine
eddy-torrekens Eddy  Torrekens
philipp-ludescher Philipp  Ludescher
nicolas-liboreau Nicolas  Liboreau
gino-primo Gino  Primo
luca-braidot Luca  Braidot
tanner-putt Tanner  Putt
matteo-di-serafino Matteo Di Serafino
jeanot-deriemaecker Jeanot  Deriemaecker
dorian-de-maeght Dorian De Maeght
martin-gilbert Martin  Gilbert
eric-schoefs Eric  Schoefs
silvere-ackermann Silvère  Ackermann
franck-morelle Franck  Morelle
christian-mager Christian  Mager
rikkie-matthijssens Rikkie  Matthijssens
marat-ganeev Marat  Ganeev
bas-tietema Bas  Tietema
mattia-viel Mattia  Viel
hiroki-nishimura Hiroki  Nishimura
christophe-premont Christophe  Premont
kurt-van-landeghem Kurt van Landeghem
lenaic-olivier Lénaïc  Olivier
arturo-gravalos-lopez Arturo  Grávalos
morten-hegreberg Morten  Hegreberg
rik-claeys Rik  Claeys
pascal-duez

## Check on 'cyclist_age' data

Now we consider the `cyclist_age` column, and check the number of null values and the count the occurrences of each unique value

In [112]:
print('Total number of null values in cyclist_age column: ' + str(dataset['cyclist_age'].isnull().sum())
      + ' (' + str(round(dataset['cyclist_age'].isnull().sum() / dataset.shape[0] * 100, 2)) + '%)')

print('\nCount occurrences of each value in cyclist_age column:')
cyclist_age_counts = dataset['cyclist_age'].value_counts()
print(cyclist_age_counts)

Total number of null values in cyclist_age column: 113 (0.02%)

Count occurrences of each value in cyclist_age column:
cyclist_age
27.0    58897
26.0    57921
28.0    56213
25.0    54346
29.0    52616
30.0    46860
24.0    43252
31.0    40827
32.0    35063
23.0    30009
33.0    27972
34.0    21965
35.0    16413
22.0    13531
36.0    12111
37.0     8141
38.0     4703
21.0     3733
39.0     2470
40.0     1127
20.0      842
41.0      307
42.0      218
43.0       96
19.0       90
45.0       16
44.0       11
56.0        1
13.0        1
Name: count, dtype: int64


We have different values, and just a few of null values. Also, we see that every value is sintatically correct

In [113]:
# For each data, print '_url' where 'cyclist_age' is NaN
for index, row in dataset[dataset['cyclist_age'].isnull()].iterrows():
    print(row['_url'])

giro-d-italia/2019/stage-14
vuelta-a-espana/2020/stage-7
amstel-gold-race/2018/result
vuelta-a-espana/2020/stage-11
paris-nice/2019/stage-3
ronde-van-vlaanderen/1990/result
giro-d-italia/2019/stage-16
la-fleche-wallone/1997/result
vuelta-a-espana/1985/stage-12
vuelta-a-espana/1985/stage-12
vuelta-a-espana/1985/stage-12
vuelta-a-espana/1985/stage-12
vuelta-a-espana/1985/stage-19
vuelta-a-espana/1985/stage-19
vuelta-a-espana/1985/stage-19
vuelta-a-espana/2020/stage-4
giro-d-italia/2019/stage-9
giro-d-italia/2019/stage-18
tour-de-romandie/1993/stage-1
vuelta-a-espana/1985/stage-18
vuelta-a-espana/1985/stage-18
vuelta-a-espana/1985/stage-18
vuelta-a-espana/1985/stage-18
vuelta-a-espana/1985/stage-5
vuelta-a-espana/1985/stage-5
paris-nice/2019/stage-1
giro-d-italia/2019/stage-5
dauphine/2000/prologue
giro-d-italia/2019/stage-3
giro-d-italia/2019/stage-13
vuelta-a-espana/1985/stage-17
vuelta-a-espana/1985/stage-17
vuelta-a-espana/1985/stage-17
vuelta-a-espana/1985/stage-17
vuelta-a-espana/19

Now, we check if there we can take some age information from the dataset

In [114]:
# Check if there are any 'cyclist_age' null values, where we have the year of birth in the cyclists dataset
invalid_rows = merged_dataset[merged_dataset['cyclist_age'].isnull() & merged_dataset['birth_year'].notnull()]

print('Number of cyclists with age info in cyclists dataset: ' + str(len(invalid_rows)))

Number of cyclists with age info in cyclists dataset: 0


In [148]:
# Check for cyclists whose age is partially missing (some values present, some missing)
partial_missing_age = merged_dataset.groupby('_url_cyclist')['cyclist_age'].apply(lambda x: x.isnull().any() and x.notnull().any())

# Print the number of cyclists with partial missing age data
print('Number of cyclists with partially missing age: ' + str(len(partial_missing_age[partial_missing_age])))


Number of cyclists with partially missing age: 0


Check the races where the `cyclist_age` value is small or large, for possible outliers

In [116]:
# Dataset info, for 'cyclist_age' column
dataset['cyclist_age'].describe()

count    589752.000000
mean         28.486208
std           3.855631
min          13.000000
25%          26.000000
50%          28.000000
75%          31.000000
max          56.000000
Name: cyclist_age, dtype: float64

In [117]:
n = 18
# Get rows where 'cyclist_age' is smaller than n
filtered_data = dataset[dataset['cyclist_age'] < n]

print('Rows where cyclist age is smaller than ' + str(n) + ':')
for index, row in filtered_data.iterrows():
    # Stampa '_url' e i corrispondenti 'cyclist_age'
    print(index, row['_url'], row['cyclist_age'])

Rows where cyclist age is smaller than 18:
294489 san-sebastian/2001/result 13.0


In [118]:
n = 50
# Get data where 'cyclist_age' is greater than n
filtered_data = dataset[dataset['cyclist_age'] > n]

print('Rows where cyclist_age is greater than ' + str(n) + ':')
for index, row in filtered_data.iterrows():
    # Stampa '_url' e i corrispondenti 'cyclist_age'
    print(index, row['_url'], row['cyclist_age'])

Rows where cyclist_age is greater than 50:
114525 vuelta-a-espana/1996/stage-1 56.0


## Check on 'is_tarmac' data

Now we consider the `is_tarmac` column, and check the number of null values and the count the occurrences of each unique value

In [119]:
print('Total number of null values in is_tarmac column: ' + str(dataset['is_tarmac'].isnull().sum())
      + ' (' + str(round(dataset['is_tarmac'].isnull().sum() / len(dataset) * 100, 2)) + '%)')

print('\nCount occurrences of each value in is_tarmac column:')
is_tarmac_counts = dataset['is_tarmac'].value_counts()
print(is_tarmac_counts)

Total number of null values in is_tarmac column: 0 (0.0%)

Count occurrences of each value in is_tarmac column:
is_tarmac
True     536042
False     53823
Name: count, dtype: int64


We have two different values, and no null values. Also, we see that every value is sintatically correct

For each url, check if all the `is_tarmac` values are the same

In [120]:
# Group by '_url' and calculate the number of unique values in the 'is_tarmac' column
is_tarmac_uniques = dataset.groupby('_url')['is_tarmac'].nunique()
# Filter the URLs with more than one unique is_tarmac
multiple_is_tarmac_urls = is_tarmac_uniques[is_tarmac_uniques > 1].index

print('Number of URLs with more than one unique is_tarmac: ' + str(len(multiple_is_tarmac_urls)))

Number of URLs with more than one unique is_tarmac: 0


## Check on 'is_cobbled' data

Now we consider the `is_cobbled` column, and check the number of null values and the count the occurrences of each unique value

In [121]:
print('Total number of null values in is_cobbled column: ' + str(dataset['is_cobbled'].isnull().sum())
      + ' (' + str(round(dataset['is_cobbled'].isnull().sum() / dataset.shape[0] * 100, 2)) + '%)')

print('\nCount occurrences of each value in is_cobbled column:')
is_cobbled_counts = dataset['is_cobbled'].value_counts()
print(is_cobbled_counts)

Total number of null values in is_cobbled column: 0 (0.0%)

Count occurrences of each value in is_cobbled column:
is_cobbled
False    589865
Name: count, dtype: int64


We have one value, and no null values. Also, we see that the value is sintatically correct

## Check on 'is_gravel' data

Now we consider the `is_gravel` column, and check the number of null values and the count the occurrences of each unique value

In [122]:
print('Total number of null values in is_gravel column: ' + str(dataset['is_gravel'].isnull().sum())
      + ' (' + str(round(dataset['is_gravel'].isnull().sum() / dataset.shape[0] * 100, 2)) + '%)')

print('\nCount occurrences of each value in is_gravel column:')
is_gravel_counts = dataset['is_gravel'].value_counts()
print(is_gravel_counts)

Total number of null values in is_gravel column: 0 (0.0%)

Count occurrences of each value in is_gravel column:
is_gravel
False    589865
Name: count, dtype: int64


We have one value, and no null values. Also, we see that the value is sintatically correct

## Check on 'cyclist_team' data

Now we consider the `cyclist_team` column, and check the number of null values and the count the occurrences of each unique value

In [123]:
# Print total number of null values in 'delta' column, and the percentage of null values (float with two decimal digits after the comma)
print('Total number of null values in cyclist_team column: ' + str(dataset['cyclist_team'].isnull().sum())
      + ' (' + str(round(dataset['cyclist_team'].isnull().sum() / dataset.shape[0] * 100, 2)) + '%)')

print('\nCount occurrences of each value in cyclist_team column:')
cyclist_team_counts = dataset['cyclist_team'].value_counts()
print(cyclist_team_counts)

Total number of null values in cyclist_team column: 159161 (26.98%)

Count occurrences of each value in cyclist_team column:
cyclist_team
liberty-seguros-wurth-team-2005     8869
roompot-nederlandse-loterij-2018    8773
chazal-vetta-mbk-1993               8094
kondor-1979                         7895
kazakhstan-2019                     7701
                                    ... 
atala-ofmega-1988                   1259
finland-2016                        1236
south-africa-1993                   1174
denmark-2003                         216
quickstep-innergetic-2009              3
Name: count, Length: 91, dtype: int64


We have different values, and a lot of null values.

Since we have a lot of different values, we check if every value is sintatically correct

In [129]:
# Check if every 'cyclist_team' value matches the required format
invalid_teams = dataset[dataset['cyclist_team'].notnull() & ~dataset['cyclist_team'].astype(str).str.match(r'.+-\d{4}')]

print('Number of invalid rows: ' + str(len(invalid_teams)))

Number of invalid rows: 0


Check if the same cyclist is in two cyclist teams

In [158]:
# Check if the same cyclist is in two cyclist teams
invalid_teams = dataset.groupby('cyclist')['cyclist_team'].nunique()
invalid_teams = invalid_teams[invalid_teams > 1]

print('Number of cyclists in multiple teams: ' + str(len(invalid_teams)))
for cyclist, count in invalid_teams.items():
    print(f"Cyclist: {cyclist}, Count: {count}")

Number of cyclists in multiple teams: 2350
Cyclist: aaron-gate, Count: 2
Cyclist: aaron-van-poucke, Count: 3
Cyclist: aaron-verwilst, Count: 2
Cyclist: abner-gonzalez-rivera, Count: 3
Cyclist: abraham-olano-manzano, Count: 10
Cyclist: ad-wijnands, Count: 9
Cyclist: adam-blythe, Count: 8
Cyclist: adam-de-vos, Count: 3
Cyclist: adam-hansen, Count: 10
Cyclist: adam-yates, Count: 9
Cyclist: addy-engels, Count: 10
Cyclist: adrian-honkisz, Count: 4
Cyclist: adrian-kurek, Count: 3
Cyclist: adrian-palomares, Count: 5
Cyclist: adrian-saez-de-arregi, Count: 2
Cyclist: adriano-baffi, Count: 13
Cyclist: adriano-malori, Count: 5
Cyclist: adrie-van-der-poel, Count: 13
Cyclist: adrien-niyonshuti, Count: 2
Cyclist: adrien-petit, Count: 10
Cyclist: agustin-tamames, Count: 2
Cyclist: aidis-kruopis, Count: 5
Cyclist: aime-de-gendt, Count: 7
Cyclist: aitor-garmendia, Count: 12
Cyclist: aitor-gonzalez-jimenez, Count: 7
Cyclist: alain-santy, Count: 4
Cyclist: alan-jousseaume, Count: 2
Cyclist: alan-lezaun-p

## Check on 'delta' data

Now we consider the `delta` column, and check the number of null values and the count the occurrences of each unique value

In [130]:
print('Total number of null values in delta column: ' + str(dataset['delta'].isnull().sum())
        + ' (' + str(round(dataset['delta'].isnull().sum() / dataset.shape[0] * 100, 2)) + '%)')

print('\nCount occurrences of each value in delta column:')
delta_counts = dataset['delta'].value_counts()
print(delta_counts)

Total number of null values in delta column: 0 (0.0%)

Count occurrences of each value in delta column:
delta
0.0       120546
2.0         3700
5.0         3353
3.0         3178
4.0         3036
           ...  
2453.0         1
2357.0         1
2889.0         1
2718.0         1
2527.0         1
Name: count, Length: 2836, dtype: int64


We have lots of different values, but no null values.

Since we have a lot of different values, we check if every value is sintatically correct

In [131]:
# Check if there are any 'delta' values that do not end with '.0'
invalid_rows = dataset[~dataset['delta'].astype(str).str.endswith('.0')]

print('Number of invalid deltas: ' + str(len(invalid_rows)))

Number of invalid deltas: 0


In [133]:
# Check if there are any 'delta' values that are not a digit
invalid_rows = dataset[~dataset['delta'].astype(str).str.replace('.', '').str.isdigit()]

print('Number of invalid deltas: ' + str(len(invalid_rows)))
for index, row in invalid_rows.iterrows():
    print(row['_url'], row['delta'])

Number of invalid deltas: 86
vuelta-a-espana/1992/stage-19 -2635.0
vuelta-a-espana/1992/stage-19 -2638.0
vuelta-a-espana/1992/stage-19 -2541.0
vuelta-a-espana/1992/stage-19 -2542.0
vuelta-a-espana/1992/stage-19 -2545.0
vuelta-a-espana/1992/stage-19 -2546.0
vuelta-a-espana/1992/stage-19 -2550.0
vuelta-a-espana/1992/stage-19 -2560.0
vuelta-a-espana/1992/stage-19 -2564.0
vuelta-a-espana/1992/stage-19 -2567.0
vuelta-a-espana/1992/stage-19 -2567.0
vuelta-a-espana/1992/stage-19 -2574.0
vuelta-a-espana/1992/stage-19 -2469.0
vuelta-a-espana/1992/stage-19 -2473.0
vuelta-a-espana/1992/stage-19 -2475.0
vuelta-a-espana/1992/stage-19 -2477.0
vuelta-a-espana/1992/stage-19 -2479.0
vuelta-a-espana/1992/stage-19 -2481.0
vuelta-a-espana/1992/stage-19 -2482.0
vuelta-a-espana/1992/stage-19 -2485.0
vuelta-a-espana/1992/stage-19 -2486.0
vuelta-a-espana/1992/stage-19 -2486.0
vuelta-a-espana/1992/stage-19 -2487.0
vuelta-a-espana/1992/stage-19 -2500.0
vuelta-a-espana/1992/stage-19 -2504.0
vuelta-a-espana/1992/

In [134]:
# Check if there are any 'delta' values that are negative
invalid_rows = dataset[dataset['delta'] < 0]

print('Number of negative deltas: ' + str(len(invalid_rows)))
for index, row in invalid_rows.iterrows():
    print(row['_url'], row['delta'])

Number of negative deltas: 86
vuelta-a-espana/1992/stage-19 -2635.0
vuelta-a-espana/1992/stage-19 -2638.0
vuelta-a-espana/1992/stage-19 -2541.0
vuelta-a-espana/1992/stage-19 -2542.0
vuelta-a-espana/1992/stage-19 -2545.0
vuelta-a-espana/1992/stage-19 -2546.0
vuelta-a-espana/1992/stage-19 -2550.0
vuelta-a-espana/1992/stage-19 -2560.0
vuelta-a-espana/1992/stage-19 -2564.0
vuelta-a-espana/1992/stage-19 -2567.0
vuelta-a-espana/1992/stage-19 -2567.0
vuelta-a-espana/1992/stage-19 -2574.0
vuelta-a-espana/1992/stage-19 -2469.0
vuelta-a-espana/1992/stage-19 -2473.0
vuelta-a-espana/1992/stage-19 -2475.0
vuelta-a-espana/1992/stage-19 -2477.0
vuelta-a-espana/1992/stage-19 -2479.0
vuelta-a-espana/1992/stage-19 -2481.0
vuelta-a-espana/1992/stage-19 -2482.0
vuelta-a-espana/1992/stage-19 -2485.0
vuelta-a-espana/1992/stage-19 -2486.0
vuelta-a-espana/1992/stage-19 -2486.0
vuelta-a-espana/1992/stage-19 -2487.0
vuelta-a-espana/1992/stage-19 -2500.0
vuelta-a-espana/1992/stage-19 -2504.0
vuelta-a-espana/1992

Check if following the `positon` order, the delta is ordered too

In [156]:
# Grouped by '_url', check if the 'delta' value is greater or equal than the previous one (the one in the previous row, except for the first one)
invalid_urls = dataset.groupby('_url')['delta'].apply(lambda x: (x.shift() > x).any())

# Print the number of URLs with invalid 'delta' values
print('Number of URLs with invalid deltas: ' + str(len(invalid_urls[invalid_urls])))
for url in invalid_urls[invalid_urls].index:
    print(url)

Number of URLs with invalid deltas: 346
dauphine/2006/stage-2
dauphine/2007/stage-7
dauphine/2008/stage-2
dauphine/2008/stage-3
dauphine/2008/stage-6
dauphine/2008/stage-7
dauphine/2009/stage-4
dauphine/2009/stage-6
dauphine/2011/stage-2
dauphine/2015/stage-2
dauphine/2016/stage-4
dauphine/2017/stage-1
dauphine/2022/stage-1
dauphine/2022/stage-2
dauphine/2023/stage-1
dauphine/2023/stage-3
dauphine/2023/stage-5
dwars-door-vlaanderen/2003/result
e3-harelbeke/1976/result
e3-harelbeke/2023/result
giro-d-italia/1991/stage-10
giro-d-italia/1992/stage-3
giro-d-italia/1994/stage-8
giro-d-italia/1995/stage-14
giro-d-italia/1995/stage-17
giro-d-italia/1995/stage-5
giro-d-italia/1995/stage-8
giro-d-italia/1997/stage-13
giro-d-italia/1997/stage-3
giro-d-italia/1999/stage-7
giro-d-italia/2002/stage-2
giro-d-italia/2002/stage-3
giro-d-italia/2002/stage-4
giro-d-italia/2003/stage-1
giro-d-italia/2005/prologue
giro-d-italia/2005/stage-1
giro-d-italia/2005/stage-15
giro-d-italia/2005/stage-2
giro-d-ita