# Checks of races data

In [1]:
# Autoreload lets the notebook pick up code changes dynamically: if helper functions are edited *outside* of the notebook, the notebook does not need to be reloaded.
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import re
import unicodedata

We load the dataset from a CSV file and display the first few rows to get an initial understanding of the data. This helps us verify that the data has been loaded correctly and gives us a glimpse of its structure and contents.

In [4]:
# Path of the races CSV, relative to the notebook's working directory
csv_file = "../data/races.csv"
# Read the CSV into the DataFrame used by every check below
dataset = pd.read_csv(csv_file)
# Preview the first rows to confirm the file was parsed
dataset.head()

Unnamed: 0,_url,name,points,uci_points,length,climb_total,profile,startlist_quality,average_temperature,date,position,cyclist,cyclist_age,is_tarmac,is_cobbled,is_gravel,cyclist_team,delta
0,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,0,sean-kelly,22.0,True,False,False,vini-ricordi-pinarello-sidermec-1986,0.0
1,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,1,gerrie-knetemann,27.0,True,False,False,norway-1987,0.0
2,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,2,rene-bittinger,24.0,True,False,False,,0.0
3,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,3,joseph-bruyere,30.0,True,False,False,navigare-blue-storm-1993,0.0
4,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,4,sven-ake-nilsson,27.0,True,False,False,spain-1991,0.0


## Initial Info

Now we provide a concise summary of the DataFrame, including the number of non-null entries, data types of each column, and memory usage. It helps us quickly identify missing values and understand the overall structure of the dataset.

In [3]:
# Print the per-column non-null counts and dtypes of the DataFrame
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589865 entries, 0 to 589864
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   _url                 589865 non-null  object 
 1   name                 589865 non-null  object 
 2   points               589388 non-null  float64
 3   uci_points           251086 non-null  float64
 4   length               589865 non-null  float64
 5   climb_total          442820 non-null  float64
 6   profile              441671 non-null  float64
 7   startlist_quality    589865 non-null  int64  
 8   average_temperature  29933 non-null   float64
 9   date                 589865 non-null  object 
 10  position             589865 non-null  int64  
 11  cyclist              589865 non-null  object 
 12  cyclist_age          589752 non-null  float64
 13  is_tarmac            589865 non-null  bool   
 14  is_cobbled           589865 non-null  bool   
 15  is_gravel        

We also generate descriptive statistics for the numerical columns in the DataFrame. They include the count, mean, standard deviation, minimum, and maximum values, as well as the 25th, 50th, and 75th percentiles. This summary helps us understand the distribution and central tendency of the data.

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical columns
dataset.describe()

Unnamed: 0,points,uci_points,length,climb_total,profile,startlist_quality,average_temperature,position,cyclist_age,delta
count,589388.0,251086.0,589865.0,442820.0,441671.0,589865.0,29933.0,589865.0,589752.0,589865.0
mean,89.221635,74.601547,166776.180584,2330.469215,2.611611,1101.161178,21.731768,74.219491,28.486208,418.292794
std,54.43533,100.947962,64545.605664,1375.710722,1.491741,380.586928,5.884761,48.404023,3.855631,842.961596
min,18.0,6.0,1000.0,2.0,1.0,115.0,10.0,0.0,13.0,-6906.0
25%,50.0,16.0,152500.0,1309.0,1.0,844.0,17.0,32.0,26.0,10.0
50%,80.0,60.0,178200.0,2255.0,2.0,988.0,22.0,70.0,28.0,156.0
75%,100.0,100.0,203500.0,3273.0,4.0,1309.0,26.0,112.0,31.0,624.0
max,350.0,800.0,338000.0,6974.0,5.0,2047.0,36.0,209.0,56.0,61547.0


We use the `value_counts()` method to count the occurrences of each unique value in a given column of the DataFrame.

In [11]:
# Count how many times each distinct value occurs in one column.
# Set `column_to_count` to any column of `dataset` to inspect it; the output saved
# with this cell corresponds to 'cyclist_team'.
column_to_count = 'cyclist_team'
dataset[column_to_count].value_counts()


cyclist_team
liberty-seguros-wurth-team-2005     8869
roompot-nederlandse-loterij-2018    8773
chazal-vetta-mbk-1993               8094
kondor-1979                         7895
kazakhstan-2019                     7701
                                    ... 
atala-ofmega-1988                   1259
finland-2016                        1236
south-africa-1993                   1174
denmark-2003                         216
quickstep-innergetic-2009              3
Name: count, Length: 91, dtype: int64

## Check on '_url' data

In this block we check if there are `_url` values that are not in the form name/year/stage

In [5]:
# Check that every '_url' has the form name/year/stage.
# Only the distinct, non-null urls are inspected, so each malformed url is printed once,
# followed by the running count of malformed urls found so far.
i = 0
for url_value in dataset['_url'].dropna().unique():
    # Split url by / in name, year and stage
    url = url_value.split('/')
    if len(url) == 3:
        name = url[0].replace('-', '')
        year = url[1]
        # Well formed: name contains only letters (hyphens ignored) and year contains only digits
        if name.isalpha() and year.isdigit():
            continue
    i += 1
    print(url_value, i)

## Check on 'name' data

In this block we check whether any `name` value contains digits that should not be there (race names that legitimately include "E3" are excluded).

In [71]:
# Report every row whose 'name' contains a digit, skipping the race names that are known to include 'E3'
names_allowed_to_have_digits = {
    'E3-Prijs Harelbeke',
    'E3 Harelbeke',
    'E3 Prijs Vlaanderen',
    'E3 Saxo Bank Classic',
    'E3 Prijs Vlaanderen - Harelbeke',
    'Record Bank E3 Harelbeke',
    'E3 BinckBank Classic',
    'E3 Saxo Classic',
}
for race_url, race_name in zip(dataset['_url'], dataset['name']):
    # Missing names and allow-listed names are not inspected
    if pd.isnull(race_name) or race_name in names_allowed_to_have_digits:
        continue
    if any(character.isdigit() for character in race_name):
        print(race_url, race_name)

In [19]:
# Check that all rows of the same '_url' share one 'name'
nonNull_dataset = dataset.dropna(subset=['name']) # Drop rows with NaN values in 'name' column

# Group once by url instead of re-filtering the whole frame for every url.
# sort=False keeps the urls in order of first appearance; groupby skips NaN urls.
for url, names in nonNull_dataset.groupby('_url', sort=False)['name']:
    distinct_names = names.unique()
    if len(distinct_names) != 1:
        print(url, distinct_names)

## Check on 'points' data

In [77]:
# Report rows whose non-null 'points' value, rendered as text, has no '.' in it
for race_url, points_value in zip(dataset['_url'], dataset['points']):
    if pd.isnull(points_value):
        continue
    if '.' not in str(points_value):
        print(race_url, points_value)

In [81]:
# Check that every non-null 'points' value is a non-negative whole number.
# The test is numeric rather than textual: dropping the last two characters of
# str(value) and calling isdigit() would accept a value such as 12.5 (leaving '12').
points = dataset['points']
invalid = points.notna() & ((points < 0) | (points % 1 != 0))
for url, value in zip(dataset.loc[invalid, '_url'], points[invalid]):
    print(url, value)

In [26]:
# Keep one row per distinct ('_url', 'points') pair
unique_data = dataset.drop_duplicates(subset=['_url', 'points'])

# Print the '_url' of every remaining row whose 'points' is NaN
for race_url in unique_data.loc[unique_data['points'].isna(), '_url']:
    print(race_url)

vuelta-a-espana/1994/stage-5
tour-de-france/1986/stage-19
tour-de-france/1988/prologue
tour-de-france/2019/stage-19


In [35]:
# Check that all rows of the same '_url' share one 'points' value
nonNull_dataset = dataset.dropna(subset=['points']) # Drop rows with NaN values in 'points' column

# Group once by url instead of re-filtering the whole frame for every url.
# sort=False keeps the urls in order of first appearance; groupby skips NaN urls.
for url, values in nonNull_dataset.groupby('_url', sort=False)['points']:
    distinct_values = values.unique()
    if len(distinct_values) != 1:
        print(url, distinct_values)

## Check on 'uci_points' data

In [78]:
# Report rows whose non-null 'uci_points' value, rendered as text, has no '.' in it
for race_url, uci_value in zip(dataset['_url'], dataset['uci_points']):
    if pd.isnull(uci_value):
        continue
    if '.' not in str(uci_value):
        print(race_url, uci_value)

In [82]:
# Check that every non-null 'uci_points' value is a non-negative whole number.
# The test is numeric rather than textual: dropping the last two characters of
# str(value) and calling isdigit() would accept a value such as 12.5 (leaving '12').
uci_points = dataset['uci_points']
invalid = uci_points.notna() & ((uci_points < 0) | (uci_points % 1 != 0))
for url, value in zip(dataset.loc[invalid, '_url'], uci_points[invalid]):
    print(url, value)

In [27]:
# Keep one row per distinct ('_url', 'uci_points') pair
unique_data = dataset.drop_duplicates(subset=['_url', 'uci_points'])

# Print the '_url' of every remaining row whose 'uci_points' is NaN
for race_url in unique_data.loc[unique_data['uci_points'].isna(), '_url']:
    print(race_url)

tour-de-france/1978/stage-6
volta-a-catalunya/1999/prologue
tour-de-france/1978/stage-14
volta-a-catalunya/1981/stage-2b
paris-nice/1994/stage-8b
tirreno-adriatico/1984/stage-1
dauphine/1991/stage-2
ronde-van-vlaanderen/1980/result
milano-sanremo/2004/result
tour-de-suisse/1997/stage-6
tirreno-adriatico/2005/stage-2
tirreno-adriatico/1984/stage-5
vuelta-a-espana/1994/stage-3
vuelta-a-espana/1993/stage-21
vuelta-a-espana/1980/stage-19
giro-d-italia/1999/stage-8
tour-de-suisse/1983/stage-5b
vuelta-a-espana/1988/stage-21
vuelta-a-espana/1984/stage-11
tour-de-romandie/2009/stage-1
tirreno-adriatico/2005/stage-6
paris-nice/1985/stage-6
volta-a-catalunya/2004/stage-6
giro-d-italia/2004/stage-18
volta-a-catalunya/2012/stage-2
tour-de-france/1997/stage-15
tour-de-france/2000/stage-8
tour-de-france/1981/stage-2
volta-a-catalunya/1993/prologue
tirreno-adriatico/1999/stage-4
tour-de-france/1981/stage-15
tour-de-suisse/1998/stage-1
dauphine/1983/stage-6
la-fleche-wallone/2015/result
e3-harelbeke/2

In [36]:
# Check that all rows of the same '_url' share one 'uci_points' value
nonNull_dataset = dataset.dropna(subset=['uci_points']) # Drop rows with NaN values in 'uci_points' column

# Group once by url instead of re-filtering the whole frame for every url.
# sort=False keeps the urls in order of first appearance; groupby skips NaN urls.
for url, values in nonNull_dataset.groupby('_url', sort=False)['uci_points']:
    distinct_values = values.unique()
    if len(distinct_values) != 1:
        print(url, distinct_values)

## Check on 'length' data

In [79]:
# Report rows whose non-null 'length' value, rendered as text, does not end with '.0'
for race_url, length_value in zip(dataset['_url'], dataset['length']):
    if pd.isnull(length_value):
        continue
    if not str(length_value).endswith('.0'):
        print(race_url, length_value)

tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
tour-de-suisse/1992/stage-4 32200.000000000004
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
amstel-gold-race/2009/result 258600.00000000003
am

In [85]:
# Report rows whose non-null 'length' value is not made only of digits once the '.' is removed
for race_url, length_value in zip(dataset['_url'], dataset['length']):
    if pd.isnull(length_value):
        continue
    # Delete '.'
    digits_only = str(length_value).replace('.', '')
    if not digits_only.isdigit():
        print(race_url, length_value)

In [28]:
# Keep one row per distinct ('_url', 'length') pair
unique_data = dataset.drop_duplicates(subset=['_url', 'length'])

# Print the '_url' of every remaining row whose 'length' is NaN
for race_url in unique_data.loc[unique_data['length'].isna(), '_url']:
    print(race_url)

In [17]:
# Report each distinct ('_url', 'length') pair whose 'length' is more than n
unique_data = dataset.dropna(subset=['length']).drop_duplicates(subset=['_url', 'length']) # Get unique data based on '_url' and 'length'
n = 300000
above_threshold = unique_data[unique_data['length'] > n]
for race_url, race_length in zip(above_threshold['_url'], above_threshold['length']):
    print(race_url, race_length)

milano-sanremo/2020/result 305000.0
tour-de-france/1984/stage-9 338000.0
tour-de-france/1984/stage-21 320500.0
tour-de-france/1990/stage-5 301000.0


In [37]:
# Check that all rows of the same '_url' share one 'length' value
nonNull_dataset = dataset.dropna(subset=['length']) # Drop rows with NaN values in 'length' column

# Group once by url instead of re-filtering the whole frame for every url.
# sort=False keeps the urls in order of first appearance; groupby skips NaN urls.
for url, values in nonNull_dataset.groupby('_url', sort=False)['length']:
    distinct_values = values.unique()
    if len(distinct_values) != 1:
        print(url, distinct_values)

## Check on 'climb_total' data

In [86]:
# Report rows whose non-null 'climb_total' value, rendered as text, does not end with '.0'
for race_url, climb_value in zip(dataset['_url'], dataset['climb_total']):
    if pd.isnull(climb_value):
        continue
    if not str(climb_value).endswith('.0'):
        print(race_url, climb_value)

In [87]:
# Check that every non-null 'climb_total' value is a non-negative whole number.
# The test is numeric rather than textual: dropping the last two characters of
# str(value) and calling isdigit() would accept a value such as 12.5 (leaving '12').
climb_total = dataset['climb_total']
invalid = climb_total.notna() & ((climb_total < 0) | (climb_total % 1 != 0))
for url, value in zip(dataset.loc[invalid, '_url'], climb_total[invalid]):
    print(url, value)

In [30]:
# Keep one row per distinct ('_url', 'climb_total') pair
unique_data = dataset.drop_duplicates(subset=['_url', 'climb_total'])

# Print the '_url' of every remaining row whose 'climb_total' is NaN
for race_url in unique_data.loc[unique_data['climb_total'].isna(), '_url']:
    print(race_url)

volta-a-catalunya/1999/prologue
volta-a-catalunya/1981/stage-2b
paris-nice/1994/stage-8b
tirreno-adriatico/1984/stage-1
dauphine/1991/stage-2
ronde-van-vlaanderen/1980/result
milano-sanremo/2004/result
tour-de-suisse/1997/stage-6
tirreno-adriatico/2005/stage-2
tirreno-adriatico/1984/stage-5
vuelta-a-espana/1980/stage-19
giro-d-italia/1999/stage-8
tour-de-suisse/1983/stage-5b
vuelta-a-espana/1984/stage-11
tour-de-romandie/2009/stage-1
tirreno-adriatico/2005/stage-6
paris-nice/1985/stage-6
volta-a-catalunya/2004/stage-6
volta-a-catalunya/2012/stage-2
volta-a-catalunya/1993/prologue
tirreno-adriatico/1999/stage-4
tour-de-suisse/1998/stage-1
dauphine/1983/stage-6
e3-harelbeke/2002/result
tirreno-adriatico/1994/stage-6
itzulia-basque-country/2007/stage-3
vuelta-a-espana/1980/stage-18
tirreno-adriatico/1996/stage-4
tour-de-suisse/1986/stage-2
tour-de-suisse/1987/stage-4
vuelta-a-espana/1980/stage-8
liege-bastogne-liege/1999/result
tour-de-romandie/1989/stage-6
giro-d-italia/1998/stage-3
pari

In [24]:
# Report each distinct ('_url', 'climb_total') pair whose 'climb_total' is less than n
unique_data = dataset.dropna(subset=['climb_total']).drop_duplicates(subset=['_url', 'climb_total']) # Get unique data based on '_url' and 'climb_total'
n = 5
below_threshold = unique_data[unique_data['climb_total'] < n]
for race_url, race_climb in zip(below_threshold['_url'], below_threshold['climb_total']):
    print(race_url, race_climb)

giro-d-italia/2020/stage-21 3.0
tirreno-adriatico/2023/stage-1 3.0
tirreno-adriatico/2015/stage-1 2.0
tirreno-adriatico/2022/stage-1 4.0


In [38]:
# Check that all rows of the same '_url' share one 'climb_total' value
nonNull_dataset = dataset.dropna(subset=['climb_total']) # Drop rows with NaN values in 'climb_total' column

# Group once by url instead of re-filtering the whole frame for every url.
# sort=False keeps the urls in order of first appearance; groupby skips NaN urls.
for url, values in nonNull_dataset.groupby('_url', sort=False)['climb_total']:
    distinct_values = values.unique()
    if len(distinct_values) != 1:
        print(url, distinct_values)

## Check on 'profile' data

In [89]:
# Report rows whose non-null 'profile' value, rendered as text, does not end with '.0'
for race_url, profile_value in zip(dataset['_url'], dataset['profile']):
    if pd.isnull(profile_value):
        continue
    if not str(profile_value).endswith('.0'):
        print(race_url, profile_value)

In [90]:
# Check that every non-null 'profile' value is a non-negative whole number.
# The test is numeric rather than textual: dropping the last two characters of
# str(value) and calling isdigit() would accept a value such as 2.5 (leaving '2').
profile = dataset['profile']
invalid = profile.notna() & ((profile < 0) | (profile % 1 != 0))
for url, value in zip(dataset.loc[invalid, '_url'], profile[invalid]):
    print(url, value)

In [31]:
# Keep one row per distinct ('_url', 'profile') pair
unique_data = dataset.drop_duplicates(subset=['_url', 'profile'])

# Print the '_url' of every remaining row whose 'profile' is NaN
for race_url in unique_data.loc[unique_data['profile'].isna(), '_url']:
    print(race_url)

volta-a-catalunya/1999/prologue
volta-a-catalunya/1981/stage-2b
paris-nice/1994/stage-8b
tirreno-adriatico/1984/stage-1
dauphine/1991/stage-2
ronde-van-vlaanderen/1980/result
milano-sanremo/2004/result
tour-de-suisse/1997/stage-6
tirreno-adriatico/1984/stage-5
vuelta-a-espana/1994/stage-3
vuelta-a-espana/1993/stage-21
vuelta-a-espana/1980/stage-19
tour-de-suisse/1983/stage-5b
vuelta-a-espana/1988/stage-21
vuelta-a-espana/1984/stage-11
tour-de-romandie/2009/stage-1
paris-nice/1985/stage-6
volta-a-catalunya/1993/prologue
tirreno-adriatico/1999/stage-4
tour-de-suisse/1998/stage-1
dauphine/1983/stage-6
tirreno-adriatico/1994/stage-6
itzulia-basque-country/2007/stage-3
itzulia-basque-country/2012/stage-1
vuelta-a-espana/1988/stage-18
vuelta-a-espana/1980/stage-18
vuelta-a-espana/1994/stage-8
vuelta-a-espana/1993/stage-4
tirreno-adriatico/1996/stage-4
tour-de-suisse/1986/stage-2
tour-de-suisse/1987/stage-4
vuelta-a-espana/1980/stage-8
liege-bastogne-liege/1999/result
tour-de-romandie/1989/st

In [39]:
# Check that all rows of the same '_url' share one 'profile' value
nonNull_dataset = dataset.dropna(subset=['profile']) # Drop rows with NaN values in 'profile' column

# Group once by url instead of re-filtering the whole frame for every url.
# sort=False keeps the urls in order of first appearance; groupby skips NaN urls.
for url, values in nonNull_dataset.groupby('_url', sort=False)['profile']:
    distinct_values = values.unique()
    if len(distinct_values) != 1:
        print(url, distinct_values)

## Check on 'startlist_quality' data

In [93]:
# Check that every non-null 'startlist_quality' value is a whole number.
# The remainder test works for both integer and float columns; calling
# .is_integer() on each value fails for Python ints on interpreters older than 3.12.
startlist_quality = dataset['startlist_quality']
non_integer = startlist_quality.notna() & (startlist_quality % 1 != 0)
for url, value in zip(dataset.loc[non_integer, '_url'], startlist_quality[non_integer]):
    print(url, value)

In [40]:
# Check that all rows of the same '_url' share one 'startlist_quality' value
nonNull_dataset = dataset.dropna(subset=['startlist_quality']) # Drop rows with NaN values in 'startlist_quality' column

# Group once by url instead of re-filtering the whole frame for every url.
# sort=False keeps the urls in order of first appearance; groupby skips NaN urls.
for url, values in nonNull_dataset.groupby('_url', sort=False)['startlist_quality']:
    distinct_values = values.unique()
    if len(distinct_values) != 1:
        print(url, distinct_values)

## Check on 'average_temperature' data

In [94]:
# Report rows whose non-null 'average_temperature' value, rendered as text, does not end with '.0'
for race_url, temperature in zip(dataset['_url'], dataset['average_temperature']):
    if pd.isnull(temperature):
        continue
    if not str(temperature).endswith('.0'):
        print(race_url, temperature)

In [95]:
# Check that every non-null 'average_temperature' value is a non-negative whole number.
# The test is numeric rather than textual: dropping the last two characters of
# str(value) and calling isdigit() would accept a value such as 12.5 (leaving '12').
average_temperature = dataset['average_temperature']
invalid = average_temperature.notna() & ((average_temperature < 0) | (average_temperature % 1 != 0))
for url, value in zip(dataset.loc[invalid, '_url'], average_temperature[invalid]):
    print(url, value)

In [33]:
# Keep one row per distinct ('_url', 'average_temperature') pair
unique_data = dataset.drop_duplicates(subset=['_url', 'average_temperature'])

# Print the '_url' of every remaining row whose 'average_temperature' is NaN
for race_url in unique_data.loc[unique_data['average_temperature'].isna(), '_url']:
    print(race_url)

tour-de-france/1978/stage-6
vuelta-a-espana/2016/stage-14
tour-de-france/2019/stage-21
volta-a-catalunya/1999/prologue
vuelta-a-espana/2017/stage-20
tour-de-france/1978/stage-14
volta-a-catalunya/2018/stage-7
volta-a-catalunya/1981/stage-2b
paris-nice/1994/stage-8b
tirreno-adriatico/1984/stage-1
dauphine/1991/stage-2
ronde-van-vlaanderen/1980/result
volta-a-catalunya/2015/stage-4
milano-sanremo/2004/result
tour-de-suisse/1997/stage-6
tirreno-adriatico/2005/stage-2
giro-d-italia/2019/stage-14
tirreno-adriatico/1984/stage-5
vuelta-a-espana/1994/stage-3
vuelta-a-espana/1993/stage-21
tour-de-suisse/2021/stage-4
gran-camino/2022/stage-2
vuelta-a-espana/1980/stage-19
giro-d-italia/1999/stage-8
tour-de-suisse/1983/stage-5b
vuelta-a-espana/1988/stage-21
volta-a-catalunya/2022/stage-3
vuelta-a-espana/1984/stage-11
tour-de-romandie/2009/stage-1
tirreno-adriatico/2005/stage-6
paris-nice/1985/stage-6
volta-a-catalunya/2004/stage-6
giro-d-italia/2004/stage-18
volta-a-catalunya/2012/stage-2
vuelta-a

In [41]:
# Check that all rows of the same '_url' share one 'average_temperature' value
nonNull_dataset = dataset.dropna(subset=['average_temperature']) # Drop rows with NaN values in 'average_temperature' column

# Group once by url instead of re-filtering the whole frame for every url.
# sort=False keeps the urls in order of first appearance; groupby skips NaN urls.
for url, values in nonNull_dataset.groupby('_url', sort=False)['average_temperature']:
    distinct_values = values.unique()
    if len(distinct_values) != 1:
        print(url, distinct_values)

## Check on 'date' data

In [98]:
# Check that every non-null 'date' is exactly in the format yyyy-mm-dd hh:mm:ss.
# re.fullmatch is used so that text after the seconds makes the value invalid;
# re.match only anchors the start of the string and would accept trailing characters.
for url, date in zip(dataset['_url'], dataset['date']):
    if pd.isnull(date):
        continue
    if not re.fullmatch(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', date):
        print(url, date)

In [42]:
# Check that all rows of the same '_url' share one calendar day (the time of day is ignored)
nonNull_dataset = dataset.dropna(subset=['date']) # Drop rows with NaN values in 'date' column

# Keep only the part of 'date' before the first space, i.e. the yyyy-mm-dd day
days = nonNull_dataset['date'].str.split(' ').str[0]

# Group once by url instead of re-filtering the whole frame for every url.
# sort=False keeps the urls in order of first appearance; groupby skips NaN urls.
for url, url_days in days.groupby(nonNull_dataset['_url'], sort=False):
    distinct_days = url_days.unique()
    if len(distinct_days) != 1:
        print(url, distinct_days)

## Check on 'position' data

In [100]:
# Check that every non-null 'position' value is a whole number.
# The remainder test works for both integer and float columns; calling
# .is_integer() on each value fails for Python ints on interpreters older than 3.12.
position = dataset['position']
non_integer = position.notna() & (position % 1 != 0)
for url, value in zip(dataset.loc[non_integer, '_url'], position[non_integer]):
    print(url, value)

In [112]:
# For each '_url', check that its 'position' values are exactly 0, 1, ..., max in row order.
# np.array_equal returns False when the lengths differ (missing or repeated positions),
# whereas an element-wise == between arrays of different length raises an error.
# Grouping once by url avoids re-filtering the whole frame for every url.
for url, positions in dataset.groupby('_url', sort=False)['position']:
    expected = np.arange(positions.max() + 1)
    if not np.array_equal(positions.to_numpy(), expected):
        print(url, positions)

## Check on 'cyclist' data

In [None]:
# Check that every non-null 'cyclist' appears in the '_url' column of the cyclists file.
# NOTE(review): the path follows the "../data/" location used for races.csv — confirm.
cyclist_csv_file = "../data/cyclists.csv"
cyclist_dataset = pd.read_csv(cyclist_csv_file)

# One vectorised membership test instead of scanning the cyclists column for every row
unknown = dataset['cyclist'].notna() & ~dataset['cyclist'].isin(cyclist_dataset['_url'])
for url, cyclist in zip(dataset.loc[unknown, '_url'], dataset.loc[unknown, 'cyclist']):
    print(url, cyclist)

## Check on 'cyclist_age' data

In [96]:
# Report rows whose non-null 'cyclist_age' value, rendered as text, does not end with '.0'
for race_url, age_value in zip(dataset['_url'], dataset['cyclist_age']):
    if pd.isnull(age_value):
        continue
    if not str(age_value).endswith('.0'):
        print(race_url, age_value)

In [97]:
# Check that every non-null 'cyclist_age' value is a non-negative whole number.
# The test is numeric rather than textual: dropping the last two characters of
# str(value) and calling isdigit() would accept a value such as 22.5 (leaving '22').
cyclist_age = dataset['cyclist_age']
invalid = cyclist_age.notna() & ((cyclist_age < 0) | (cyclist_age % 1 != 0))
for url, value in zip(dataset.loc[invalid, '_url'], cyclist_age[invalid]):
    print(url, value)

In [34]:
# Keep one row per distinct ('_url', 'cyclist_age') pair
unique_data = dataset.drop_duplicates(subset=['_url', 'cyclist_age'])

# Print the '_url' of every remaining row whose 'cyclist_age' is NaN
for race_url in unique_data.loc[unique_data['cyclist_age'].isna(), '_url']:
    print(race_url)

giro-d-italia/2019/stage-14
vuelta-a-espana/2020/stage-7
amstel-gold-race/2018/result
vuelta-a-espana/2020/stage-11
paris-nice/2019/stage-3
ronde-van-vlaanderen/1990/result
giro-d-italia/2019/stage-16
la-fleche-wallone/1997/result
vuelta-a-espana/1985/stage-12
vuelta-a-espana/1985/stage-19
vuelta-a-espana/2020/stage-4
giro-d-italia/2019/stage-9
giro-d-italia/2019/stage-18
tour-de-romandie/1993/stage-1
vuelta-a-espana/1985/stage-18
vuelta-a-espana/1985/stage-5
paris-nice/2019/stage-1
giro-d-italia/2019/stage-5
dauphine/2000/prologue
giro-d-italia/2019/stage-3
giro-d-italia/2019/stage-13
vuelta-a-espana/1985/stage-17
vuelta-a-espana/1985/prologue
giro-d-italia/2019/stage-6
vuelta-a-espana/2020/stage-18
giro-d-italia/2019/stage-8
giro-d-italia/2019/stage-10
vuelta-a-espana/2020/stage-5
vuelta-a-espana/1985/stage-16
vuelta-a-espana/1986/stage-1
paris-nice/2019/stage-5
giro-d-italia/2019/stage-12
vuelta-a-espana/1985/stage-8
tour-de-suisse/2018/stage-5
giro-d-italia/2019/stage-2
san-sebasti

## Check on 'is_tarmac' data

In [None]:
# Report rows whose non-null 'is_tarmac' value is not a boolean
for race_url, tarmac_flag in zip(dataset['_url'], dataset['is_tarmac']):
    if pd.isnull(tarmac_flag) or isinstance(tarmac_flag, bool):
        continue
    print(race_url, tarmac_flag)

In [43]:
# Check that all rows of the same '_url' share one 'is_tarmac' value
nonNull_dataset = dataset.dropna(subset=['is_tarmac']) # Drop rows with NaN values in 'is_tarmac' column

# Group once by url instead of re-filtering the whole frame for every url.
# sort=False keeps the urls in order of first appearance; groupby skips NaN urls.
for url, values in nonNull_dataset.groupby('_url', sort=False)['is_tarmac']:
    distinct_values = values.unique()
    if len(distinct_values) != 1:
        print(url, distinct_values)

## Check on 'is_cobbled' data

In [None]:
# Report rows whose non-null 'is_cobbled' value is not a boolean
for race_url, cobbled_flag in zip(dataset['_url'], dataset['is_cobbled']):
    if pd.isnull(cobbled_flag) or isinstance(cobbled_flag, bool):
        continue
    print(race_url, cobbled_flag)

In [44]:
# Check that all rows of the same '_url' share one 'is_cobbled' value
nonNull_dataset = dataset.dropna(subset=['is_cobbled']) # Drop rows with NaN values in 'is_cobbled' column

# Group once by url instead of re-filtering the whole frame for every url.
# sort=False keeps the urls in order of first appearance; groupby skips NaN urls.
for url, values in nonNull_dataset.groupby('_url', sort=False)['is_cobbled']:
    distinct_values = values.unique()
    if len(distinct_values) != 1:
        print(url, distinct_values)

## Check on 'is_gravel' data

In [None]:
# Report rows whose non-null 'is_gravel' value is not a boolean
for race_url, gravel_flag in zip(dataset['_url'], dataset['is_gravel']):
    if pd.isnull(gravel_flag) or isinstance(gravel_flag, bool):
        continue
    print(race_url, gravel_flag)

In [45]:
# Check that all rows of the same '_url' share one 'is_gravel' value
nonNull_dataset = dataset.dropna(subset=['is_gravel']) # Drop rows with NaN values in 'is_gravel' column

# Group once by url instead of re-filtering the whole frame for every url.
# sort=False keeps the urls in order of first appearance; groupby skips NaN urls.
for url, values in nonNull_dataset.groupby('_url', sort=False)['is_gravel']:
    distinct_values = values.unique()
    if len(distinct_values) != 1:
        print(url, distinct_values)

## Check on 'cyclist_team' data

In [114]:
# Check that every non-null 'cyclist_team' has the format team-year, ending with a four-digit year.
# re.fullmatch is used so the four digits must close the string;
# re.match only anchors the start and would accept characters after the year.
for url, team in zip(dataset['_url'], dataset['cyclist_team']):
    if pd.isnull(team):
        continue
    if not re.fullmatch(r'.+-\d{4}', team):
        print(url, team)

In [116]:
# Report rows whose non-null 'cyclist_team' does not have digits as its last four characters
for race_url, team in zip(dataset['_url'], dataset['cyclist_team']):
    if pd.isnull(team):
        continue
    year_suffix = team[-4:]
    if not year_suffix.isdigit():
        print(race_url, team)

## Check on 'delta' data

In [19]:
# Report rows whose non-null 'delta' value, rendered as text, does not end with '.0'
for race_url, delta_value in zip(dataset['_url'], dataset['delta']):
    if pd.isnull(delta_value):
        continue
    if not str(delta_value).endswith('.0'):
        print(race_url, delta_value)

In [18]:
# Report rows whose non-null 'delta', once its last two characters are removed, is not made only of digits
for race_url, delta_value in zip(dataset['_url'], dataset['delta']):
    if pd.isnull(delta_value):
        continue
    # Delete last two char from 'delta'
    delta = str(delta_value)[:-2]
    if not delta.isdigit():
        print(race_url, delta_value)

vuelta-a-espana/1992/stage-19 -2635.0
vuelta-a-espana/1992/stage-19 -2638.0
vuelta-a-espana/1992/stage-19 -2541.0
vuelta-a-espana/1992/stage-19 -2542.0
vuelta-a-espana/1992/stage-19 -2545.0
vuelta-a-espana/1992/stage-19 -2546.0
vuelta-a-espana/1992/stage-19 -2550.0
vuelta-a-espana/1992/stage-19 -2560.0
vuelta-a-espana/1992/stage-19 -2564.0
vuelta-a-espana/1992/stage-19 -2567.0
vuelta-a-espana/1992/stage-19 -2567.0
vuelta-a-espana/1992/stage-19 -2574.0
vuelta-a-espana/1992/stage-19 -2469.0
vuelta-a-espana/1992/stage-19 -2473.0
vuelta-a-espana/1992/stage-19 -2475.0
vuelta-a-espana/1992/stage-19 -2477.0
vuelta-a-espana/1992/stage-19 -2479.0
vuelta-a-espana/1992/stage-19 -2481.0
vuelta-a-espana/1992/stage-19 -2482.0
vuelta-a-espana/1992/stage-19 -2485.0
vuelta-a-espana/1992/stage-19 -2486.0
vuelta-a-espana/1992/stage-19 -2486.0
vuelta-a-espana/1992/stage-19 -2487.0
vuelta-a-espana/1992/stage-19 -2500.0
vuelta-a-espana/1992/stage-19 -2504.0
vuelta-a-espana/1992/stage-19 -2505.0
vuelta-a-esp

In [23]:
# Report every row whose 'delta' is less than 0.
# NaN compares as False, so missing values are skipped without an explicit null check.
# This cell uses only `dataset`; it does not rely on variables left behind by other cells.
negative = dataset['delta'] < 0
for url, value in zip(dataset.loc[negative, '_url'], dataset.loc[negative, 'delta']):
    print(url, value)

vuelta-a-espana/1992/stage-19 -2635.0
vuelta-a-espana/1992/stage-19 -2638.0
vuelta-a-espana/1992/stage-19 -2541.0
vuelta-a-espana/1992/stage-19 -2542.0
vuelta-a-espana/1992/stage-19 -2545.0
vuelta-a-espana/1992/stage-19 -2546.0
vuelta-a-espana/1992/stage-19 -2550.0
vuelta-a-espana/1992/stage-19 -2560.0
vuelta-a-espana/1992/stage-19 -2564.0
vuelta-a-espana/1992/stage-19 -2567.0
vuelta-a-espana/1992/stage-19 -2567.0
vuelta-a-espana/1992/stage-19 -2574.0
vuelta-a-espana/1992/stage-19 -2469.0
vuelta-a-espana/1992/stage-19 -2473.0
vuelta-a-espana/1992/stage-19 -2475.0
vuelta-a-espana/1992/stage-19 -2477.0
vuelta-a-espana/1992/stage-19 -2479.0
vuelta-a-espana/1992/stage-19 -2481.0
vuelta-a-espana/1992/stage-19 -2482.0
vuelta-a-espana/1992/stage-19 -2485.0
vuelta-a-espana/1992/stage-19 -2486.0
vuelta-a-espana/1992/stage-19 -2486.0
vuelta-a-espana/1992/stage-19 -2487.0
vuelta-a-espana/1992/stage-19 -2500.0
vuelta-a-espana/1992/stage-19 -2504.0
vuelta-a-espana/1992/stage-19 -2505.0
vuelta-a-esp