# Checks of races data

In [None]:
# Enable IPython's autoreload extension in mode 2, so that helper modules edited
# *outside* the notebook are re-imported automatically before each cell runs
# (no kernel restart needed).
%load_ext autoreload
%autoreload 2

In [100]:
import pandas as pd
import numpy as np
import re
import unicodedata

We load the dataset from a CSV file and display the first few rows to get an initial understanding of the data. This helps us verify that the data has been loaded correctly and gives us a glimpse of its structure and contents.

In [None]:
# Location of the races table, relative to the notebook
csv_file = "../data/races.csv"
# Read the CSV into memory and preview its first five rows
dataset = pd.read_csv(filepath_or_buffer=csv_file)
dataset.head(n=5)

Create a dataset without the personal information of the cyclists, taking only one row per race

In [None]:
# Build the race-level table in one chain:
#   1. drop the per-cyclist columns,
#   2. keep only the yyyy-mm-dd part of 'date' (text before the first space),
#   3. remove the rows that became identical.
races_info = (
    dataset
    .drop(columns=['position', 'cyclist', 'cyclist_age', 'cyclist_team', 'delta'])
    .assign(date=lambda frame: frame['date'].str.split(' ').str[0])
    .drop_duplicates()
)

# Preview the result
races_info.head()

Create dataset from the union of the cyclists and the races data 

In [None]:
# Load both tables and inner-join them: a cyclist row matches a race row when
# the cyclist's '_url' equals the race row's 'cyclist' value.
dataset_cyclists = pd.read_csv("../data/cyclists.csv")
dataset_races = pd.read_csv("../data/races.csv")
merged_dataset = (
    dataset_cyclists
    .merge(dataset_races, left_on='_url', right_on='cyclist', how='inner')
    # Columns present in both tables get '_x' (cyclists) / '_y' (races) suffixes:
    # give them explicit names instead.
    .rename(columns={
        '_url_x': '_url_cyclist',
        '_url_y': '_url_race',
        'name_x': 'name_cyclist',
        'name_y': 'name_race',
    })
    # Keep only the year-month-day part of 'date' (drop the time)
    .assign(date=lambda frame: frame['date'].str.split(' ').str[0])
)

merged_dataset.head()


## Initial Info

Now we provide a concise summary of the DataFrame, including the number of non-null entries, data types of each column, and memory usage. It helps us quickly identify missing values and understand the overall structure of the dataset.

In [None]:
# Column dtypes, non-null counts and memory usage of the races table
dataset.info()

We also generate descriptive statistics for the numerical columns in the DataFrame. They include metrics such as count, mean, standard deviation, minimum, and maximum values, as well as the 25th, 50th, and 75th percentiles. This summary helps us understand the distribution and central tendency of the data.

In [None]:
# Summary statistics of the numerical columns of the races table
dataset.describe()

## Check on '_url' data

We start considering the `_url` column, and check the number of null values and the count the occurrences of each unique value

In [None]:
# Absolute number of nulls in '_url' and their share of all rows (two decimals)
print(
    'Total number of null values in _url column: ', dataset['_url'].isna().sum(),
    ' (', round(dataset['_url'].isna().sum() / dataset.shape[0] * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct '_url' value
url_counts = dataset['_url'].value_counts()
print('\nCount occurrences of each value in _url column:')
print(url_counts)

We have lots of different values, but no null values

In this block we check if there are `_url` values that are not in the form name/year/stage

In [None]:
# Split url by '/' into name, year and stage (kept for inspection of the parts)
url_split = dataset['_url'].str.split('/', expand=True)
# A url is well formed when it has at least three '/'-separated parts and the
# second part (the year) consists of digits only. The check is done with an
# anchored regex on the url itself: `na=False` guarantees a strict boolean mask,
# so negating it cannot fail on missing values, and no column of `url_split`
# has to exist for the check to run.
well_formed = dataset['_url'].str.match(r'[^/]*/\d+/', na=False)
invalid_rows = dataset[~well_formed]
print('Number of invalid URLs: ' + str(len(invalid_rows)))

In [None]:
# TODO: ask about the races that have only one or very few participants

url_counts = dataset['_url'].value_counts()

# From url_counts, show the urls that occur fewer than 6 times (i.e. races with at most 5 rows)
print(url_counts[url_counts < 6])

## Check on 'name' data

Now we consider the `name` column, and check the number of null values and the count the occurrences of each unique value

In [None]:
# Absolute number of nulls in 'name' and their share of all rows (two decimals)
print(
    'Total number of null values in name column: ', dataset['name'].isna().sum(),
    ' (', round(dataset['name'].isna().sum() / dataset.shape[0] * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct race name
name_counts = dataset['name'].value_counts()
print('\n\nCount occurrences of each value in name column:')
print(name_counts)

We have different values, but no null values

Since it's small, we print all the values

In [None]:
# Distinct race names in alphabetical order
sorted_names = name_counts.index.sort_values()
print(sorted_names)

For each url, check if all the `name` values are the same


In [None]:
# Number of distinct names recorded under each race url
name_uniques = dataset.groupby('_url')['name'].nunique()
# Urls whose rows disagree on the name
multiple_names_urls = name_uniques.loc[name_uniques.gt(1)].index

print('Number of URLs with more than one unique name: ', len(multiple_names_urls), sep='')

In this block we are checking if there are `name` values that contains any incorrect numbers

In [None]:
# Rows whose 'name' contains a digit, ignoring names that contain 'E3'
# (some races legitimately have E3 in their name)
has_digit = dataset['name'].str.contains(r'\d')
is_e3_race = dataset['name'].str.contains('E3')
invalid_rows = dataset.loc[has_digit & ~is_e3_race]
print('Number of invalid names: ', len(invalid_rows), sep='')

## Check on 'points' data

Now we consider the `points` column, and check the number of null values and count the occurrences of each unique value

In [None]:
# Absolute number of nulls in 'points' and their share of all rows (two decimals)
print(
    'Total number of null values in points column: ', dataset['points'].isna().sum(),
    ' (', round(dataset['points'].isna().sum() / dataset.shape[0] * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct 'points' value
point_counts = dataset['points'].value_counts()
print('\nCount occurrences of each value in points column:')
print(point_counts)

We do not have many distinct values, and only a few null values. Also, we see that every value is syntactically correct

For each url, check if all the `points` values are the same

In [None]:
# Group by '_url' and count the distinct values in the 'points' column.
# dropna=False makes a missing value count as a value of its own: 'points'
# contains nulls, and by default nunique() ignores them, so a url mixing a
# real value with nulls would wrongly be reported as consistent.
points_uniques = dataset.groupby('_url')['points'].nunique(dropna=False)
# Filter the URLs with more than one unique points value
multiple_points_urls = points_uniques[points_uniques > 1].index

print('Number of URLs with more than one unique points: ' + str(len(multiple_points_urls)))

We check the urls where `points` is null

In [None]:
# One row per distinct ('_url', 'points') pair
unique_data = dataset.drop_duplicates(subset=['_url', 'points'])
# Of those, keep the pairs where 'points' is missing
rows = unique_data.loc[unique_data['points'].isna()]

print('Number of rows with null points: ', len(rows), sep='')
print('\nURLs of the rows with null points:')
print(rows['_url'])

## Check on 'uci_points' data

Now we consider the `uci_points` column, and check the number of null values and the count the occurrences of each unique value

In [None]:
# Absolute number of nulls in 'uci_points' and their share of all rows (two decimals)
print(
    'Total number of null values in uci_points column: ', dataset['uci_points'].isna().sum(),
    ' (', round(dataset['uci_points'].isna().sum() / dataset.shape[0] * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct 'uci_points' value
uci_point_counts = dataset['uci_points'].value_counts()
print('\nCount occurrences of each value in uci_points column:')
print(uci_point_counts)

We have different values, but a lot of null values. Also, we see that every value is syntactically correct

For each url, check if all the `uci_points` values are the same

In [None]:
# Group by '_url' and count the distinct values in the 'uci_points' column.
# dropna=False makes a missing value count as a value of its own: 'uci_points'
# contains many nulls, and by default nunique() ignores them, so a url mixing
# a real value with nulls would wrongly be reported as consistent.
uci_points_uniques = dataset.groupby('_url')['uci_points'].nunique(dropna=False)
# Filter the URLs with more than one unique uci_points value
multiple_uci_points_urls = uci_points_uniques[uci_points_uniques > 1].index

print('Number of URLs with more than one unique uci_points: ' + str(len(multiple_uci_points_urls)))

We check the urls where `uci_points` is null

In [None]:
# One row per distinct ('_url', 'uci_points') pair
unique_data = dataset.drop_duplicates(subset=['_url', 'uci_points'])
# Of those, keep the pairs where 'uci_points' is missing
rows = unique_data.loc[unique_data['uci_points'].isna()]

print('Number of rows with null uci_points: ', len(rows), sep='')
print('\nURLs of the rows with null uci_points:')
print(rows['_url'])

## Check on 'length' data

Now we consider the `length` column, and check the number of null values and the count the occurrences of each unique value

In [None]:
# Absolute number of nulls in 'length' and their share of all rows (two decimals)
print(
    'Total number of null values in length column: ', dataset['length'].isna().sum(),
    ' (', round(dataset['length'].isna().sum() / dataset.shape[0] * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct 'length' value
length_counts = dataset['length'].value_counts()
print('\nCount occurrences of each value in length column:')
print(length_counts)

We have a lot of values, but no null values

For each url, check if all the `length` values are the same

In [None]:
# Number of distinct lengths recorded under each race url
length_uniques = dataset.groupby('_url')['length'].nunique()
# Urls whose rows disagree on the length
multiple_length_urls = length_uniques.loc[length_uniques.gt(1)].index

print('Number of URLs with more than one unique length: ', len(multiple_length_urls), sep='')

Since we have a lot of values, we check if every value is syntactically correct

In [None]:
# Get rows where 'length', written as text, is not a plain decimal number.
# regex=False removes a literal dot (on pandas < 2.0 the default treated '.'
# as a regex wildcard and erased every character); n=1 removes only the first
# dot, so a malformed value with several dots is still reported.
length_text = dataset['length'].astype(str).str.replace('.', '', n=1, regex=False)
invalid_rows = dataset[~length_text.str.isdigit()]

print('Number of invalid lengths: ' + str(len(invalid_rows)))

In [None]:
# Rows whose 'length', written as text, does not end with '.0'
ends_with_point_zero = dataset['length'].astype(str).str.endswith('.0')
invalid_rows = dataset.loc[~ends_with_point_zero]

print('Number of invalid lengths: ', len(invalid_rows), sep='')
# List the url and the length of each such row
for url, length in zip(invalid_rows['_url'], invalid_rows['length']):
    print(url, length)

Check the races where the `length` value is small or large, for possible outliers

In [None]:
# Summary statistics of the 'length' column, used below to pick outlier thresholds
dataset['length'].describe()

In [None]:
# Lower threshold for 'length'
n = 2000
# Races (one row per race in races_info) whose length is below the threshold
filtered_data = races_info.loc[races_info['length'].lt(n)]

print('Rows where length is smaller than ', n, ':', sep='')
# Print the row label, the '_url' and the corresponding 'length'
for index, url, length in zip(filtered_data.index, filtered_data['_url'], filtered_data['length']):
    print(index, url, length)

In [None]:
# Upper threshold for 'length'
n = 300000
# Races (one row per race in races_info) whose length is above the threshold
filtered_data = races_info.loc[races_info['length'].gt(n)]

print('Rows where length is greater than ', n, ':', sep='')
# Print the row label, the '_url' and the corresponding 'length'
for index, url, length in zip(filtered_data.index, filtered_data['_url'], filtered_data['length']):
    print(index, url, length)

## Check on 'climb_total' data

Now we consider the `climb_total` column, and check the number of null values and the count the occurrences of each unique value

In [None]:
# Absolute number of nulls in 'climb_total' and their share of all rows (two decimals)
print(
    'Total number of null values in climb_total column: ', dataset['climb_total'].isna().sum(),
    ' (', round(dataset['climb_total'].isna().sum() / dataset.shape[0] * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct 'climb_total' value
climb_total_counts = dataset['climb_total'].value_counts()
print('\nCount occurrences of each value in climb_total column:')
print(climb_total_counts)

We have a lot of different values, and a lot of null values

For each url, check if all the `climb_total` values are the same


In [None]:
# Group by '_url' and count the distinct values in the 'climb_total' column.
# dropna=False makes a missing value count as a value of its own: 'climb_total'
# contains many nulls, and by default nunique() ignores them, so a url mixing
# a real value with nulls would wrongly be reported as consistent.
climb_total_uniques = dataset.groupby('_url')['climb_total'].nunique(dropna=False)
# Filter the URLs with more than one unique climb_total value
multiple_climb_total_urls = climb_total_uniques[climb_total_uniques > 1].index

print('Number of URLs with more than one unique climb_total: ' + str(len(multiple_climb_total_urls)))

Since we have a lot of values, we check if every value is syntactically correct

In [None]:
# Get rows where a non-null 'climb_total', written as text, is not a plain decimal number.
# regex=False removes a literal dot (on pandas < 2.0 the default treated '.'
# as a regex wildcard and erased every character); n=1 removes only the first
# dot, so a malformed value with several dots is still reported.
climb_total_text = dataset['climb_total'].astype(str).str.replace('.', '', n=1, regex=False)
# Null values become the text 'nan': they are excluded here, being counted separately above
invalid_rows = dataset[~climb_total_text.str.isdigit()].dropna(subset=['climb_total'])

print('Number of invalid climb_total: ' + str(len(invalid_rows)))

In [None]:
# Non-null rows whose 'climb_total', written as text, does not end with '.0'
ends_with_point_zero = dataset['climb_total'].astype(str).str.endswith('.0')
invalid_rows = dataset.loc[~ends_with_point_zero & dataset['climb_total'].notna()]

print('Number of invalid climb_total: ', len(invalid_rows), sep='')

Check the races where the `climb_total` value is small or large, for possible outliers

In [None]:
# Summary statistics of the 'climb_total' column, used below to pick outlier thresholds
dataset['climb_total'].describe()

In [None]:
# Lower threshold for 'climb_total'
n = 5
# Races (one row per race in races_info) whose climb_total is below the threshold
filtered_data = races_info.loc[races_info['climb_total'].lt(n)]

print('Rows where climb_total is smaller than ', n, ':', sep='')
# Print the row label, the '_url' and the corresponding 'climb_total'
for index, url, climb in zip(filtered_data.index, filtered_data['_url'], filtered_data['climb_total']):
    print(index, url, climb)

In [None]:
# Upper threshold for 'climb_total'
n = 6000
# Races (one row per race in races_info) whose climb_total is above the threshold
filtered_data = races_info.loc[races_info['climb_total'].gt(n)]

print('Rows where climb_total is greater than ', n, ':', sep='')
# Print the row label, the '_url' and the corresponding 'climb_total'
for index, url, climb in zip(filtered_data.index, filtered_data['_url'], filtered_data['climb_total']):
    print(index, url, climb)

## Check on 'profile' data

Now we consider the `profile` column, and check the number of null values and the count the occurrences of each unique value

In [None]:
# Absolute number of nulls in 'profile' and their share of all rows (two decimals)
print(
    'Total number of null values in profile column: ', dataset['profile'].isna().sum(),
    ' (', round(dataset['profile'].isna().sum() / dataset.shape[0] * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct 'profile' value
profile_counts = dataset['profile'].value_counts()
print('\nCount occurrences of each value in profile column:')
print(profile_counts)

We have few different values, but a lot of null values. Also, we see that every value is syntactically correct

For each url, check if all the `profile` values are the same

In [None]:
# Group by '_url' and count the distinct values in the 'profile' column.
# dropna=False makes a missing value count as a value of its own: 'profile'
# contains many nulls, and by default nunique() ignores them, so a url mixing
# a real value with nulls would wrongly be reported as consistent.
profile_uniques = dataset.groupby('_url')['profile'].nunique(dropna=False)
# Filter the URLs with more than one unique profile value
multiple_profile_urls = profile_uniques[profile_uniques > 1].index

print('Number of URLs with more than one unique profile: ' + str(len(multiple_profile_urls)))

## Check on 'startlist_quality' data

Now we consider the `startlist_quality` column, and check the number of null values and the count the occurrences of each unique value

In [None]:
# Absolute number of nulls in 'startlist_quality' and their share of all rows (two decimals)
print(
    'Total number of null values in startlist_quality column: ', dataset['startlist_quality'].isna().sum(),
    ' (', round(dataset['startlist_quality'].isna().sum() / dataset.shape[0] * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct 'startlist_quality' value
startlist_quality_counts = dataset['startlist_quality'].value_counts()
print('\nCount occurrences of each value in startlist_quality column:')
print(startlist_quality_counts)

We have different values, but no null values.

For each url, check if all the `startlist_quality` values are the same

In [None]:
# Number of distinct startlist_quality values recorded under each race url
startlist_quality_uniques = dataset.groupby('_url')['startlist_quality'].nunique()
# Urls whose rows disagree on the startlist_quality
multiple_startlist_quality_urls = startlist_quality_uniques.loc[startlist_quality_uniques.gt(1)].index

print('Number of URLs with more than one unique startlist_quality: ', len(multiple_startlist_quality_urls), sep='')

Since we have a lot of different values, we check if every value is syntactically correct

In [None]:
# Get rows where 'startlist_quality', written as text, is not a plain (optionally decimal) number.
# regex=False removes a literal dot (on pandas < 2.0 the default treated '.'
# as a regex wildcard and erased every character); n=1 removes only the first
# dot, so a malformed value with several dots is still reported.
startlist_quality_text = dataset['startlist_quality'].astype(str).str.replace('.', '', n=1, regex=False)
invalid_rows = dataset[~startlist_quality_text.str.isdigit()]

print('Number of invalid startlist_quality: ' + str(len(invalid_rows)))

## Check on 'average_temperature' data

Now we consider the `average_temperature` column, and check the number of null values and the count the occurrences of each unique value

In [None]:
# Absolute number of nulls in 'average_temperature' and their share of all rows (two decimals)
print(
    'Total number of null values in average_temperature column: ', dataset['average_temperature'].isna().sum(),
    ' (', round(dataset['average_temperature'].isna().sum() / dataset.shape[0] * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct 'average_temperature' value
average_temperature_counts = dataset['average_temperature'].value_counts()
print('\nCount occurrences of each value in average_temperature column:')
print(average_temperature_counts)

We have different values, and almost all the values are null. Also, we can see that every value is syntactically correct

For each url, check if all the `average_temperature` values are the same

In [None]:
# Group by '_url' and count the distinct values in the 'average_temperature' column.
# dropna=False makes a missing value count as a value of its own: the column
# is mostly null, and by default nunique() ignores nulls, so a url mixing a
# real value with nulls would wrongly be reported as consistent.
average_temperature_uniques = dataset.groupby('_url')['average_temperature'].nunique(dropna=False)
# Filter the URLs with more than one unique average_temperature value
multiple_average_temperature_urls = average_temperature_uniques[average_temperature_uniques > 1].index

print('Number of URLs with more than one unique average_temperature: ' + str(len(multiple_average_temperature_urls)))

## Check on 'date' data

Now we consider the `date` column, and check the number of null values and count the occurrences of each unique value. We do this on the merged dataset, where the time part of the date has been removed

In [None]:
# Absolute number of nulls in 'date' (merged dataset) and their share of its rows (two decimals)
print(
    'Total number of null values in date column: ', merged_dataset['date'].isna().sum(),
    ' (', round(merged_dataset['date'].isna().sum() / merged_dataset.shape[0] * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct day
date_counts = merged_dataset['date'].value_counts()
print('\nCount occurrences of each value in date column:')
print(date_counts)

We have different values, but no null values.

For each url, check if all the `date` values are the same

In [None]:
# Number of distinct dates recorded under each race url (races dataset)
date_uniques = dataset.groupby('_url')['date'].nunique()
# Urls whose rows disagree on the date
multiple_date_urls = date_uniques.loc[date_uniques.gt(1)].index

print('Number of URLs with more than one unique date: ', len(multiple_date_urls), sep='')

Since we have a lot of different values, we check if every value is syntactically correct

In [None]:
# Rows of the races dataset whose 'date' is NOT exactly in the format yyyy-mm-dd hh:mm:ss.
# fullmatch anchors the pattern at both ends (str.match only anchors the start,
# so trailing garbage would pass); na=False reports a missing date as invalid
# instead of breaking the boolean negation.
invalid_rows = dataset[~dataset['date'].str.fullmatch(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', na=False)]
print('Number of invalid dates (format yyyy-mm-dd hh:mm:ss, in the races dataset): ' + str(len(invalid_rows)))

# Rows of the merged dataset whose 'date' is NOT exactly in the format yyyy-mm-dd.
# With fullmatch a value that still carries its time part is reported as well.
invalid_rows = merged_dataset[~merged_dataset['date'].str.fullmatch(r'\d{4}-\d{2}-\d{2}', na=False)]
print('Number of invalid dates (format yyyy-mm-dd, in the merged dataset): ' + str(len(invalid_rows)))

Check if the year is the same in both the `_url` and the `date`

In [None]:
# Break each url on '/' into columns 0 (name), 1 (year), 2 (stage)
url_split = dataset['_url'].str.split('/', expand=True)
# The first four characters of 'date' are the year (format yyyy-mm-dd hh:mm:ss)
date_year = dataset['date'].str.slice(stop=4)
# Rows where the year written in the url differs from the year of the date
mismatched_years = dataset.loc[url_split[1].ne(date_year)]

# Report how many rows disagree
print(f"Number of rows where the year in the url does not match the year in the date: {len(mismatched_years)}")


## Check on 'position' data

Now we consider the `position` column, and check the number of null values and the count the occurrences of each unique value

In [None]:
# Absolute number of nulls in 'position' and their share of all rows (two decimals)
print(
    'Total number of null values in position column: ', dataset['position'].isna().sum(),
    ' (', round(dataset['position'].isna().sum() / dataset.shape[0] * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct 'position' value
position_counts = dataset['position'].value_counts()
print('\nCount occurrences of each value in position column:')
print(position_counts)

We have different values, and no null values.

Since we have different values, we check if every value is syntactically correct

In [None]:
# Get rows where 'position', written as text, is not a plain (optionally decimal) number.
# regex=False removes a literal dot (on pandas < 2.0 the default treated '.'
# as a regex wildcard and erased every character); n=1 removes only the first
# dot, so a malformed value with several dots is still reported.
position_text = dataset['position'].astype(str).str.replace('.', '', n=1, regex=False)
invalid_rows = dataset[~position_text.str.isdigit()]

print('Number of invalid positions: ' + str(len(invalid_rows)))

For each url, check if there are all the `position` values

In [None]:
# Function to check if the positions are exactly 0, 1, ..., max, one after the other
def check_positions(positions):
    """Tell whether `positions` is a gap-free, duplicate-free run starting at 0.

    Parameters
    ----------
    positions : array-like of numbers
        Finishing positions of one race, in any order (Series, array, list, ...).

    Returns
    -------
    bool
        True when the sorted values equal 0, 1, ..., len(positions) - 1.
        That is the same as equalling 0..max, since a run of n distinct
        consecutive values starting at 0 has max n - 1. An empty input has no
        gap or duplicate and yields True. A missing value (NaN) yields False.
    """
    # np.asarray lets plain lists and arrays in, not only objects with .max()
    ordered = np.sort(np.asarray(positions))
    # Comparing against arange(size) avoids computing max(), which is undefined
    # for empty input and made the previous implementation raise.
    return np.array_equal(ordered, np.arange(ordered.size))

# For every race url, flag it (True) when its positions fail check_positions
invalid_urls = dataset.groupby('_url')['position'].apply(
    lambda race_positions: not check_positions(race_positions)
)

# Print how many '_url' values do not satisfy the condition
print('Number of URLs with invalid positions: ', len(invalid_urls.loc[invalid_urls]), sep='')


## Check on 'cyclist' data

Now we consider the `cyclist` column, and check the number of null values and the count the occurrences of each unique value

In [None]:
# Absolute number of nulls in 'cyclist' and their share of all rows (two decimals)
print(
    'Total number of null values in cyclist column: ', dataset['cyclist'].isna().sum(),
    ' (', round(dataset['cyclist'].isna().sum() / dataset.shape[0] * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct cyclist
cyclist_counts = dataset['cyclist'].value_counts()
print('\nCount occurrences of each value in cyclist column:')
print(cyclist_counts)

We have lots of different values, but no null values.

We check if all the cyclists are different in the same race

In [None]:
# For each url, count how many times each cyclist appears
url_cyclist_count = dataset.groupby('_url')['cyclist'].value_counts()
# Keep the (url, cyclist) pairs that occur more than once
invalid_entries = url_cyclist_count[url_cyclist_count > 1]

# Count the distinct urls: the index holds one entry per (url, cyclist) pair,
# so its plain length would count a url once for every duplicated cyclist.
print('Number of URLs with a cyclist appearing more than once: ' + str(invalid_entries.index.get_level_values(0).nunique()))

# List each url together with the cyclist that appears several times in it
for (url, cyclist), count in invalid_entries.items():
    print(f"URL: {url}, Cyclist: {cyclist}, Count: {count}")

Considering the two datasets, we check if all the cyclists in cyclists.csv are in races.csv, and vice versa.

In [None]:
# Race rows whose 'cyclist' value has no matching '_url' in the cyclists dataset
invalid_rows = dataset[~dataset['cyclist'].isin(dataset_cyclists['_url'])]

# Count distinct cyclists, not rows: a cyclist without info appears once per
# race entered, so len(invalid_rows) would overstate the number of cyclists.
print('Number of cyclists with no info: ' + str(invalid_rows['cyclist'].nunique()))

In [None]:
# Cyclists of the cyclists dataset whose '_url' never occurs as 'cyclist' in the races dataset
appears_in_races = dataset_cyclists['_url'].isin(dataset['cyclist'])
invalid_rows = dataset_cyclists.loc[~appears_in_races]

print('Number of cyclists that are not in any race: ', len(invalid_rows), sep='')
# List the url and the name of each of them
for url, name in zip(invalid_rows['_url'], invalid_rows['name']):
    print(url, name)

## Check on 'cyclist_age' data

Now we consider the `cyclist_age` column, and check the number of null values and the count the occurrences of each unique value

In [None]:
# Absolute number of nulls in 'cyclist_age' and their share of all rows (two decimals)
print(
    'Total number of null values in cyclist_age column: ', dataset['cyclist_age'].isna().sum(),
    ' (', round(dataset['cyclist_age'].isna().sum() / len(dataset) * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct age
cyclist_age_counts = dataset['cyclist_age'].value_counts()
print('\nCount occurrences of each value in cyclist_age column:')
print(cyclist_age_counts)

We have different values, and just a few null values. Also, we see that every value is syntactically correct

In [None]:
# Print the race '_url' of every row whose 'cyclist_age' is missing
for url in dataset.loc[dataset['cyclist_age'].isna(), '_url']:
    print(url)

In [None]:
# Merged rows where the age is missing although the cyclists dataset provides a birth year
age_missing = merged_dataset['cyclist_age'].isna()
birth_year_known = merged_dataset['birth_year'].notna()
invalid_rows = merged_dataset.loc[age_missing & birth_year_known]

print('Number of cyclists with age info in cyclists dataset: ', len(invalid_rows), sep='')

Check the races where the `cyclist_age` value is small or large, for possible outliers

In [None]:
# Summary statistics of the 'cyclist_age' column, used below to pick outlier thresholds
dataset['cyclist_age'].describe()

In [None]:
# Lower threshold for 'cyclist_age'
n = 18
# Rows whose cyclist_age is below the threshold
filtered_data = dataset.loc[dataset['cyclist_age'].lt(n)]

print('Rows where cyclist age is smaller than ', n, ':', sep='')
# Print the row label, the '_url' and the corresponding 'cyclist_age'
for index, url, age in zip(filtered_data.index, filtered_data['_url'], filtered_data['cyclist_age']):
    print(index, url, age)

In [None]:
# Upper threshold for 'cyclist_age'
n = 50
# Rows whose cyclist_age is above the threshold
filtered_data = dataset.loc[dataset['cyclist_age'].gt(n)]

print('Rows where cyclist_age is greater than ', n, ':', sep='')
# Print the row label, the '_url' and the corresponding 'cyclist_age'
for index, url, age in zip(filtered_data.index, filtered_data['_url'], filtered_data['cyclist_age']):
    print(index, url, age)

## Check on 'is_tarmac' data

Now we consider the `is_tarmac` column, and check the number of null values and the count the occurrences of each unique value

In [None]:
# Absolute number of nulls in 'is_tarmac' and their share of all rows (two decimals)
print(
    'Total number of null values in is_tarmac column: ', dataset['is_tarmac'].isna().sum(),
    ' (', round(dataset['is_tarmac'].isna().sum() / dataset.shape[0] * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct 'is_tarmac' value
is_tarmac_counts = dataset['is_tarmac'].value_counts()
print('\nCount occurrences of each value in is_tarmac column:')
print(is_tarmac_counts)

We have two different values, and no null values. Also, we see that every value is syntactically correct

For each url, check if all the `is_tarmac` values are the same

In [None]:
# Number of distinct is_tarmac values recorded under each race url
is_tarmac_uniques = dataset.groupby('_url')['is_tarmac'].nunique()
# Urls whose rows disagree on is_tarmac
multiple_is_tarmac_urls = is_tarmac_uniques.loc[is_tarmac_uniques.gt(1)].index

print('Number of URLs with more than one unique is_tarmac: ', len(multiple_is_tarmac_urls), sep='')

## Check on 'is_cobbled' data

Now we consider the `is_cobbled` column, and check the number of null values and the count the occurrences of each unique value

In [None]:
# Absolute number of nulls in 'is_cobbled' and their share of all rows (two decimals)
print(
    'Total number of null values in is_cobbled column: ', dataset['is_cobbled'].isna().sum(),
    ' (', round(dataset['is_cobbled'].isna().sum() / len(dataset) * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct 'is_cobbled' value
is_cobbled_counts = dataset['is_cobbled'].value_counts()
print('\nCount occurrences of each value in is_cobbled column:')
print(is_cobbled_counts)

We have one value, and no null values. Also, we see that the value is syntactically correct

## Check on 'is_gravel' data

Now we consider the `is_gravel` column, and check the number of null values and the count the occurrences of each unique value

In [None]:
# Absolute number of nulls in 'is_gravel' and their share of all rows (two decimals)
print(
    'Total number of null values in is_gravel column: ', dataset['is_gravel'].isna().sum(),
    ' (', round(dataset['is_gravel'].isna().sum() / len(dataset) * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct 'is_gravel' value
is_gravel_counts = dataset['is_gravel'].value_counts()
print('\nCount occurrences of each value in is_gravel column:')
print(is_gravel_counts)

We have one value, and no null values. Also, we see that the value is syntactically correct

## Check on 'cyclist_team' data

Now we consider the `cyclist_team` column, and check the number of null values and the count the occurrences of each unique value

In [214]:
# Absolute number of nulls in 'cyclist_team' and their share of all rows (two decimals)
print(
    'Total number of null values in cyclist_team column: ', dataset['cyclist_team'].isna().sum(),
    ' (', round(dataset['cyclist_team'].isna().sum() / len(dataset) * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct team
cyclist_team_counts = dataset['cyclist_team'].value_counts()
print('\nCount occurrences of each value in cyclist_team column:')
print(cyclist_team_counts)

Total number of null values in cyclist_team column: 159161 (26.98%)

Count occurrences of each value in cyclist_team column:
cyclist_team
liberty-seguros-wurth-team-2005     8869
roompot-nederlandse-loterij-2018    8773
chazal-vetta-mbk-1993               8094
kondor-1979                         7895
kazakhstan-2019                     7701
                                    ... 
atala-ofmega-1988                   1259
finland-2016                        1236
south-africa-1993                   1174
denmark-2003                         216
quickstep-innergetic-2009              3
Name: count, Length: 91, dtype: int64


We have different values, and a lot of null values.

Since we have a lot of different values, we check if every value is syntactically correct

In [114]:
# Check that every non-null 'cyclist_team' has the form team-year, i.e. some
# text, a dash, and exactly four digits at the very end.
# The check is vectorized (no Python loop over every row of the dataset) and
# uses fullmatch so that the pattern must cover the whole value: an unanchored
# match would accept trailing characters after the year.
team_names = dataset['cyclist_team']
malformed = team_names.notna() & ~team_names.str.fullmatch(r'.+-\d{4}', na=False)
# Print the race url and the offending team, one per line
for url, team in zip(dataset.loc[malformed, '_url'], team_names[malformed]):
    print(url, team)

In [116]:
# Check that every non-null 'cyclist_team' has digits as its last four characters.
# Vectorized: the null teams are filtered out first, then the suffix test runs
# on the whole column at once instead of looping over every row of the dataset.
teams_present = dataset.loc[dataset['cyclist_team'].notna(), ['_url', 'cyclist_team']]
bad_suffix = teams_present[~teams_present['cyclist_team'].str[-4:].str.isdigit()]
# Print the race url and the offending team, one per line
for url, team in zip(bad_suffix['_url'], bad_suffix['cyclist_team']):
    print(url, team)

Check if the same cyclist is in two cyclist teams

## Check on 'delta' data

Now we consider the `delta` column, and check the number of null values and the count the occurrences of each unique value

In [None]:
# Absolute number of nulls in 'delta' and their share of all rows (two decimals)
print(
    'Total number of null values in delta column: ', dataset['delta'].isna().sum(),
    ' (', round(dataset['delta'].isna().sum() / len(dataset) * 100, 2), '%)',
    sep='',
)

# Frequency of each distinct 'delta' value
delta_counts = dataset['delta'].value_counts()
print('\nCount occurrences of each value in delta column:')
print(delta_counts)

We have lots of different values, but no null values.

Since we have a lot of different values, we check if every value is syntactically correct

In [19]:
# Check that every non-null 'delta', written as text, ends with '.0'.
# Vectorized: the test runs on the whole column at once instead of looping
# over every row of the dataset; the printed lines are unchanged.
deltas_present = dataset.loc[dataset['delta'].notna(), ['_url', 'delta']]
not_whole = deltas_present[~deltas_present['delta'].astype(str).str.endswith('.0')]
# Print the race url and the offending delta, one per line
for url, delta_value in zip(not_whole['_url'], not_whole['delta']):
    print(url, delta_value)

In [None]:
# For each non-null 'delta', check that its text without the last two
# characters consists of digits only
for url, raw_delta in zip(dataset['_url'], dataset['delta']):
    if pd.isnull(raw_delta):
        continue

    # Drop the last two characters of the textual form of 'delta'
    delta = str(raw_delta)[:-2]
    if delta.isdigit():
        continue

    print(url, raw_delta)

In [None]:
# Print the race url and the value of every 'delta' that is less than 0.
# The comparison is False for missing values, so nulls are skipped without an
# explicit test. The former trailing `delta.isdigit()` test was a copy-paste
# leftover that read a variable leaked from another cell; it is removed so the
# cell depends only on `dataset`.
negative_deltas = dataset.loc[dataset['delta'] < 0, ['_url', 'delta']]
for url, delta_value in zip(negative_deltas['_url'], negative_deltas['delta']):
    print(url, delta_value)

Check whether, following the `position` order, the `delta` values are ordered too