Cleaning the dataset is the first part of the Tardis project. 
We have to select which values are mandatory, which must be corrected and which must be ignored.

In [255]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt

The clean_average function handles the average column. It returns a new dataframe with the column specified according to the following rules
    - N/A or <null> value -> 0.0
    - Values formated to HOUR/MIN/SEC (as a string)

In [256]:
def clean_average(column, dataframe):
    df = dataframe.copy()
    
    #possibly add verif for aberrant values
    df[column] = df[column].fillna(0.0)
    total_seconds = (df[column] * 60).astype(int)
    hours = total_seconds // 3600
    minutes = (total_seconds % 3600) // 60
    seconds = total_seconds % 60
    df[column] = hours.astype(str) + '.' + minutes.astype(str) + '.' + seconds.astype(str)
    return df

def hms_to_minutes(hms):
    if pd.isna(hms):
        return None
    parts = str(hms).split('.')
    if len(parts) == 3:
        hours, minutes, seconds = map(int, parts)
        return hours * 60 + minutes + seconds / 60
    else:
        return None

The clean_Pct function handles the Pct column. It returns a new dataframe with the column specified according to the following rules
    - Pct > 100 -> <null>
    - round values to 2 digits

In [257]:
def clean_Pct(column, dataframe):
    df = dataframe.copy()

    df[column] = df[column].where(df[column] < 100.0, np.nan)
    df[column] = df[column].round(2)
    return df

The handle_column_with_train_number handles the columns that contain a number of train. It returns a new dataframe with the column specified according to the following rules:
    - replace non numeric values or float values to null
    - check if the values is < to the number of train at departure (because it is not possible to have more delayed trains then trains at departure)

In [258]:
def handle_column_with_train_number(column, dataframe):
    df = dataframe.copy()

    df[column] = df[column].replace(['N/A', ''], np.nan)
    scheduled = pd.to_numeric(df['Number of scheduled trains'].replace(['N/A', ''], np.nan), errors='coerce')
    df[column] = df[column].where(df[column] <= scheduled, np.nan)
    df[column] = pd.to_numeric(df[column], errors='coerce')
    df[column] = df[column].where(df[column].apply(lambda x: pd.isna(x) or x.is_integer()))
    df[column] = df[column].fillna(0).astype(int)
    return df


The tab containing all the destinations for spelling correction and the diff function to do it.

In [259]:
all_city = [
    "PARIS MONTPARNASSE",
    "QUIMPER",
    "ST MALO",
    "ST PIERRE DES CORPS",
    "STRASBOURG",
    "PARIS NORD",
    "LYON PART DIEU",
    "TOURCOING",
    "NANTES",
    "PARIS VAUGIRARD",
    "BORDEAUX ST JEAN",
    "PARIS LYON",
    "MARNE LA VALLEE",
    "CHAMBERY CHALLES LES EAUX",
    "MARSEILLE ST CHARLES",
    "FRANCFORT",
    "ZURICH",
    "ANGOULEME",
    "POITIERS",
    "TOURS",
    "METZ",
    "REIMS",
    "PARIS EST",
    "DOUAI",
    "MULHOUSE VILLE",
    "VALENCE ALIXAN TGV",
    "STUTTGART",
    "BARCELONA",
    "ANGERS SAINT LAUD",
    "LAVAL",
    "NANCY",
    "LILLE",
    "GRENOBLE",
    "LE CREUSOT MONTCEAU MONTCHANIN",
    "MACON LOCHE",
    "NIMES",
    "ITALIE",
    "RENNES",
    "BREST",
    "LA ROCHELLE VILLE",
    "LE MANS",
    "VANNES",
    "DUNKERQUE",
    "AVIGNON TGV",
    "BELLEGARDE (AIN)",
    "BESANCON FRANCHE COMTE TGV",
    "DIJON VILLE",
    "MONTPELLIER",
    "MADRID",
    "ARRAS",
    "AIX EN PROVENCE TGV",
    "ANNECY",
    "NICE VILLE",
    "SAINT ETIENNE CHATEAUCREUX",
    "TOULON",
    "GENEVE",
    "PERPIGNAN",
    "LAUSANNE",
    "TOULOUSE MATABIAU"
]

def diff(string, to_cmp):
    nb_error = 0
    for i in range(len(string)):
        if string[i] != to_cmp[i]:
            nb_error += 1
    return nb_error

We start by reading he csv file and removing duplicates values.

In [260]:
df = pd.read_csv('dataset.csv', delimiter=';', on_bad_lines='warn')
df.drop_duplicates(inplace=True, subset=['Date', 'Departure station', 'Arrival station'])
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', None)

We clean the 'Date' column by applying a YYYY/MM format, correcting small mistakes and removing incoherent values like e.g. 2018-14 or 2142-03
We decided to not make the 'Date' mandatory information, so a null value doesn't remove the entire line.

In [261]:
df.dropna(subset=['Date'], inplace=True)
df["Date"] = df["Date"].str.replace('[^0-9]', '', regex=True)
df["Date"] = df["Date"].apply(lambda x: str(x))

valid_format = df['Date'].str.match(r'^\d{6}$')
df['Date'] = df['Date'].where(valid_format, '--------')

df["Date"] = df["Date"].apply(lambda x: x[0:4] + '-' + x[4:6])
df["Date"] = df["Date"].str.replace('nan-', '')
df["Date"] = df["Date"].str.replace('-------', '')

df['Date'] = df['Date'].where(pd.to_numeric(df['Date'].str[-2:].astype('str')) <= 12, '')
df['Date'] = df['Date'].where(pd.to_numeric(df['Date'].str[:4].astype('str')) <= 2025, '')
df = df.drop(df[df['Date'] == ''].index)

We create a new column 'Month' for future predictions

In [262]:
df['Month'] = pd.to_datetime(df['Date'], format='%Y-%m', errors='coerce').dt.month_name()

We clean the 'Service' column by correcting the spelling mistakes
We decided to not make the 'Service' mandatory information, so a null value doesn't remove the entire line.

In [263]:
df["Service"] = df["Service"].apply(lambda x: str(x))
df["Service"] = df["Service"].str.replace('nan', '')
df['Service'] = df['Service'].where(df["Service"].str.len() != 8, 'National')
df['Service'] = df['Service'].where(df["Service"].str.len() != 13, 'International')
df['Service'] = df['Service'].where((df["Service"].str.len() == 13) | (df['Service'].str.len() == 8), '')

We clean the 'Departure station' and 'Arrival station' by correcting the spelling mistakes.
We decided to make the 'Departure station' and 'Arrival station' mandatory information, to allow us to locate and verify further informations.

In [264]:
df["Departure station"] = df["Departure station"].apply(lambda x: str(x))
df = df.drop(df[df["Departure station"] == 'nan'].index)
df["Arrival station"] = df["Arrival station"].apply(lambda x: str(x))
df = df.drop(df[df["Arrival station"] == 'nan'].index)

In [265]:
for j in range(len(df)):
    station = df.at[df.index[j], 'Departure station']
    
    for element in all_city:
        if len(station) != len(element):
            continue
        if diff(station, element) == 1:
            df.at[df.index[j], 'Departure station'] = 'non valid'
            break

df = df.drop(df[df["Departure station"] == 'non valid'].index)

We convert the columns that represents a number of train to int (because you cannot have a half train) and verify if the values are coherent with the number of trains at departure
We also remove the lines where they are no scheduled trains

In [266]:
df = handle_column_with_train_number('Number of scheduled trains', df)
df = df.drop(df[df["Number of scheduled trains"] == 0].index)
df = handle_column_with_train_number('Number of cancelled trains', df)
df = handle_column_with_train_number('Number of trains delayed at departure', df)
df = handle_column_with_train_number('Number of trains delayed at arrival', df)
df = handle_column_with_train_number('Number of trains delayed > 15min', df)
df = handle_column_with_train_number('Number of trains delayed > 30min', df)
df = handle_column_with_train_number('Number of trains delayed > 60min', df)

We apply the "clean_average" function on all the average columns.
It allows us to make verification clearer and easier, and then we put them back in float format for future predictions (and for Marvin)

In [267]:
df = clean_average('Average journey time', df)
df = clean_average('Average delay of late trains at departure', df)
df = clean_average('Average delay of late trains at arrival', df)
df = clean_average('Average delay of all trains at arrival', df)
df = clean_average('Average delay of trains > 15min (if competing with flights)', df)

columns_to_convert = [
    "Average journey time",
    "Average delay of late trains at departure",
    "Average delay of all trains at departure",
    "Average delay of late trains at arrival",
    "Average delay of all trains at arrival",
    "Average delay of trains > 15min (if competing with flights)"
]

for col in columns_to_convert:
    if col in df.columns:
        df[col] = df[col].apply(hms_to_minutes)

df = df.drop(df[df["Average journey time"] >= 1440].index)

We check if the percentages are coherent (nothing > 100%) and then we apply the "clean_Pct" function on all the Pct columns.
Percentages are only useful in visualization so we don't drop the entire line if the percentages are wrong.

In [268]:
# Reset index if necessary
df = df.reset_index(drop=True)

columns_to_check = [
    'Pct delay due to passenger handling (crowding, disabled persons, connections)',
    'Pct delay due to station management and equipment reuse',
    'Pct delay due to traffic management',
    'Pct delay due to rolling stock',
    'Pct delay due to infrastructure',
    'Pct delay due to external causes'
]

for i in range(len(df)):
    total_pct = sum(df[col][i] for col in columns_to_check)

    if not (99 <= total_pct <= 101):
        for col in columns_to_check:
            df.at[i, col] = np.nan

df = clean_Pct("Pct delay due to passenger handling (crowding, disabled persons, connections)", df)
df = clean_Pct('Pct delay due to station management and equipment reuse',df)
df = clean_Pct('Pct delay due to rolling stock', df)
df = clean_Pct('Pct delay due to traffic management', df)
df = clean_Pct('Pct delay due to infrastructure', df)
df = clean_Pct('Pct delay due to external causes', df)

Finally, we convert our cleaned dataframe into a new csv file

In [269]:
df.to_csv('cleaned_dataset.csv', index=False)

In [271]:
df

Unnamed: 0,Date,Service,Departure station,Arrival station,Average journey time,Number of scheduled trains,Number of cancelled trains,Cancellation comments,Number of trains delayed at departure,Average delay of late trains at departure,Average delay of all trains at departure,Departure delay comments,Number of trains delayed at arrival,Average delay of late trains at arrival,Average delay of all trains at arrival,Arrival delay comments,Number of trains delayed > 15min,Average delay of trains > 15min (if competing with flights),Number of trains delayed > 30min,Number of trains delayed > 60min,Pct delay due to external causes,Pct delay due to infrastructure,Pct delay due to traffic management,Pct delay due to rolling stock,Pct delay due to station management and equipment reuse,"Pct delay due to passenger handling (crowding, disabled persons, connections)",Month
0,2018-01,National,LA ROCHELLE VILLE,PARIS MONTPARNASSE,165.00,222,0,,8,2.87,,,34,21.52,5.68,,22,5.68,5,0,15.38,30.77,38.46,11.54,3.85,0.00,January
1,2018-01,National,PARIS MONTPARNASSE,QUIMPER,220.00,248,1,,37,9.50,,,0,55.68,7.57,"Ce mois-ci, l'OD a été touchée par les inciden...",26,7.53,17,7,,,,,,,January
2,2018-01,National,PARIS MONTPARNASSE,ST MALO,156.00,102,0,,12,19.90,,,13,48.62,6.78,"Ce mois-ci, l'OD a été touchée par les inciden...",8,6.72,6,4,,,,,,,January
3,2018-01,National,PARIS MONTPARNASSE,ST PIERRE DES CORPS,61.00,391,2,,61,0.00,,,71,12.40,3.33,,17,3.33,6,0,21.21,42.42,9.09,21.21,6.06,0.00,January
4,2018-01,National,QUIMPER,PARIS MONTPARNASSE,223.00,256,1,,0,16.32,,,21,39.08,5.93,,21,5.92,9,0,,,,,,,January
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6884,2024-12,National,PARIS MONTPARNASSE,ST PIERRE DES CORPS,64.00,539,10,,49,12.23,,,63,0.00,2.72,,22,27.92,5,1,6.56,16.39,29.51,29.51,11.48,6.56,December
6885,2024-12,National,PARIS MONTPARNASSE,VANNES,158.00,284,2,,27,22.48,,,36,41.57,0.00,,26,52.67,0,0,33.33,16.67,19.44,22.22,5.56,2.78,December
6886,2024-12,National,QUIMPER,PARIS MONTPARNASSE,231.00,274,1,,0,0.00,,,39,56.15,9.23,,39,56.15,0,13,41.03,20.51,7.69,17.95,7.69,5.13,December
6887,2024-12,National,SAINT ETIENNE CHATEAUCREUX,PARIS LYON,168.00,110,0,,20,118.28,,,25,28.62,8.00,,19,33.40,6,1,,,,,,,December
