In [1]:
import pandas as pd
import numpy as np
import git
import regex as re
pd.set_option("display.max_columns", None)

In [2]:
repo = git.Repo(".", search_parent_directories=True).git.rev_parse("--show-toplevel")
df = pd.read_csv(f"{repo}/data/mtd_combined.csv")
df = df[(df["state"] == "completed")]


In [3]:
def check_format(df, col_type_dict):
    # check time format in order to avoid errors in cleaning
    for col, col_type in col_type_dict.items():
        if col_type == "timestamp":
            df = df[(df[col].str.match(r'[0-9]{1,4}.[0-9]{1,2}.[0-9]{1,2} [0-9]{1,2}:[0-9]{1,2}') == True) | (df[col].isna())]
            df_inconsistencies = df[~((df[col].str.match(r'[0-9]{1,4}.[0-9]{1,2}.[0-9]{1,2} [0-9]{1,2}:[0-9]{1,2}') == True) | (df[col].isna()))]
        elif col_type == "time":
            df = df[(df[col].str.match(r'[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}') == True) | (df[col].str.contains("1899")) | (df[col].str.contains("1900") | (df[col].isna()))]
            df_inconsistencies = df[~((df[col].str.match(r'[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}') == True) | (df[col].str.contains("1899")) | (df[col].str.contains("1900")) | (df[col].isna()))]
    return (df, df_inconsistencies)

In [20]:
df, df_inconsistencies = check_format(df, {"created_at": "timestamp"}) # here use all time related columns in the dict

In [4]:
def clean_created_at(df):
    created_at = pd.to_datetime(df["created_at"])
    return created_at

In [5]:
df["created_at"] = clean_created_at(df)

In [6]:
def clean_scheduled_to(df):
    scheduled_to = pd.to_datetime(df["scheduled_to"])
    scheduled_to = scheduled_to.fillna(df["created_at"])

    scheduled_to = np.where(
        scheduled_to < df["created_at"],
        df["created_at"],
        scheduled_to
    )
    return scheduled_to

In [7]:
df["scheduled_to"] = clean_scheduled_to(df)

In [8]:
def clean_dispatched_at(df): 
    # Cast to correct dtype
    dispatched_at = pd.to_datetime(df["dispatched_at"])
    
    # Fill missing values of dispatched_at
    dispatched_at = np.where(
        dispatched_at.isna(),
        df["created_at"],
        dispatched_at
    )
    
    # Check correct ordering
    dispatched_at = np.where(
        (dispatched_at <= df["created_at"]) | (dispatched_at <= df["scheduled_to"] - pd.Timedelta(minutes = 9)),
        df["scheduled_to"] - pd.Timedelta(minutes = 8), 
        dispatched_at
    )
    dispatched_at = pd.to_datetime(dispatched_at)
    return dispatched_at

In [9]:
df["dispatched_at"] = clean_dispatched_at(df)

In [10]:
def clean_vehicle_arrived_at(df):
    arriving_push = pd.to_datetime(df["arriving_push"])
    vehicle_arrived_at = pd.to_datetime(df["vehicle_arrived_at"])
    pickup_at = pd.to_datetime(df["pickup_at"])
    
    times = [3600,60,1]
    pickup_arrival_time = df["pickup_arrival_time"].fillna("-9")
    pickup_arrival_time = pd.Series(np.where(
        pickup_arrival_time.str.contains("1899"),
        "-9",
        pickup_arrival_time
    ))
    
    pickup_arrival_time = pickup_arrival_time.str[0:8].apply(lambda row : sum([a*b for a,b in zip(times, map(int,row.split(':'))) if len(row) == 8]))

    avg_pickup_arrival_time = sum(x for x in pickup_arrival_time if x != -9) / len(list(x for x in pickup_arrival_time if x != -9))
    print(avg_pickup_arrival_time)

    vehicle_arrived_at = np.where(
        vehicle_arrived_at.isna(),
        np.where(
            (df["dispatched_at"] + pd.Timedelta(seconds=avg_pickup_arrival_time)
            < pickup_at) | (pickup_at.isna() == True),
            df["dispatched_at"] + pd.Timedelta(seconds=avg_pickup_arrival_time),
            pickup_at,
        ),
        vehicle_arrived_at,
    )

    vehicle_arrived_at = pd.to_datetime(vehicle_arrived_at)

    # Check ordering
    vehicle_arrived_at = np.where(
        (vehicle_arrived_at < arriving_push) | (vehicle_arrived_at + pd.Timedelta(minutes = 60) < df["scheduled_to"]),
        np.where(
            arriving_push.isna(),
            np.where(
                (df["dispatched_at"] + pd.Timedelta(seconds=avg_pickup_arrival_time) < df["pickup_at"]) | (df["pickup_at"].isna() == True),
                df["dispatched_at"] + pd.Timedelta(seconds=avg_pickup_arrival_time),
                df["pickup_at"]
            ),
            arriving_push
        
        ),
        vehicle_arrived_at
    )

    vehicle_arrived_at = pd.to_datetime(vehicle_arrived_at)
    vehicle_arrived_at = vehicle_arrived_at.floor("s")
    
    return vehicle_arrived_at

In [11]:
df["vehicle_arrived_at"] = clean_vehicle_arrived_at(df)

433.89880707527766


In [12]:
def clean_arriving_push(df):
    arriving_push = pd.to_datetime(df["arriving_push"])
    arriving_push = arriving_push.fillna(
        df["vehicle_arrived_at"] - pd.Timedelta(minutes=3)
    )
    return arriving_push

In [13]:
df["arriving_push"] = clean_arriving_push(df)

In [14]:
def clean_pickup_arrival_time(df):
    pickup_arrival_time = (df['vehicle_arrived_at'] - df['dispatched_at']).dt.seconds

    return pickup_arrival_time

In [15]:
df["pickup_arrival_time"] = clean_pickup_arrival_time(df)

In [17]:
def clean_earlierst_pickup_expectation(df):
    earlierst_pickup_expectation = pd.to_datetime(df["earliest_pickup_expectation"])
    earlierst_pickup_expectation = df["scheduled_to"] - pd.Timedelta(minutes = 3)

    return earlierst_pickup_expectation

In [18]:
df["earliest_pickup_expectation"] = clean_earlierst_pickup_expectation(df)

In [19]:
def clean_pickup_at(df):
    pickup_at = pd.to_datetime(df["pickup_at"])
 
    boarding_time = pd.Series(np.where(
        df["vehicle_arrived_at"] < pickup_at,
        (pickup_at - df["vehicle_arrived_at"]).dt.seconds,
        -9
    ))
    boarding_time = boarding_time.fillna(-9)

    avg_boarding_time = sum(x for x in boarding_time if x != -9) / len(list(x for x in boarding_time if x != -9))
    print(avg_boarding_time)

    pickup_at = np.where(
        pickup_at.isna() == True,
        np.where(
            df["pickup_eta"].isna(),
            df['vehicle_arrived_at'] +  pd.Timedelta(avg_boarding_time),
            df["pickup_eta"]
        ),
        pickup_at
    )

    pickup_at = pd.to_datetime(pickup_at) 


    ## Check ordering
    pickup_at = np.where(
        (pickup_at < df["vehicle_arrived_at"]),
        np.where(
            df["pickup_eta"].isna(),
            np.where(
                (df["vehicle_arrived_at"] + pd.Timedelta(seconds=avg_boarding_time) < df["dropoff_at"]) | (df["dropoff_at"].isna() == True),
                df["vehicle_arrived_at"] + pd.Timedelta(seconds=avg_boarding_time),
                df["dropoff_at"]

            ),
            df["pickup_eta"]
        
        ),
        pickup_at
    )

    pickup_at = pd.to_datetime(pickup_at) 

    return pickup_at



In [20]:
df["pickup_at"] = clean_pickup_at(df)

77.06826286296484


In [21]:
def clean_pickup_eta(df):
    pickup_eta = pd.to_datetime(df["pickup_eta"])

    pickup_eta = pickup_eta.fillna(df["pickup_at"])

    return pickup_eta

In [22]:
df["pickup_eta"] = clean_pickup_eta(df)

In [23]:
def clean_pickup_first_eta(df):
    pickup_first_eta = pd.to_datetime(df["pickup_first_eta"])

    pickup_first_eta = pickup_first_eta.fillna(df["pickup_eta"])

    return pickup_first_eta

In [24]:
df["pickup_first_eta"] = clean_pickup_first_eta(df)

In [25]:
def clean_dropoff_at(df):
    dropoff_at = pd.to_datetime(df["dropoff_at"])
    ftr = [3600,60,1]
    shortest_ridetime = df["shortest_ridetime"].str[0:8].apply(lambda row : sum([a*b for a,b in zip(ftr, map(int,row.split(':')))]))

    dropoff_at = np.where(
        dropoff_at.isna(),
        np.where(
            df["dropoff_eta"].isna(),
            dropoff_at + pd.to_timedelta(shortest_ridetime),
            df["dropoff_eta"]
        ),
        dropoff_at
    )
    dropoff_at = pd.to_datetime(dropoff_at) 


    # Check ordering
    dropoff_at = np.where(
        (dropoff_at < df["pickup_at"]),
        np.where(
            df["dropoff_eta"].isna(),
            dropoff_at + pd.to_timedelta(shortest_ridetime),
            df["dropoff_eta"]
        
        ),
        dropoff_at
    )

    dropoff_at = pd.to_datetime(dropoff_at) 

    return dropoff_at

In [26]:
df["dropoff_at"] = clean_dropoff_at(df)

In [27]:
def clean_dropoff_eta(df):
    dropoff_eta = pd.to_datetime(df["dropoff_eta"])

    dropoff_eta = dropoff_eta.fillna(df["dropoff_at"])

    return dropoff_eta

In [28]:
df["dropoff_eta"] = clean_dropoff_eta(df)

In [29]:
def clean_dropoff_first_eta(df):
    dropoff_first_eta = pd.to_datetime(df["dropoff_first_eta"])
    ftr = [3600,60,1]
    shortest_ridetime = df["shortest_ridetime"].str[0:8].apply(lambda row : sum([a*b for a,b in zip(ftr, map(int,row.split(':')))]))

    dropoff_first_eta = dropoff_first_eta.fillna(df["pickup_first_eta"] + pd.to_timedelta(shortest_ridetime))

    return dropoff_first_eta

In [30]:
df["dropoff_first_eta"] = clean_dropoff_first_eta(df)

In [135]:
df.to_csv(f"{repo}/data/data_cleaned.csv")