In [212]:
import pandas as pd
import numpy as np
from numpy import NaN
import datetime as dt 
from datetime import timedelta
from time import gmtime
from time import strftime
import git
pd.set_option("display.max_columns", None)

In [213]:
# Locate the repository root so the notebook can be started from any
# sub-directory, then load the simulated rides. Adjust the relative path below
# if your copy of the file lives elsewhere.
repo = git.Repo(".", search_parent_directories=True).git.rev_parse("--show-toplevel")
simulated_path = f"{repo}/vehicle_stream_pipeline/simulated.xlsx"
df = pd.read_excel(simulated_path)
# Uncomment to inspect the dtypes of the loaded columns:
#df.dtypes

In [214]:
# Show the first row to see which columns the loaded sheet contains.
df.head(1)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,pickup_address,dropoff_address,state,created_from_offer,created_at,scheduled_to,dispatched_at,pickup_arrival_time,arriving_push,vehicle_arrived_at,earliest_pickup_expectation,pickup_first_eta,pickup_eta,pickup_at,dropoff_first_eta,dropoff_eta,dropoff_at,updated_at,arrival_deviation,waiting_time,boarding_time,ride_time,trip_time,shortest_ridetime,delay,longer_route_factor,arrival_indicator,rating,rating_puenktlichkeit,rating_sauberkeit,rating_fahrer,rating_find_modstop,rating_other_comments,cancellation_reason,cancellation_comment,bahn_card_number,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,index
0,0,,1659084933-0,0-1659084933,6046,1,,,,0.0,STANDARD,4025,1009,completed,,2022-06-01 15:21:31,2022-06-01 15:21:31,2022-06-01 15:21:31,300,2022-06-01 15:24:59.022,2022-06-01 15:26:31,2022-06-01 15:24:31,2022-06-01 15:24:31,2022-06-01 15:30:12,2022-06-01 15:27:03,2022-06-01 15:36:36.520,2022-06-01 15:38:12,2022-06-01 15:36:57,,-88,120,32,594,714,725.52,-11.52,0.82,,5,,,,,,,,,,,,,,


In [215]:
# Parse every timestamp column into datetime objects so that the ordering and
# duration checks below can compare and subtract them.
timestamp_columns = [
    'created_at',
    'scheduled_to',
    'dispatched_at',
    'canceled_at',
    'arriving_push',
    'vehicle_arrived_at',
    'earliest_pickup_expectation',
    'pickup_first_eta',
    'pickup_eta',
    'pickup_at',
    'dropoff_first_eta',
    'dropoff_eta',
    'dropoff_at',
    'updated_at',
]
for timestamp_column in timestamp_columns:
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])

### Timestamp Ordering 

In [216]:
# Flag rides whose created_at lies after a timestamp that is compared against it.
def check_created_ordering(df):
    """Label each ride with the first timestamp that lies before ``created_at``.

    ``created_at`` is compared (``>``) with the columns listed below, in that
    order; the first comparison that holds decides the label, and rows where
    none holds are labelled 'Correct'.

    Writes the labels to ``df['Error: Created ordering']`` in place and
    returns that column.
    """
    # column compared against created_at -> label used when created_at is later
    label_by_column = {
        'scheduled_to': 'Created > Scheduled',
        'dispatched_at': 'Created > Dispatched',
        'arriving_push': 'Created > ArrivingPush',
        'vehicle_arrived_at': 'Created > VehicleArrivedAt',
        'earliest_pickup_expectation': 'Created > EarliestPickupExpectations',
        'pickup_first_eta': 'Created > PickupFirstEta',
        'pickup_eta': 'Created > PickupEta',
        'pickup_at': 'Created > PickupAt',
        'dropoff_first_eta': 'Created > DropoffFirstEta',
        'dropoff_eta': 'Created > DropoffEta',
        'dropoff_at': 'Created > DropoffAt',
        'updated_at': 'Created > UpdatedAt',
        'canceled_at': 'Created > CanceledAt',
    }
    created = df['created_at']
    violations = [created > df[column] for column in label_by_column]

    # np.select keeps the label of the first violated comparison per row
    df['Error: Created ordering'] = np.select(
        violations, list(label_by_column.values()), 'Correct'
    )
    return df['Error: Created ordering']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: Created ordering'] = check_created_ordering(df)


In [217]:
def check_scheduled_ordering(df):
    """Label each ride with the first ordering rule that ``scheduled_to`` breaks.

    A row is flagged when ``scheduled_to`` is earlier than ``dispatched_at``,
    or later than any of the pickup/dropoff/cancel columns listed below. The
    comparisons are evaluated in order and the first hit decides the label;
    rows without a hit are labelled 'Correct'.

    Writes the labels to ``df['Error: Scheduled ordering']`` in place and
    returns that column.
    """
    scheduled = df['scheduled_to']

    # The dispatch comparison is the only one that uses "<".
    violations = [scheduled < df['dispatched_at']]
    labels = ['Scheduled < Dispatched']

    later_columns = {
        'pickup_eta': 'Scheduled > PickupEta',
        'pickup_at': 'Scheduled > PickupAt',
        'dropoff_first_eta': 'Scheduled > DropoffFirstEta',
        'dropoff_eta': 'Scheduled > DropoffEta',
        'dropoff_at': 'Scheduled > DropoffAt',
        'canceled_at': 'Scheduled > CanceledAt',
    }
    for column, label in later_columns.items():
        violations.append(scheduled > df[column])
        labels.append(label)

    df['Error: Scheduled ordering'] = np.select(violations, labels, 'Correct')
    return df['Error: Scheduled ordering']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: Scheduled ordering'] = check_scheduled_ordering(df)

In [218]:
def check_dispatched_ordering(df):
    """Label each ride with the first timestamp that lies before ``dispatched_at``.

    ``dispatched_at`` is compared (``>``) with the columns listed below, in
    that order; the first comparison that holds decides the label, and rows
    where none holds are labelled 'Correct'.

    Writes the labels to ``df['Error: Dispatched ordering']`` in place and
    returns that column.
    """
    label_by_column = {
        'arriving_push': 'Dispatched > ArrivingPush',
        'vehicle_arrived_at': 'Dispatched > VehicleArrivedAt',
        'earliest_pickup_expectation': 'Dispatched > EarliestPickupExpectations',
        'pickup_first_eta': 'Dispatched > PickupFirstEta',
        'pickup_eta': 'Dispatched > PickupEta',
        'pickup_at': 'Dispatched > PickupAt',
        'dropoff_first_eta': 'Dispatched > DropoffFirstEta',
        'dropoff_eta': 'Dispatched > DropoffEta',
        'dropoff_at': 'Dispatched > DropoffAt',
        'updated_at': 'Dispatched > UpdatedAt',
        'canceled_at': 'Dispatched > CanceledAt',
    }
    dispatched = df['dispatched_at']
    violations = [dispatched > df[column] for column in label_by_column]

    df['Error: Dispatched ordering'] = np.select(
        violations, list(label_by_column.values()), 'Correct'
    )
    return df['Error: Dispatched ordering']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: Dispatched ordering'] = check_dispatched_ordering(df)

In [219]:
def check_arriving_push_ordering(df):
    """Label each ride with the first ordering rule that ``arriving_push`` breaks.

    Each rule names a column, whether a violation means ``arriving_push`` is
    'after' (``>``) or 'before' (``<``) that column, and the label to assign.
    Rules are evaluated in order and the first hit wins; rows without a hit
    are labelled 'Correct'.

    Writes the labels to ``df['Error: ArrivingPush ordering']`` in place and
    returns that column.
    """
    rules = [
        ('vehicle_arrived_at', 'after', 'ArrivingPush > VehicleArrivedAt'),
        ('earliest_pickup_expectation', 'before', 'ArrivingPush < EarliestPickupExpectations'),
        ('pickup_eta', 'after', 'ArrivingPush > PickupEta'),
        ('pickup_at', 'after', 'ArrivingPush > PickupAt'),
        ('dropoff_first_eta', 'after', 'ArrivingPush > DropoffFirstEta'),
        ('dropoff_eta', 'after', 'ArrivingPush > DropoffEta'),
        ('dropoff_at', 'after', 'ArrivingPush > DropoffAt'),
        ('updated_at', 'after', 'ArrivingPush > UpdatedAt'),
        ('canceled_at', 'after', 'ArrivingPush > CanceledAt'),
    ]
    push = df['arriving_push']
    violations = [
        push > df[column] if relation == 'after' else push < df[column]
        for column, relation, _ in rules
    ]
    labels = [label for _, _, label in rules]

    df['Error: ArrivingPush ordering'] = np.select(violations, labels, 'Correct')
    return df['Error: ArrivingPush ordering']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: ArrivingPush ordering'] = check_arriving_push_ordering(df)

In [220]:
def check_vehicle_arrived_ordering(df):
    """Label each ride with the first timestamp that lies before ``vehicle_arrived_at``.

    ``vehicle_arrived_at`` is compared (``>``) with the columns listed below,
    in that order; the first comparison that holds decides the label, and rows
    where none holds are labelled 'Correct'.

    Writes the labels to ``df['Error: VehicleArrived ordering']`` in place and
    returns that column.
    """
    label_by_column = {
        'earliest_pickup_expectation': 'VehicleArrivedAt > EarliestPickupExpectations',
        'pickup_first_eta': 'VehicleArrivedAt > PickupFirstEta',
        'pickup_eta': 'VehicleArrivedAt > PickupEta',
        'pickup_at': 'VehicleArrivedAt > PickupAt',
        'dropoff_first_eta': 'VehicleArrivedAt > DropoffFirstEta',
        'dropoff_eta': 'VehicleArrivedAt > DropoffEta',
        'dropoff_at': 'VehicleArrivedAt > DropoffAt',
        'updated_at': 'VehicleArrivedAt > UpdatedAt',
        'canceled_at': 'VehicleArrivedAt > CanceledAt',
    }
    arrived = df['vehicle_arrived_at']
    violations = [arrived > df[column] for column in label_by_column]

    df['Error: VehicleArrived ordering'] = np.select(
        violations, list(label_by_column.values()), 'Correct'
    )
    return df['Error: VehicleArrived ordering']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: VehicleArrived ordering'] = check_vehicle_arrived_ordering(df)

In [221]:
def check_eraliest_pickup_expectation_ordering(df):
    """Label each ride with the first timestamp that lies before ``earliest_pickup_expectation``.

    ``earliest_pickup_expectation`` is compared (``>``) with the columns listed
    below, in that order; the first comparison that holds decides the label,
    and rows where none holds are labelled 'Correct'.

    Writes the labels to ``df['Error: EarliestPickupExpect ordering']`` in
    place and returns that column.
    """
    label_by_column = {
        'pickup_first_eta': 'EarliestPickupExpectations > PickupFirstEta',
        'dropoff_first_eta': 'EarliestPickupExpectations > DropoffFirstEta',
        'dropoff_eta': 'EarliestPickupExpectations > DropoffEta',
        'dropoff_at': 'EarliestPickupExpectations > DropoffAt',
        'updated_at': 'EarliestPickupExpectations > UpdatedAt',
        'canceled_at': 'EarliestPickupExpectations > CanceledAt',
    }
    earliest = df['earliest_pickup_expectation']
    violations = [earliest > df[column] for column in label_by_column]

    df['Error: EarliestPickupExpect ordering'] = np.select(
        violations, list(label_by_column.values()), 'Correct'
    )
    return df['Error: EarliestPickupExpect ordering']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: EarliestPickupExpect ordering'] = check_eraliest_pickup_expectation_ordering(df)

In [222]:
def check_pickup_first_eta_ordering(df):
    """Label each ride with the first timestamp that lies before ``pickup_first_eta``.

    ``pickup_first_eta`` is compared (``>``) with the columns listed below, in
    that order; the first comparison that holds decides the label, and rows
    where none holds are labelled 'Correct'.

    Writes the labels to ``df['Error: PFE ordering']`` in place and returns
    that column.
    """
    label_by_column = {
        'dropoff_first_eta': 'PickupFirstEta > DropoffFirstEta',
        'dropoff_eta': 'PickupFirstEta > DropoffEta',
        'dropoff_at': 'PickupFirstEta > DropoffAt',
        'updated_at': 'PickupFirstEta > UpdatedAt',
        'canceled_at': 'PickupFirstEta > CanceledAt',
    }
    first_eta = df['pickup_first_eta']
    violations = [first_eta > df[column] for column in label_by_column]

    df['Error: PFE ordering'] = np.select(
        violations, list(label_by_column.values()), 'Correct'
    )
    return df['Error: PFE ordering']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: PFE ordering'] = check_pickup_first_eta_ordering(df)

In [223]:
def check_pickup_eta_ordering(df):
    """Label each ride with the first timestamp that lies before ``pickup_eta``.

    ``pickup_eta`` is compared (``>``) with the columns listed below, in that
    order; the first comparison that holds decides the label, and rows where
    none holds are labelled 'Correct'.

    Writes the labels to ``df['Error: PickupETA ordering']`` in place and
    returns that column.
    """
    label_by_column = {
        'dropoff_first_eta': 'PickupEta > DropoffFirstEta',
        'dropoff_eta': 'PickupEta > DropoffEta',
        'dropoff_at': 'PickupEta > DropoffAt',
        'updated_at': 'PickupEta > UpdatedAt',
        'canceled_at': 'PickupEta > CanceledAt',
    }
    pickup_eta = df['pickup_eta']
    violations = [pickup_eta > df[column] for column in label_by_column]

    df['Error: PickupETA ordering'] = np.select(
        violations, list(label_by_column.values()), 'Correct'
    )
    return df['Error: PickupETA ordering']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: PickupETA ordering'] = check_pickup_eta_ordering(df)

In [224]:
def check_pickup_ordering(df):
    """Label each ride with the first timestamp that lies before ``pickup_at``.

    ``pickup_at`` is compared (``>``) with the columns listed below, in that
    order; the first comparison that holds decides the label, and rows where
    none holds are labelled 'Correct'.

    Writes the labels to ``df['Error: Pickup ordering']`` in place and returns
    that column.
    """
    label_by_column = {
        'dropoff_first_eta': 'PickupAt > DropoffFirstEta',
        'dropoff_eta': 'PickupAt > DropoffEta',
        'dropoff_at': 'PickupAt > DropoffAt',
        'updated_at': 'PickupAt > UpdatedAt',
        'canceled_at': 'PickupAt > CanceledAt',
    }
    picked_up = df['pickup_at']
    violations = [picked_up > df[column] for column in label_by_column]

    df['Error: Pickup ordering'] = np.select(
        violations, list(label_by_column.values()), 'Correct'
    )
    return df['Error: Pickup ordering']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: Pickup ordering'] = check_pickup_ordering(df)

In [225]:
def check_dropoff_first_eta_ordering(df):
    """Label each ride with the first timestamp that lies before ``dropoff_first_eta``.

    ``dropoff_first_eta`` is compared (``>``) with the columns listed below,
    in that order; the first comparison that holds decides the label, and rows
    where none holds are labelled 'Correct'.

    Writes the labels to ``df['Error: DFE ordering']`` in place and returns
    that column.
    """
    label_by_column = {
        'dropoff_eta': 'DropoffFirstEta > DropoffEta',
        'dropoff_at': 'DropoffFirstEta > DropoffAt',
        'updated_at': 'DropoffFirstEta > UpdatedAt',
        'canceled_at': 'DropoffFirstEta > CanceledAt',
    }
    first_eta = df['dropoff_first_eta']
    violations = [first_eta > df[column] for column in label_by_column]

    df['Error: DFE ordering'] = np.select(
        violations, list(label_by_column.values()), 'Correct'
    )
    return df['Error: DFE ordering']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: DFE ordering'] = check_dropoff_first_eta_ordering(df)

In [226]:
def check_dropoff_eta_ordering(df):
    """Flag rides whose ``dropoff_eta`` lies after ``updated_at`` or ``canceled_at``.

    The ``updated_at`` comparison takes precedence when both hold; rows where
    neither holds are labelled 'Correct'.

    Writes the labels to ``df['Error: DropoffETA ordering']`` in place and
    returns that column.
    """
    dropoff_eta = df['dropoff_eta']
    after_update = dropoff_eta > df['updated_at']
    after_cancel = dropoff_eta > df['canceled_at']

    df['Error: DropoffETA ordering'] = np.where(
        after_update,
        'DropoffEta > UpdatedAt',
        np.where(after_cancel, 'DropoffEta > CanceledAt', 'Correct'),
    )
    return df['Error: DropoffETA ordering']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: DropoffETA ordering'] = check_dropoff_eta_ordering(df)

In [227]:
def check_dropoff_ordering(df):
    """Flag rides whose ``dropoff_at`` lies after ``updated_at`` or ``canceled_at``.

    The ``updated_at`` comparison takes precedence when both hold; rows where
    neither holds are labelled 'Correct'.

    Writes the labels to ``df['Error: Dropoff ordering']`` in place and
    returns that column.
    """
    dropped_off = df['dropoff_at']
    after_update = dropped_off > df['updated_at']
    after_cancel = dropped_off > df['canceled_at']

    df['Error: Dropoff ordering'] = np.where(
        after_update,
        'DropoffAt > UpdatedAt',
        np.where(after_cancel, 'DropoffAt > CanceledAt', 'Correct'),
    )
    return df['Error: Dropoff ordering']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: Dropoff ordering'] = check_dropoff_ordering(df)

In [228]:
def check_updated_ordering(df):
    """Flag rides whose ``updated_at`` lies before ``canceled_at``.

    Such rows are labelled 'Updated < Canceled', all others 'Correct'.

    Writes the labels to ``df['Error: Updated ordering']`` in place and
    returns that column.
    """
    updated_before_cancel = df['updated_at'] < df['canceled_at']
    labels = np.where(updated_before_cancel, 'Updated < Canceled', 'Correct')
    df['Error: Updated ordering'] = labels
    return df['Error: Updated ordering']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: Updated ordering'] = check_updated_ordering(df)

### Check time calculations

##### Dispatched

In [253]:
# check if dispatched_at was calculated correctly 
def check_dispatched_at_column(df):
    """Check ``dispatched_at`` against the "scheduled minus 8 minutes" rule.

    Labels, evaluated in this order:

    * 'Dispatched was empty'  - ``dispatched_at`` is missing.
    * 'Correct' / 'Dispatched != Created' - the ride is scheduled for its
      creation time (``scheduled_to == created_at``) and ``dispatched_at``
      does / does not equal ``created_at``.
    * 'Correct' - the ride is scheduled at least 8 minutes after creation and
      dispatched exactly 8 minutes before ``scheduled_to``, or it is scheduled
      less than 8 minutes after creation and dispatched at ``created_at``.
    * 'Wrong Calculation' - anything else.

    Writes the labels to
    ``df['Error: Dispatched Calc.: Scheduled - 8 Minutes']`` in place and
    returns that column.
    """
    lead_time = pd.Timedelta(minutes=8)
    created = df['created_at']
    scheduled = df['scheduled_to']
    dispatched = df['dispatched_at']

    immediate = scheduled == created
    # ">=" so that a ride scheduled exactly 8 minutes ahead is covered: there
    # scheduled - 8 min equals created_at, and the former strict ">" / "<"
    # pair matched neither branch and reported 'Wrong Calculation'.
    dispatched_ahead = (scheduled >= created + lead_time) & (
        dispatched == scheduled - lead_time
    )
    dispatched_at_creation = (scheduled < created + lead_time) & (
        dispatched == created
    )

    df['Error: Dispatched Calc.: Scheduled - 8 Minutes'] = np.select(
        [
            dispatched.isna(),
            immediate & (dispatched == created),
            immediate,
            dispatched_ahead | dispatched_at_creation,
        ],
        [
            'Dispatched was empty',
            'Correct',
            'Dispatched != Created',  # dispatched and created are not the same
            'Correct',
        ],
        'Wrong Calculation',
    )
    return df['Error: Dispatched Calc.: Scheduled - 8 Minutes']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: Dispatched Calc.: Scheduled - 8 Minutes'] = check_dispatched_at_column(df)

##### Pickup Arrival Time 

In [230]:
# check if pickup_arrival_time was calculated correctly 
def check_pickup_arrival_time(df):
    """Verify ``pickup_arrival_time`` against vehicle_arrived_at - dispatched_at.

    The difference (via ``.dt.seconds``) is stored in the helper column
    ``df['pickup_arrival_time2']``, which is left on ``df``. Rows are labelled
    'Correct' when the stored value equals it, 'Wrong Calculation' when it
    differs and 'Value is NaN' when ``pickup_arrival_time`` is missing.

    Writes the labels to
    ``df['Error: PickupArrivalTime Calc.: VehicleArrivedAt - DispatchedAt']``
    in place and returns that column.
    """
    # NOTE(review): .dt.seconds is only the seconds component of the timedelta
    # (whole days are ignored, negative gaps wrap) - confirm this is intended.
    df['pickup_arrival_time2'] = (df['vehicle_arrived_at'] - df['dispatched_at']).dt.seconds

    result_column = 'Error: PickupArrivalTime Calc.: VehicleArrivedAt - DispatchedAt'
    # The helper column already holds plain numbers; applying .dt.seconds to it
    # a second time raised "Can only use .dt accessor with datetimelike values".
    matches = df['pickup_arrival_time'] == df['pickup_arrival_time2']
    df[result_column] = np.select(
        [df['pickup_arrival_time'].isna(), matches],
        ['Value is NaN', 'Correct'],
        'Wrong Calculation',
    )
    return df[result_column]

# Run the check and store its labels (the function also writes this column on
# df in place).
df['Error: PickupArrivalTime Calc.: VehicleArrivedAt - DispatchedAt'] = check_pickup_arrival_time(df)
# Remove the helper column again. `columns=` replaces the positional axis
# argument, which is deprecated and rejected by pandas 2.x.
df = df.drop(columns='pickup_arrival_time2')

  df = df.drop('pickup_arrival_time2', 1)


##### Earliest Pickup Expectation

In [231]:
# check if earliest_pickup_expectation was calculated correctly 
def check_earliest_pickup_expectation(df):
    """Verify that ``earliest_pickup_expectation`` equals ``dispatched_at`` + 3 minutes.

    Rows are labelled 'Correct' when it does, 'Wrong Calculation' when it does
    not and 'Value is NaN' when ``earliest_pickup_expectation`` is missing.

    Writes the labels to
    ``df['Error: EarliestPickupExpect. Calc.: DispatchedAt + 3 Min']`` in
    place and returns that column.
    """
    result_column = 'Error: EarliestPickupExpect. Calc.: DispatchedAt + 3 Min'
    expected = df['dispatched_at'] + pd.Timedelta(minutes=3)
    matches = pd.to_datetime(df['earliest_pickup_expectation']) == expected

    df[result_column] = np.select(
        [df['earliest_pickup_expectation'].isna(), matches],
        ['Value is NaN', 'Correct'],
        'Wrong Calculation',
    )
    return df[result_column]

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: EarliestPickupExpect. Calc.: DispatchedAt + 3 Min'] = check_earliest_pickup_expectation(df)

##### Pickup 1.ETA

In [232]:
# check if pickup_first_eta was calculated correctly, respectively minimum > created_at + 3 minutes 
def check_pickup_first_eta(df):
    """Check that ``pickup_first_eta`` is late enough relative to ``created_at``.

    Rows are labelled 'Correct' when ``pickup_first_eta`` is at or after
    ``created_at`` plus the threshold below, 'Not Minimmum created at + 3 min'
    when it is earlier and 'Value is NaN' when ``pickup_first_eta`` is missing.

    Writes the labels to ``df['Error: PickupFirstEta 3 Minute Range']`` in
    place and returns that column.
    """
    # NOTE(review): the threshold is 3.1 minutes although the label and column
    # name say 3 minutes - confirm which one is intended.
    earliest_allowed = df['created_at'] + pd.Timedelta(minutes=3.1)
    late_enough = pd.to_datetime(df['pickup_first_eta']) >= earliest_allowed

    df['Error: PickupFirstEta 3 Minute Range'] = np.select(
        [df['pickup_first_eta'].isna(), late_enough],
        ['Value is NaN', 'Correct'],
        'Not Minimmum created at + 3 min',
    )
    return df['Error: PickupFirstEta 3 Minute Range']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: PickupFirstEta 3 Minute Range'] = check_pickup_first_eta(df)

In [233]:
def convert(row):
    """Format a number of seconds as an ``H:MM:SS`` string.

    The value is first reduced modulo 24 hours; fractional seconds are cut off
    by the ``%d`` formatting.
    """
    seconds_in_day = row % (24 * 3600)
    hour, remainder = divmod(seconds_in_day, 3600)
    minutes, seconds = divmod(remainder, 60)
    return "%d:%02d:%02d" % (hour, minutes, seconds)

##### Dropoff 1. ETA

In [234]:
# check if dropff_first was calculated correctly
def check_dropoff_first_eta(df):
    """Verify ``dropoff_first_eta`` against pickup_first_eta + shortest_ridetime.

    ``shortest_ridetime`` is read as seconds, reduced modulo 24 hours and cut
    to whole seconds before it is added to ``pickup_first_eta``. Rows are
    labelled 'Correct' when the sum equals ``dropoff_first_eta``,
    'Wrong Calculation' when it does not and 'Value is NaN' when
    ``dropoff_first_eta`` is missing.

    ``df['shortest_ridetime']`` itself is left untouched. Writes the labels to
    ``df['Error: DropoffFirstEta Calc.: PickupFirstEta + ShortestRideTime']``
    in place and returns that column.
    """
    # Same whole-second value the earlier "H:MM:SS" text round-trip produced,
    # but computed locally: overwriting df.shortest_ridetime with text broke
    # the later delay / longer-route-factor checks (which do arithmetic on the
    # column) and made this cell fail when run a second time.
    # NOTE(review): cutting to whole seconds means a dropoff_first_eta with a
    # sub-second part can never match - confirm this is intended.
    shortest_seconds = np.floor(df['shortest_ridetime'] % (24 * 3600))
    expected_dropoff = df['pickup_first_eta'] + pd.to_timedelta(shortest_seconds, unit='s')

    result_column = 'Error: DropoffFirstEta Calc.: PickupFirstEta + ShortestRideTime'
    df[result_column] = np.select(
        [df['dropoff_first_eta'].isna(), df['dropoff_first_eta'] == expected_dropoff],
        ['Value is NaN', 'Correct'],
        'Wrong Calculation',
    )
    return df[result_column]

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: DropoffFirstEta Calc.: PickupFirstEta + ShortestRideTime'] = check_dropoff_first_eta(df)

##### Arrival Deviation

In [235]:
# check if arrival deviaion was calculated correctly 
def check_arrival_deviation(df):
    """Verify ``arrival_deviation`` against (vehicle_arrived_at - arriving_push) - 3 min.

    The time between ``arriving_push`` and ``vehicle_arrived_at`` is rounded to
    whole seconds and reduced by 180; the result is stored in the helper column
    ``df['arrival_deviation2']``, which is left on ``df``. Rows are labelled
    'Correct' when ``arrival_deviation`` equals it, 'Wrong Calculation' when it
    differs and 'Value is NaN' when ``arrival_deviation`` is missing.

    Writes the labels to
    ``df['Error: ArrivalDeviation Calc.: (ArrivedAt - ArrivingPush) - 3 Min']``
    in place and returns that column.
    """
    # Column-wise timedelta arithmetic instead of a Python-level row loop.
    push_to_arrival = (df['vehicle_arrived_at'] - df['arriving_push']).dt.round('s')
    df['arrival_deviation2'] = push_to_arrival.dt.total_seconds() - 180

    result_column = 'Error: ArrivalDeviation Calc.: (ArrivedAt - ArrivingPush) - 3 Min'
    df[result_column] = np.select(
        [df['arrival_deviation'].isna(), df['arrival_deviation'] == df['arrival_deviation2']],
        ['Value is NaN', 'Correct'],
        'Wrong Calculation',
    )
    return df[result_column]

# Run the check and store its labels (the function also writes this column on
# df in place).
df['Error: ArrivalDeviation Calc.: (ArrivedAt - ArrivingPush) - 3 Min'] = check_arrival_deviation(df)
# Remove the helper column again. `columns=` replaces the positional axis
# argument, which is deprecated and rejected by pandas 2.x.
df = df.drop(columns='arrival_deviation2')

  df = df.drop('arrival_deviation2', 1)


##### Waiting time 

In [236]:
# check if WaitingTime was calculated correctly
def check_waiting_time(df):
    """Verify ``waiting_time`` against vehicle_arrived_at - earliest_pickup_expectation.

    The difference (via ``.dt.seconds``) is stored in the helper column
    ``df['waiting_time2']``, which is left on ``df``. Rows are labelled
    'Correct' when ``waiting_time`` equals it, 'Wrong Calculation' when it
    differs and 'Value is NaN' when ``waiting_time`` is missing.

    Writes the labels to
    ``df['Error: WaitingTime Calc.: ArrivedAt - EarliestPickupExpect.']`` in
    place and returns that column.
    """
    arrival_gap = df['vehicle_arrived_at'] - df['earliest_pickup_expectation']
    # NOTE(review): .dt.seconds is only the seconds component of the timedelta
    # (whole days are ignored, negative gaps wrap) - confirm this is intended.
    df['waiting_time2'] = arrival_gap.dt.seconds

    result_column = 'Error: WaitingTime Calc.: ArrivedAt - EarliestPickupExpect.'
    df[result_column] = np.select(
        [df['waiting_time'].isna(), df['waiting_time'] == df['waiting_time2']],
        ['Value is NaN', 'Correct'],
        'Wrong Calculation',
    )
    return df[result_column]

# Run the check and store its labels (the function also writes this column on
# df in place).
df['Error: WaitingTime Calc.: ArrivedAt - EarliestPickupExpect.'] = check_waiting_time(df)
# Remove the helper column again. `columns=` replaces the positional axis
# argument, which is deprecated and rejected by pandas 2.x.
df = df.drop(columns='waiting_time2')

  df = df.drop('waiting_time2', 1)


##### Boarding time

In [237]:
# check if boarding time was calculated correctly
def check_boarding_time(df):
    """Verify ``boarding_time`` against pickup_at - vehicle_arrived_at.

    The difference (via ``.dt.seconds``) is stored in the helper column
    ``df['boarding_time2']``, which is left on ``df``. Rows are labelled
    'Correct' when ``boarding_time`` equals it, 'Wrong Calculation' when it
    differs and 'Value is NaN' when ``boarding_time`` is missing.

    Writes the labels to
    ``df['Error: BoardingTime Calc.: PickupAt - ArrivedAt']`` in place and
    returns that column.
    """
    boarding_gap = df['pickup_at'] - df['vehicle_arrived_at']
    # NOTE(review): .dt.seconds is only the seconds component of the timedelta
    # (whole days are ignored, negative gaps wrap) - confirm this is intended.
    df['boarding_time2'] = boarding_gap.dt.seconds

    result_column = 'Error: BoardingTime Calc.: PickupAt - ArrivedAt'
    df[result_column] = np.select(
        [df['boarding_time'].isna(), df['boarding_time'] == df['boarding_time2']],
        ['Value is NaN', 'Correct'],
        'Wrong Calculation',
    )
    return df[result_column]

# Run the check and store its labels (the function also writes this column on
# df in place).
df['Error: BoardingTime Calc.: PickupAt - ArrivedAt'] = check_boarding_time(df)
# Remove the helper column again. `columns=` replaces the positional axis
# argument, which is deprecated and rejected by pandas 2.x.
df = df.drop(columns='boarding_time2')

  df = df.drop('boarding_time2', 1)


##### Ride time

In [238]:
# check if riding time was calculated correctly
def check_riding_time(df):
    """Verify ``ride_time`` against dropoff_at - pickup_at.

    The difference (via ``.dt.seconds``) is stored in the helper column
    ``df['ride_time2']``, which is left on ``df``. Rows are labelled 'Correct'
    when ``ride_time`` equals it, 'Wrong Calculation' when it differs and
    'Value is NaN' when ``ride_time`` is missing.

    Writes the labels to
    ``df['Error: RidingTime Calc.: DropoffAt - PickupAt']`` in place and
    returns that column.
    """
    ride_gap = df['dropoff_at'] - df['pickup_at']
    # NOTE(review): .dt.seconds is only the seconds component of the timedelta
    # (whole days are ignored, negative gaps wrap) - confirm this is intended.
    df['ride_time2'] = ride_gap.dt.seconds

    result_column = 'Error: RidingTime Calc.: DropoffAt - PickupAt'
    df[result_column] = np.select(
        [df['ride_time'].isna(), df['ride_time'] == df['ride_time2']],
        ['Value is NaN', 'Correct'],
        'Wrong Calculation',
    )
    return df[result_column]

# Run the check and store its labels (the function also writes this column on
# df in place).
df['Error: RidingTime Calc.: DropoffAt - PickupAt'] = check_riding_time(df)
# Remove the helper column again. `columns=` replaces the positional axis
# argument, which is deprecated and rejected by pandas 2.x.
df = df.drop(columns='ride_time2')


  df = df.drop('ride_time2', 1)


##### Trip time

In [239]:
# Helper columns for the trip-time check defined below.

# Attribute: 'waiting_time'
df['waiting_time2'] = (df['vehicle_arrived_at'] - df['earliest_pickup_expectation']).dt.seconds

# Attribute: 'ride_time'
df['ride_time2'] = (df['dropoff_at'] - df['pickup_at']).dt.seconds

# Attribute: 'trip_time' - plain column addition instead of a row-wise apply
df["trip_time2"] = df["ride_time2"] + df["waiting_time2"]

# check if trip time was calculated correctly
def check_trip_time(df):
    """Verify ``trip_time`` against the precomputed ``trip_time2`` column.

    ``df['trip_time2']`` must already exist when this is called. Rows are
    labelled 'Correct' when ``trip_time`` equals it, 'Wrong Calculation' when
    it differs and 'Value is NaN' when ``trip_time`` is missing.

    Writes the labels to
    ``df['Error: TripTime Calc.: RideTime + WaitingTime']`` in place and
    returns that column.
    """
    result_column = 'Error: TripTime Calc.: RideTime + WaitingTime'
    df[result_column] = np.select(
        [df['trip_time'].isna(), df['trip_time'] == df['trip_time2']],
        ['Value is NaN', 'Correct'],
        'Wrong Calculation',
    )
    return df[result_column]

# Run the check and store its labels (the function also writes this column on
# df in place).
df['Error: TripTime Calc.: RideTime + WaitingTime'] = check_trip_time(df)
# Remove the intermediate helper columns; 'trip_time2' stays because the delay
# check further down still reads it. `columns=` replaces the positional axis
# argument, which is deprecated and rejected by pandas 2.x.
df = df.drop(columns=['waiting_time2', 'ride_time2'])

  df = df.drop('trip_time3', 1)
  df = df.drop('waiting_time2', 1)
  df = df.drop('ride_time2', 1)


##### Delay

In [240]:
# check if delaytime was calculated correctly
def check_delay_time(df):
    """Verify ``delay`` against trip_time2 - shortest_ridetime (+- 2 s tolerance).

    ``df['trip_time2']`` must already exist when this is called. The expected
    delay is computed for rows whose ``state`` is 'completed' (NaN otherwise),
    rounded to two decimals and stored in the helper column ``df['delay2']``,
    which is left on ``df``. Rows are labelled 'Correct' when ``delay`` lies
    strictly within 2 seconds of it, 'Wrong Calculation' when it does not and
    'Value is NaN' when ``delay`` is missing.

    ``df['delay']`` itself is left untouched. Writes the labels to
    ``df['Error: DelayTime Calc.: TripTime - ShortestRideTime (+- 2 seconds of
    rounding tolerance)']`` in place and returns that column.
    """
    shortest = df['shortest_ridetime']
    if not pd.api.types.is_numeric_dtype(shortest):
        # An earlier cell may have rewritten the column as "H:MM:SS" text;
        # turn it back into seconds so the subtraction below works.
        shortest = pd.to_timedelta(shortest).dt.total_seconds()

    # `delay` is compared as the number it is: turning it into "H:MM:SS" text
    # first made the numeric tolerance comparison below impossible.
    completed = df['state'] == 'completed'
    df['delay2'] = (df['trip_time2'] - shortest).round(2).where(completed)

    result_column = 'Error: DelayTime Calc.: TripTime - ShortestRideTime (+- 2 seconds of rounding tolerance)'
    within_tolerance = (df['delay'] > df['delay2'] - 2) & (df['delay'] < df['delay2'] + 2)
    df[result_column] = np.select(
        [df['delay'].isna(), within_tolerance],
        ['Value is NaN', 'Correct'],
        'Wrong Calculation',
    )
    return df[result_column]

# Run the check and store its labels (the function also writes this column on
# df in place).
df['Error: DelayTime Calc.: TripTime - ShortestRideTime (+- 2 seconds of rounding tolerance)'] = check_delay_time(df)
# Remove the helper columns again. `columns=` replaces the positional axis
# argument, which is deprecated and rejected by pandas 2.x.
df = df.drop(columns=['trip_time2', 'delay2'])

  df = df.drop('shortest_ridetime_s', 1)
  df = df.drop('trip_time2', 1)
  df = df.drop('delay2', 1)


##### Longer route factor 

In [241]:
# check if longer route factor was calculated correctly
def check_lrf_calculation(df):
    """Verify ``longer_route_factor`` against ride time / shortest_ridetime.

    The ride time (dropoff_at - pickup_at via ``.dt.seconds``) is stored in the
    helper column ``df['ride_time_s']``. The expected factor is computed for
    rows whose ``state`` is 'completed' and whose ``shortest_ridetime`` is not
    0 (NaN otherwise), rounded to two decimals and stored in
    ``df['longer_route_factor2']``. Both helper columns are left on ``df``.

    Rows are labelled 'Correct' when ``longer_route_factor`` lies strictly
    within 0.1 of the expected factor, 'Wrong Calculation' when it does not
    and 'Value is NaN' when ``longer_route_factor`` is missing.

    Writes the labels to ``df['Error: LongerRouteFactor Calc.']`` in place and
    returns that column.
    """
    df['ride_time_s'] = (df['dropoff_at'] - df['pickup_at']).dt.seconds

    shortest = df['shortest_ridetime']
    if not pd.api.types.is_numeric_dtype(shortest):
        # An earlier cell may have rewritten the column as "H:MM:SS" text;
        # turn it back into seconds so the division below works.
        shortest = pd.to_timedelta(shortest).dt.total_seconds()

    # Masking zeros in the divisor keeps those rows NaN without dividing by 0.
    ratio = df['ride_time_s'] / shortest.where(shortest != 0)
    df['longer_route_factor2'] = ratio.round(2).where(df['state'] == 'completed')

    within_tolerance = (
        (df['longer_route_factor'] > df['longer_route_factor2'] - .1)
        & (df['longer_route_factor'] < df['longer_route_factor2'] + .1)
    )
    df['Error: LongerRouteFactor Calc.'] = np.select(
        [df['longer_route_factor'].isna(), within_tolerance],
        ['Value is NaN', 'Correct'],
        'Wrong Calculation',
    )
    return df['Error: LongerRouteFactor Calc.']

# Run the check and store its labels (the function also writes this column on
# df in place).
df['Error: LongerRouteFactor Calc.'] = check_lrf_calculation(df)
# Remove the helper columns again. 'shortest_ridetime_s' is not created by any
# cell shown in this notebook, so missing columns are ignored instead of
# raising a KeyError. `columns=` replaces the positional axis argument, which
# is deprecated and rejected by pandas 2.x.
df = df.drop(
    columns=['shortest_ridetime_s', 'ride_time_s', 'longer_route_factor2'],
    errors='ignore',
)

  df = df.drop('shortest_ridetime_s', 1)
  df = df.drop('ride_time_s', 1)
  df = df.drop('longer_route_factor2', 1)


### States checking 

##### Canceled 

In [242]:
# check if if there is a pickup_at time although the ride was canceled
def check_canceled_state(df):
    """Flag canceled rides that nevertheless carry a ``pickup_at`` timestamp.

    Such rows are labelled 'Canceled but picked up', all others 'Correct'.

    Writes the labels to ``df['Error: State canceled']`` in place and returns
    that column.
    """
    canceled = df['state'] == 'canceled'
    picked_up = df['pickup_at'].notna()
    df['Error: State canceled'] = np.where(
        canceled & picked_up, 'Canceled but picked up', 'Correct'
    )
    return df['Error: State canceled']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: State canceled'] = check_canceled_state(df)

##### Completed

In [243]:
# Check for completed rides that have no vehicle_arrived_at timestamp.
# NOTE: two later cells define functions with this same name; each definition
# replaces the previous one in the kernel.
def check_completed_state(df):
    """Flag completed rides whose ``vehicle_arrived_at`` is missing.

    Such rows are labelled 'Completed but non vehicle arrived', all others
    'Correct'.

    Writes the labels to ``df['Error: Completed state w/o vehicle arrived']``
    in place and returns that column.
    """
    completed = df['state'] == 'completed'
    no_arrival = df['vehicle_arrived_at'].isna()
    df['Error: Completed state w/o vehicle arrived'] = np.where(
        completed & no_arrival, 'Completed but non vehicle arrived', 'Correct'
    )
    return df['Error: Completed state w/o vehicle arrived']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: Completed state w/o vehicle arrived'] = check_completed_state(df)

In [244]:
# Check for completed rides that have no pickup_at timestamp.
# NOTE: this re-uses the name check_completed_state and therefore replaces the
# definition from the previous cell in the kernel.
def check_completed_state(df):
    """Flag completed rides whose ``pickup_at`` is missing.

    Such rows are labelled 'Completed but non pickup_at', all others
    'Correct'.

    Writes the labels to ``df['Error: Completed state w/o pickup_at']`` in
    place and returns that column.
    """
    completed = df['state'] == 'completed'
    no_pickup = df['pickup_at'].isna()
    df['Error: Completed state w/o pickup_at'] = np.where(
        completed & no_pickup, 'Completed but non pickup_at', 'Correct'
    )
    return df['Error: Completed state w/o pickup_at']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: Completed state w/o pickup_at'] = check_completed_state(df)

In [245]:
# Check for completed rides that have no dropoff_at timestamp.
# NOTE: this re-uses the name check_completed_state and therefore replaces the
# definition from the previous cell in the kernel.
def check_completed_state(df):
    """Flag completed rides whose ``dropoff_at`` is missing.

    Such rows are labelled 'Completed but non dropoff_at', all others
    'Correct'.

    Writes the labels to ``df['Error: Completed state w/o dropoff_at']`` in
    place and returns that column.
    """
    completed = df['state'] == 'completed'
    no_dropoff = df['dropoff_at'].isna()
    df['Error: Completed state w/o dropoff_at'] = np.where(
        completed & no_dropoff, 'Completed but non dropoff_at', 'Correct'
    )
    return df['Error: Completed state w/o dropoff_at']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: Completed state w/o dropoff_at'] = check_completed_state(df)

##### Offer

In [246]:
# check for presences/absence of timestamps for status offer
def check_offer_timestamps(df):
    """Check which timestamps are present or absent on rides in state 'offer'.

    Each rule names a column, whether an offer must have it (True) or must not
    have it (False), and the label assigned when the rule is broken. Rules are
    evaluated in order and the first broken one decides the label; rows in any
    other state, and offers that break no rule, are labelled 'Correct'.

    Writes the labels to ``df['Error: Offer Timestamps']`` in place and
    returns that column.
    """
    rules = [
        ('created_at', True, 'created_at missing'),
        ('dispatched_at', True, 'dispatched_at missing'),
        ('arriving_push', False, 'arriving_push present'),
        ('vehicle_arrived_at', False, 'vehicle_arrived_at present'),
        ('earliest_pickup_expectation', True, 'earliest_pickup_expectation missing'),
        ('pickup_first_eta', False, 'pickup_first_eta present'),
        ('pickup_eta', False, 'pickup_eta present'),
        ('pickup_at', False, 'pickup_at present'),
        ('dropoff_first_eta', False, 'dropoff_first_eta present'),
        ('dropoff_eta', False, 'dropoff_eta present'),
        ('dropoff_at', False, 'dropoff_at present'),
        ('updated_at', True, 'updated_at missing'),
    ]
    is_offer = df['state'] == 'offer'
    conditions = [
        is_offer & (df[column].isna() if required else df[column].notna())
        for column, required, _ in rules
    ]
    values = [label for _, _, label in rules]

    df['Error: Offer Timestamps'] = np.select(conditions, values, 'Correct')
    return df['Error: Offer Timestamps']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: Offer Timestamps'] = check_offer_timestamps(df)


### Price Operations 

In [247]:
# check if non-free ride but price was not payed although there is a arriving push
def check_not_payed(df):
    """Flag non-free rides with an arriving push for which nothing was paid.

    A row is labelled 'Not Paid' when ``arriving_push`` is present,
    ``price_payed`` equals 0 and ``free_ride`` equals 0; all other rows are
    labelled 'Correct'.

    Writes the labels to ``df['Error: Price not payed although non-free']`` in
    place and returns that column.
    """
    push_sent = df['arriving_push'].notna()
    nothing_paid = df['price_payed'] == 0
    not_free = df['free_ride'] == 0
    df['Error: Price not payed although non-free'] = np.where(
        push_sent & nothing_paid & not_free, 'Not Paid', 'Correct'
    )
    return df['Error: Price not payed although non-free']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: Price not payed although non-free'] = check_not_payed(df)

In [248]:
# check if it was a completed and free ride but price was payed
def check_payed_but_free(df):
    """Flag completed free rides for which a payment was recorded.

    A row is labelled 'Paid' when ``state`` is 'completed', ``free_ride``
    equals 1 and ``price_payed`` holds a value other than 0; all other rows,
    including those with a missing ``price_payed``, are labelled 'Correct'.

    Writes the labels to ``df['Error: Price payed although free']`` in place
    and returns that column.
    """
    completed = df['state'] == 'completed'
    free = df['free_ride'] == 1
    # NaN != 0 evaluates to True, so without the notna() guard every free ride
    # without a recorded payment would be reported as 'Paid'.
    paid = df['price_payed'].notna() & (df['price_payed'] != 0)

    df['Error: Price payed although free'] = np.where(
        completed & paid & free, 'Paid', 'Correct'
    )
    return df['Error: Price payed although free']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: Price payed although free'] = check_payed_but_free(df)

### Ride ID

In [249]:
# check for duplicates in ride id for status other than offer/offer_rejected
def check_duplicate_ride(df):
    """Flag rides whose non-missing ``id`` occurs more than once.

    Every occurrence of a repeated id is labelled 'Duplicate Ride id'; rows
    with a unique or missing id are labelled 'Correct'. The ride ``state`` is
    not consulted here.

    Writes the labels to ``df['Error: Duplicate Ride id']`` in place and
    returns that column.
    """
    ride_id = df['id']
    repeated = ride_id.duplicated(keep=False)  # keep=False marks all occurrences
    has_id = ride_id.notna()
    df['Error: Duplicate Ride id'] = np.where(
        repeated & has_id, 'Duplicate Ride id', 'Correct'
    )
    return df['Error: Duplicate Ride id']
    
# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: Duplicate Ride id'] = check_duplicate_ride(df)


In [250]:
# check if a ride id was given for status offer or offer-rejeceted
def check_offer_ride_id(df):
    """Flag rides in state 'offer' or 'offer-rejected' that carry a ride ``id``.

    Such rows are labelled 'Ride id in Offer', all others 'Correct'.

    Writes the labels to ``df['Error: Offer w Ride id']`` in place and returns
    that column.
    """
    # NOTE(review): the state is matched as 'offer-rejected' (with a hyphen);
    # confirm this is the spelling used in the data.
    offer_like = df['state'].isin(['offer', 'offer-rejected'])
    has_id = df['id'].notna()
    df['Error: Offer w Ride id'] = np.where(
        offer_like & has_id, 'Ride id in Offer', 'Correct'
    )
    return df['Error: Offer w Ride id']

# Run the check and store its labels. The function has already written this
# column on df in place, so the assignment re-binds the same values.
df['Error: Offer w Ride id'] = check_offer_ride_id(df)


In [251]:
# Drop columns that contain no values at all, for a better overview.
df = df.dropna(how='all', axis=1)
# Skip the first remaining column (an automatically generated one).
# NOTE(review): the loaded sheet shows several 'Unnamed: 0*' columns; confirm
# that slicing off exactly one leading column is what is wanted here.
df = df.iloc[:, 1:]

In [252]:
# Write the rides together with all error columns to an Excel file in the
# current working directory.
df.to_excel("simulation_check2.xlsx")

In [None]:
# TODO: for each possible flag, compare the distribution between simulated and original rides in terms of frequency and, where applicable, magnitude