In [1]:
import pandas as pd
import numpy as np
from pyrsistent import v
from numpy import NaN
from datetime import datetime as dt
from datetime import timedelta

In [2]:
# Load rides_combined.csv - change the path to your own location
df = pd.read_csv(filepath_or_buffer="/Users/ericchittka/rides_combined.csv")
# Show the dtype pandas inferred for each column of the dataframe
df.dtypes

Unnamed: 0                       int64
id                              object
user_id                         object
distance                         int64
number_of_passenger            float64
price_operations               float64
price_offer                    float64
price_payed                    float64
free_ride                      float64
payment_type                    object
pickup_address                  object
dropoff_address                 object
state                           object
created_from_offer             float64
created_at                      object
scheduled_to                    object
dispatched_at                   object
pickup_arrival_time             object
arriving_push                   object
vehicle_arrived_at              object
earliest_pickup_expectation     object
pickup_first_eta                object
pickup_eta                      object
pickup_at                       object
dropoff_first_eta               object
dropoff_eta              

In [3]:
# Parse every full-timestamp column into datetime objects (fixed input format)
for timestamp_column in [
    'created_at',
    'scheduled_to',
    'dispatched_at',
    'canceled_at',
    'arriving_push',
    'vehicle_arrived_at',
    'earliest_pickup_expectation',
    'pickup_first_eta',
    'pickup_eta',
    'pickup_at',
    'dropoff_first_eta',
    'dropoff_eta',
    'dropoff_at',
    'updated_at',
]:
    df[timestamp_column] = pd.to_datetime(df[timestamp_column], format='%Y/%m/%d %H:%M:%S')

# Parse the remaining time columns (format inferred by pandas) and keep
# only their HH:MM:SS part as a string
for clock_column in [
    'pickup_arrival_time',
    'waiting_time',
    'boarding_time',
    'ride_time',
    'trip_time',
    'delay',
]:
    df[clock_column] = pd.to_datetime(df[clock_column])
    df[clock_column] = df[clock_column].dt.strftime('%H:%M:%S')

### Timestamp Ordering 

In [4]:
# check that created_at is not later than any other timestamp of the ride
def check_created_ordering(df):
    """Flag rows whose ``created_at`` is later than another ride timestamp.

    ``created_at`` is compared against each column listed below, in that
    order. The code of the first comparison that holds is stored; rows for
    which none holds get '0'. The result is written to
    ``df['Error: Created ordering']`` and that column is returned.
    """
    # error code -> column that created_at is compared against
    compared_columns = {
        'C1': 'scheduled_to',
        'C2': 'dispatched_at',
        'C3': 'arriving_push',
        'C4': 'vehicle_arrived_at',
        'C5': 'earliest_pickup_expectation',
        'C6': 'pickup_first_eta',
        'C7': 'pickup_eta',
        'C8': 'pickup_at',
        'C9': 'dropoff_first_eta',
        'C10': 'dropoff_eta',
        'C11': 'dropoff_at',
        'C12': 'updated_at',
        'C13': 'canceled_at',
    }
    created = df['created_at']
    violations = [created > df[column] for column in compared_columns.values()]

    # np.select keeps only the first matching code per row
    df['Error: Created ordering'] = np.select(violations, list(compared_columns), '0')
    return df['Error: Created ordering']

# Store the created_at ordering codes in the dataframe ('0' = no violation).
# check_created_ordering already writes this column in place; assigning its
# return value stores the same values again and keeps the cell explicit.
df['Error: Created ordering'] = check_created_ordering(df)


In [5]:
def check_scheduled_ordering(df):
    """Flag rows whose ``scheduled_to`` is out of order with other timestamps.

    'S1' marks ``scheduled_to`` earlier than ``dispatched_at``; 'S2'-'S10'
    mark ``scheduled_to`` later than the respective column. Only the first
    rule that holds is reported, '0' when none holds. The result is written
    to ``df['Error: Scheduled ordering']`` and that column is returned.
    """
    scheduled = df['scheduled_to']

    # error code -> boolean mask of the rows violating that rule
    rules = {
        'S1': scheduled < df['dispatched_at'],
        'S2': scheduled > df['vehicle_arrived_at'],
        'S3': scheduled > df['pickup_first_eta'],
        'S4': scheduled > df['pickup_eta'],
        'S5': scheduled > df['pickup_at'],
        'S6': scheduled > df['dropoff_first_eta'],
        'S7': scheduled > df['dropoff_eta'],
        'S8': scheduled > df['dropoff_at'],
        'S9': scheduled > df['updated_at'],
        'S10': scheduled > df['canceled_at'],
    }

    df['Error: Scheduled ordering'] = np.select(list(rules.values()), list(rules.keys()), '0')
    return df['Error: Scheduled ordering']

# Store the scheduled_to ordering codes in the dataframe ('0' = no violation).
# check_scheduled_ordering already writes this column in place; assigning its
# return value stores the same values again and keeps the cell explicit.
df['Error: Scheduled ordering'] = check_scheduled_ordering(df)

In [6]:
def check_dispatched_ordering(df):
    """Flag rows whose ``dispatched_at`` is later than a subsequent timestamp.

    Each (code, column) pair below stands for the rule
    ``dispatched_at > column``. The code of the first rule that holds is
    stored, '0' when none holds. The result is written to
    ``df['Error: Dispatched ordering']`` and that column is returned.
    """
    rules = [
        ('D1', 'arriving_push'),
        ('D2', 'vehicle_arrived_at'),
        ('D3', 'earliest_pickup_expectation'),
        ('D4', 'pickup_first_eta'),
        ('D5', 'pickup_eta'),
        ('D6', 'pickup_at'),
        ('D7', 'dropoff_first_eta'),
        ('D8', 'dropoff_eta'),
        ('D9', 'dropoff_at'),
        ('D10', 'updated_at'),
        ('D11', 'canceled_at'),
    ]
    dispatched = df['dispatched_at']
    masks = [dispatched > df[column] for _, column in rules]
    codes = [code for code, _ in rules]

    df['Error: Dispatched ordering'] = np.select(masks, codes, '0')
    return df['Error: Dispatched ordering']

# Store the dispatched_at ordering codes in the dataframe ('0' = no violation).
# check_dispatched_ordering already writes this column in place; assigning its
# return value stores the same values again and keeps the cell explicit.
df['Error: Dispatched ordering'] = check_dispatched_ordering(df)

In [7]:
def check_arriving_push_ordering(df):
    """Flag rows whose ``arriving_push`` is out of order with other timestamps.

    'A2' marks ``arriving_push`` earlier than ``earliest_pickup_expectation``;
    every other code marks ``arriving_push`` later than the respective
    column. Only the first rule that holds is reported, '0' when none holds.
    The result is written to ``df['Error: ArrivingPush ordering']`` and that
    column is returned.
    """
    push = df['arriving_push']

    # error code -> boolean mask of the rows violating that rule
    rules = {
        'A1': push > df['vehicle_arrived_at'],
        'A2': push < df['earliest_pickup_expectation'],
        'A3': push > df['pickup_first_eta'],
        'A4': push > df['pickup_eta'],
        'A5': push > df['pickup_at'],
        'A6': push > df['dropoff_first_eta'],
        'A7': push > df['dropoff_eta'],
        'A8': push > df['dropoff_at'],
        'A9': push > df['updated_at'],
        'A10': push > df['canceled_at'],
    }

    df['Error: ArrivingPush ordering'] = np.select(list(rules.values()), list(rules.keys()), '0')
    return df['Error: ArrivingPush ordering']

# Store the arriving_push ordering codes in the dataframe ('0' = no violation).
# check_arriving_push_ordering already writes this column in place; assigning
# its return value stores the same values again and keeps the cell explicit.
df['Error: ArrivingPush ordering'] = check_arriving_push_ordering(df)

In [8]:
def check_vehicle_arrived_ordering(df):
    """Flag rows whose ``vehicle_arrived_at`` is later than a subsequent timestamp.

    ``vehicle_arrived_at`` is compared against each column listed below, in
    that order. The code of the first comparison that holds is stored, '0'
    when none holds. The result is written to
    ``df['Error: VehicleArrived ordering']`` and that column is returned.
    """
    # error code -> column that vehicle_arrived_at is compared against
    compared_columns = {
        'V1': 'earliest_pickup_expectation',
        'V2': 'pickup_first_eta',
        'V3': 'pickup_eta',
        'V4': 'pickup_at',
        'V5': 'dropoff_first_eta',
        'V6': 'dropoff_eta',
        'V7': 'dropoff_at',
        'V8': 'updated_at',
        'V9': 'canceled_at',
    }
    arrived = df['vehicle_arrived_at']
    violations = [arrived > df[column] for column in compared_columns.values()]

    df['Error: VehicleArrived ordering'] = np.select(violations, list(compared_columns), '0')
    return df['Error: VehicleArrived ordering']

# Store the vehicle_arrived_at ordering codes in the dataframe ('0' = no violation).
# check_vehicle_arrived_ordering already writes this column in place; assigning
# its return value stores the same values again and keeps the cell explicit.
df['Error: VehicleArrived ordering'] = check_vehicle_arrived_ordering(df)

In [9]:
def check_eraliest_pickup_expectation_ordering(df):
    """Flag rows whose ``earliest_pickup_expectation`` is later than a subsequent timestamp.

    Each (code, column) pair below stands for the rule
    ``earliest_pickup_expectation > column``. The code of the first rule
    that holds is stored, '0' when none holds. The result is written to
    ``df['Error: EarliestPickupExpect ordering']`` and that column is
    returned.
    """
    rules = [
        ('E1', 'pickup_first_eta'),
        ('E2', 'pickup_eta'),
        ('E3', 'pickup_at'),
        ('E4', 'dropoff_first_eta'),
        ('E5', 'dropoff_eta'),
        ('E6', 'dropoff_at'),
        ('E7', 'updated_at'),
        ('E8', 'canceled_at'),
    ]
    earliest = df['earliest_pickup_expectation']
    masks = [earliest > df[column] for _, column in rules]
    codes = [code for code, _ in rules]

    df['Error: EarliestPickupExpect ordering'] = np.select(masks, codes, '0')
    return df['Error: EarliestPickupExpect ordering']

# Store the earliest_pickup_expectation ordering codes in the dataframe
# ('0' = no violation). The check function already writes this column in
# place; assigning its return value stores the same values again.
df['Error: EarliestPickupExpect ordering'] = check_eraliest_pickup_expectation_ordering(df)

In [10]:
def check_pickup_first_eta_ordering(df):
    """Flag rows whose ``pickup_first_eta`` is later than a subsequent timestamp.

    ``pickup_first_eta`` is compared against each column listed below, in
    that order. The code of the first comparison that holds is stored, '0'
    when none holds. The result is written to ``df['Error: PFE ordering']``
    and that column is returned.
    """
    # error code -> column that pickup_first_eta is compared against
    compared_columns = {
        'PFE1': 'pickup_eta',
        'PFE2': 'pickup_at',
        'PFE3': 'dropoff_first_eta',
        'PFE4': 'dropoff_eta',
        'PFE5': 'dropoff_at',
        'PFE6': 'updated_at',
        'PFE7': 'canceled_at',
    }
    first_eta = df['pickup_first_eta']
    violations = [first_eta > df[column] for column in compared_columns.values()]

    df['Error: PFE ordering'] = np.select(violations, list(compared_columns), '0')
    return df['Error: PFE ordering']

# Store the pickup_first_eta ordering codes in the dataframe ('0' = no violation).
# check_pickup_first_eta_ordering already writes this column in place; assigning
# its return value stores the same values again and keeps the cell explicit.
df['Error: PFE ordering'] = check_pickup_first_eta_ordering(df)

In [11]:
def check_pickup_eta_ordering(df):
    """Flag rows whose ``pickup_eta`` is later than a subsequent timestamp.

    Each (code, column) pair below stands for the rule
    ``pickup_eta > column``. The code of the first rule that holds is
    stored, '0' when none holds. The result is written to
    ``df['Error: PickupETA ordering']`` and that column is returned.
    """
    rules = [
        ('PE1', 'pickup_at'),
        ('PE2', 'dropoff_first_eta'),
        ('PE3', 'dropoff_eta'),
        ('PE4', 'dropoff_at'),
        ('PE5', 'updated_at'),
        ('PE6', 'canceled_at'),
    ]
    eta = df['pickup_eta']
    masks = [eta > df[column] for _, column in rules]
    codes = [code for code, _ in rules]

    df['Error: PickupETA ordering'] = np.select(masks, codes, '0')
    return df['Error: PickupETA ordering']

# Store the pickup_eta ordering codes in the dataframe ('0' = no violation).
# check_pickup_eta_ordering already writes this column in place; assigning its
# return value stores the same values again and keeps the cell explicit.
df['Error: PickupETA ordering'] = check_pickup_eta_ordering(df)

In [12]:
def check_pickup_ordering(df):
    """Flag rows whose ``pickup_at`` is later than a subsequent timestamp.

    ``pickup_at`` is compared against each column listed below, in that
    order. The code of the first comparison that holds is stored, '0' when
    none holds. The result is written to ``df['Error: Pickup ordering']``
    and that column is returned.
    """
    # error code -> column that pickup_at is compared against
    compared_columns = {
        'P1': 'dropoff_first_eta',
        'P2': 'dropoff_eta',
        'P3': 'dropoff_at',
        'P4': 'updated_at',
        'P5': 'canceled_at',
    }
    picked_up = df['pickup_at']
    violations = [picked_up > df[column] for column in compared_columns.values()]

    df['Error: Pickup ordering'] = np.select(violations, list(compared_columns), '0')
    return df['Error: Pickup ordering']

# Store the pickup_at ordering codes in the dataframe ('0' = no violation).
# check_pickup_ordering already writes this column in place; assigning its
# return value stores the same values again and keeps the cell explicit.
df['Error: Pickup ordering'] = check_pickup_ordering(df)

In [13]:
def check_dropoff_first_eta_ordering(df):
    """Flag rows whose ``dropoff_first_eta`` is later than a subsequent timestamp.

    Each (code, column) pair below stands for the rule
    ``dropoff_first_eta > column``. The code of the first rule that holds
    is stored, '0' when none holds. The result is written to
    ``df['Error: DFE ordering']`` and that column is returned.
    """
    rules = [
        ('DFE1', 'dropoff_eta'),
        ('DFE2', 'dropoff_at'),
        ('DFE3', 'updated_at'),
        ('DFE4', 'canceled_at'),
    ]
    first_eta = df['dropoff_first_eta']
    masks = [first_eta > df[column] for _, column in rules]
    codes = [code for code, _ in rules]

    df['Error: DFE ordering'] = np.select(masks, codes, '0')
    return df['Error: DFE ordering']

# Store the dropoff_first_eta ordering codes in the dataframe ('0' = no violation).
# check_dropoff_first_eta_ordering already writes this column in place; assigning
# its return value stores the same values again and keeps the cell explicit.
df['Error: DFE ordering'] = check_dropoff_first_eta_ordering(df)

In [14]:
def check_dropoff_eta_ordering(df):
    """Flag rows whose ``dropoff_eta`` is later than a subsequent timestamp.

    'DE1' marks ``dropoff_eta > dropoff_at``, 'DE2' ``dropoff_eta >
    updated_at`` and 'DE3' ``dropoff_eta > canceled_at``. Only the first
    rule that holds is reported, '0' when none holds. The result is written
    to ``df['Error: DropoffETA ordering']`` and that column is returned.
    """
    eta = df['dropoff_eta']
    after_dropoff = eta > df['dropoff_at']
    after_update = eta > df['updated_at']
    after_cancel = eta > df['canceled_at']

    error_codes = np.select(
        [after_dropoff, after_update, after_cancel],
        ['DE1', 'DE2', 'DE3'],
        '0',
    )
    df['Error: DropoffETA ordering'] = error_codes
    return df['Error: DropoffETA ordering']

# Store the dropoff_eta ordering codes in the dataframe ('0' = no violation).
# check_dropoff_eta_ordering already writes this column in place; assigning its
# return value stores the same values again and keeps the cell explicit.
df['Error: DropoffETA ordering'] = check_dropoff_eta_ordering(df)

In [15]:
def check_dropoff_ordering(df):
    """Flag rows whose ``dropoff_at`` is later than ``updated_at`` or ``canceled_at``.

    'DO1' marks ``dropoff_at > updated_at`` and 'DO2' marks
    ``dropoff_at > canceled_at``; when both hold only 'DO1' is reported,
    '0' when neither holds. The result is written to
    ``df['Error: Dropoff ordering']`` and that column is returned.
    """
    dropped_off = df['dropoff_at']
    after_update = dropped_off > df['updated_at']
    after_cancel = dropped_off > df['canceled_at']

    error_codes = np.select([after_update, after_cancel], ['DO1', 'DO2'], '0')
    df['Error: Dropoff ordering'] = error_codes
    return df['Error: Dropoff ordering']

# Store the dropoff_at ordering codes in the dataframe ('0' = no violation).
# check_dropoff_ordering already writes this column in place; assigning its
# return value stores the same values again and keeps the cell explicit.
df['Error: Dropoff ordering'] = check_dropoff_ordering(df)

In [16]:
def check_updated_ordering(df):
    """Flag rows whose ``updated_at`` is earlier than ``canceled_at``.

    Such rows get the code 'U1', all others '0'. The result is written to
    ``df['Error: Updated ordering']`` and that column is returned.
    """
    updated_before_cancel = df['updated_at'] < df['canceled_at']
    df['Error: Updated ordering'] = np.where(updated_before_cancel, 'U1', '0')
    return df['Error: Updated ordering']

# Store the updated_at ordering code in the dataframe ('0' = no violation).
# check_updated_ordering already writes this column in place; assigning its
# return value stores the same values again and keeps the cell explicit.
df['Error: Updated ordering'] = check_updated_ordering(df)

### Check time calculations

In [None]:
# check whether scheduled_to was calculated correctly
def check_scheduled_to(df, offset=timedelta(minutes=8)):
    """Flag rides whose ``scheduled_to`` is not ``dispatched_at`` plus ``offset``.

    The result is written to ``df['Errorcode: 1']`` and that column is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the datetime columns ``dispatched_at`` and
        ``scheduled_to``.
    offset : datetime.timedelta, optional
        Expected gap between ``dispatched_at`` and ``scheduled_to``.
        Defaults to 8 minutes, the value that used to be hard-coded.

    Returns
    -------
    pandas.Series
        'S1' where both timestamps are present and ``scheduled_to`` differs
        from ``dispatched_at + offset``; '0' otherwise.
    """
    expected_schedule = df['dispatched_at'] + offset

    # A comparison with NaT via != is always True, so without this guard every
    # ride with a missing dispatched_at or scheduled_to would be reported as a
    # miscalculation although there is nothing to compare.
    both_present = df['dispatched_at'].notna() & df['scheduled_to'].notna()
    mismatch = both_present & (expected_schedule != df['scheduled_to'])

    df['Errorcode: 1'] = np.where(mismatch, 'S1', '0')
    return df['Errorcode: 1']

# Store the scheduled_to calculation check in the dataframe ('0' = no error).
# check_scheduled_to already writes this column in place; assigning its
# return value stores the same values again and keeps the cell explicit.
df['Errorcode: 1'] = check_scheduled_to(df)

In [None]:
# Write the dataframe, including the error columns added above, to an Excel file.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
df.to_excel(r'/Users/ericchittka/Downloads/filtered_df.xlsx')