In [270]:
import pandas as pd
import numpy as np
from pyrsistent import v
from numpy import NaN
from datetime import datetime as dt
from datetime import timedelta

In [271]:
# Load rides_combined.csv - change the path to your location
df = pd.read_csv(filepath_or_buffer="/Users/ericchittka/rides_combined.csv")
# Display the dtype of every dataframe column
df.dtypes

Unnamed: 0                       int64
id                              object
user_id                         object
distance                         int64
number_of_passenger            float64
price_operations               float64
price_offer                    float64
price_payed                    float64
free_ride                      float64
payment_type                    object
pickup_address                  object
dropoff_address                 object
state                           object
created_from_offer             float64
created_at                      object
scheduled_to                    object
dispatched_at                   object
pickup_arrival_time             object
arriving_push                   object
vehicle_arrived_at              object
earliest_pickup_expectation     object
pickup_first_eta                object
pickup_eta                      object
pickup_at                       object
dropoff_first_eta               object
dropoff_eta              

In [272]:
# Convert all columns holding dates/times into datetime objects.
# The conversions are table-driven so that every column and the format it is
# parsed with are listed in exactly one place.
TIMESTAMP_FORMAT = '%Y/%m/%d %H:%M:%S'
TIME_ONLY_FORMAT = '%H:%M:%S'

# (column, format) pairs in conversion order; a format of None lets pandas
# infer the format, which is what pd.to_datetime does when none is passed.
# NOTE(review): waiting_time, ride_time, trip_time and delay look like
# durations; parsing time-only strings with pd.to_datetime anchors them to the
# date 1900-01-01. pd.to_timedelta may be the better fit - confirm before
# changing, the current behaviour is kept as is.
DATETIME_COLUMN_FORMATS = [
    ('created_at', TIMESTAMP_FORMAT),
    ('scheduled_to', TIMESTAMP_FORMAT),
    ('dispatched_at', TIMESTAMP_FORMAT),
    ('canceled_at', TIMESTAMP_FORMAT),
    ('arriving_push', TIMESTAMP_FORMAT),
    ('vehicle_arrived_at', TIMESTAMP_FORMAT),
    ('earliest_pickup_expectation', TIMESTAMP_FORMAT),
    ('pickup_first_eta', TIMESTAMP_FORMAT),
    ('pickup_eta', TIMESTAMP_FORMAT),
    ('pickup_at', TIMESTAMP_FORMAT),
    ('dropoff_first_eta', TIMESTAMP_FORMAT),
    ('dropoff_eta', TIMESTAMP_FORMAT),
    ('dropoff_at', TIMESTAMP_FORMAT),
    ('updated_at', TIMESTAMP_FORMAT),
    ('pickup_arrival_time', None),
    ('waiting_time', TIME_ONLY_FORMAT),
    ('boarding_time', None),
    ('ride_time', TIME_ONLY_FORMAT),
    ('trip_time', TIME_ONLY_FORMAT),
    ('delay', TIME_ONLY_FORMAT),
]

for date_column, date_format in DATETIME_COLUMN_FORMATS:
    df[date_column] = pd.to_datetime(df[date_column], format=date_format)

In [273]:
# check that created_at is not later than any other timestamp of the ride
def check_created_ordering(df):
    """Flag rows whose ``created_at`` is later than another ride timestamp.

    ``created_at`` is compared against each timestamp column listed below.
    A row receives the code of the first comparison (in listed order) for
    which ``created_at`` is strictly greater; rows without any such
    comparison receive ``'0'``. Comparisons against missing values (NaT)
    evaluate to False and therefore never produce a code.

    The codes are written to ``df['Errorcode: 0']`` in place, and that
    column is returned.
    """
    # error code -> column that created_at must not exceed; the order
    # decides which code wins when several comparisons are violated
    code_and_column = [
        ('C1', 'scheduled_to'),
        ('C2', 'dispatched_at'),
        ('C3', 'arriving_push'),
        ('C4', 'vehicle_arrived_at'),
        ('C5', 'earliest_pickup_expectation'),
        ('C6', 'pickup_first_eta'),
        ('C7', 'pickup_eta'),
        ('C8', 'pickup_at'),
        ('C9', 'dropoff_first_eta'),
        ('C10', 'dropoff_eta'),
        ('C11', 'dropoff_at'),
        ('C12', 'updated_at'),
        ('C13', 'canceled_at'),
    ]

    created = df['created_at']
    violations = [created > df[column] for _, column in code_and_column]
    codes = [code for code, _ in code_and_column]

    # np.select takes the code of the first violated comparison, else '0'
    df['Errorcode: 0'] = np.select(violations, codes, '0')
    return df['Errorcode: 0']

# store the creation-order error codes in the dataframe
df['Errorcode: 0'] = df.pipe(check_created_ordering)

In [284]:
# check whether scheduled_to equals dispatched_at plus 8 minutes
def check_scheduled_to(df, offset=timedelta(minutes=8)):
    """Flag rows whose ``scheduled_to`` differs from ``dispatched_at + offset``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the datetime columns ``dispatched_at`` and
        ``scheduled_to``. The column ``'Errorcode: 1'`` is written in place.
    offset : datetime.timedelta, optional
        Expected gap between ``dispatched_at`` and ``scheduled_to``.
        Defaults to 8 minutes, the value that was previously hard-coded.

    Returns
    -------
    pandas.Series
        The ``'Errorcode: 1'`` column: ``'S1'`` where the two values differ,
        ``'0'`` where they match.

    Notes
    -----
    A missing timestamp (NaT) never compares equal to anything, so rows in
    which either column is missing are flagged ``'S1'`` as well.
    """
    expected_schedule = df['dispatched_at'] + offset
    df['Errorcode: 1'] = np.where(
        expected_schedule != df['scheduled_to'],
        'S1',
        '0'
    )
    return df['Errorcode: 1']

# store the scheduled_to error codes in the dataframe
df['Errorcode: 1'] = df.pipe(check_scheduled_to)

In [282]:
# keep only the rows flagged with error code 'S1' and preview the first ones
scheduled_NaN = df.loc[df['Errorcode: 1'] == 'S1']
scheduled_NaN.head()

Unnamed: 0.1,Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,...,cancellation_comment,sheet_name,bahn_card_number,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,Errorcode: 0,Errorcode: 1
2,2,bb916271-0627-4196-8ec1-5324e0e1f71d,f07028da-ca7e-4713-9e45-743c71712e80,3040,1.0,3.45,1.55,1.55,0.0,VRN,...,,MTD,,,,NaT,,,0,S1
3,3,3cffa0f3-e278-4828-b0a1-f55cb35c1adb,44f61d06-8e79-42c6-9abd-0e85fcaf9d6d,7233,1.0,0.0,5.55,0.0,1.0,STANDARD,...,,MTD,,,,NaT,,,0,S1
4,4,,1a6d2ec4-7e85-4e5b-aed0-1c3693268986,3998,,,,,,STANDARD,...,,MTD,,,,NaT,,,0,S1
5,5,,2c172cdf-dd04-4613-9073-9517adef0138,8109,,,,,,STANDARD,...,,MTD,,,,NaT,,,0,S1
6,6,f797b4d6-2fd4-442a-aec3-32c7f34c9b3e,7b093b19-1bc2-4bc0-b9cb-fcb8cfb8f074,6511,1.0,5.2,3.1,3.1,0.0,VRN,...,,MTD,,,,NaT,,,0,S1


In [None]:
# Write the dataframe, including the error-code columns, to an Excel file.
# NOTE(review): the file is named "filtered_df" but the whole of df is
# written here - confirm whether a filtered frame was meant instead.
df.to_excel(excel_writer=r'/Users/ericchittka/Downloads/filtered_df.xlsx')