In [1]:
import time
import warnings
from datetime import datetime as dt
from re import M
import git
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

In [2]:
# Render every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

In [3]:
if __name__ == "__main__":
    # Resolve the repository root via git so the data paths work from any sub-directory.
    repo = git.Repo(".", search_parent_directories=True).git.rev_parse("--show-toplevel")

    # Combined rides export; the first CSV column is used as the index.
    rides_csv = f"{repo}/data/rides_combined.csv"
    df = pd.read_csv(rides_csv, index_col=0)

    # Stop master data, read from the "MoDstops" sheet of the workbook.
    stops_xlsx = f"{repo}/data/other/MoDstops+Preismodell.xlsx"
    df_stops = pd.read_excel(stops_xlsx, sheet_name="MoDstops")


In [4]:
# remove duplicate ids
#
# Rides that share a (non-null) id are collapsed as follows:
#   * rows without `scheduled_to` are dropped,
#   * rows with `scheduled_to` keep their values and have missing timestamp
#     columns filled from the next row *of the same id* (rows are sorted by
#     id, scheduled_to, so rows lacking `scheduled_to` come last per id).
#
# Compared with the previous row-by-row loop this version
#   * never borrows values from a different ride id,
#   * does not fail with a KeyError on the last duplicate row,
#   * avoids chained assignment (`frame[col][idx] = ...`), which does not
#     reliably write back to the frame,
#   * uses `pd.concat` instead of `DataFrame.append` (removed in pandas 2.0).
timestamp_columns = [
    "scheduled_to",
    "dispatched_at",
    "arriving_push",
    "vehicle_arrived_at",
    "earliest_pickup_expectation",
    "pickup_first_eta",
    "pickup_eta",
    "pickup_at",
    "dropoff_first_eta",
    "dropoff_eta",
    "dropoff_at",
]

duplicate_ids = df.loc[df.duplicated(subset=["id"]) & df["id"].notna(), "id"]
is_duplicate = df["id"].isin(duplicate_ids)

# reset_index() (without drop=True) keeps the former index as an "index"
# column, exactly as before, so the resulting column set is unchanged.
duplicates = df[is_duplicate].sort_values(["id", "scheduled_to"]).reset_index()
df = df[~is_duplicate]

# Decide which rows survive *before* filling, based on the original values.
has_schedule = duplicates["scheduled_to"].notna()

# Values of the following row within the same id group (NaN for the last row of a group).
next_same_id = duplicates.groupby("id")[timestamp_columns].shift(-1)
duplicates[timestamp_columns] = duplicates[timestamp_columns].fillna(next_same_id)
duplicates = duplicates[has_schedule]

df = pd.concat([df, duplicates], ignore_index=True)



In [5]:
# Format-Check
def check_format(df, col_type_dict):
    """Split ``df`` into rows that pass the per-column format checks and rows that do not.

    Columns are processed in the iteration order of ``col_type_dict``. After each
    column only the valid rows are carried forward, so a row is reported once,
    for the first column in which it fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to validate. The caller's frame is not modified.
    col_type_dict : dict
        Maps column name -> ``"timestamp"``, ``"time"`` or ``"numerical"``.
        Any other type value is ignored.

    Returns
    -------
    tuple
        ``(df_valid, df_inconsistencies)``. ``"numerical"`` columns of
        ``df_valid`` are converted to float. ``df_inconsistencies`` has a fresh
        0..n-1 index and, when nothing was rejected, is an empty frame with the
        input columns.

    Notes
    -----
    Missing values always count as valid. Values are compared via their string
    representation, so columns that contain no strings at all (e.g. an all-NaN
    float column) no longer raise from the ``.str`` accessor.
    """
    # "<1-4 digits><sep><1-2 digits><sep><1-2 digits> H:M:S". The separator is an
    # unescaped "." and therefore matches any character ("-", ".", "/", ...);
    # this is kept as-is so previously accepted values stay accepted.
    timestamp_pattern = (
        r"[0-9]{1,4}.[0-9]{1,2}.[0-9]{1,2} [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}"
    )
    # str.match anchors only at the start, so "H:M:S.ffffff" is accepted as well.
    time_pattern = r"[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}"

    original_columns = list(df.columns)
    rejected_frames = []

    for col, col_type in col_type_dict.items():
        if col_type not in ("timestamp", "time", "numerical"):
            continue

        values = df[col]
        missing = values.isna()
        text = values.astype(str)
        parsed_numbers = None

        if col_type == "timestamp":
            valid = text.str.match(timestamp_pattern, na=False) | missing
        elif col_type == "time":
            # Values carrying an 1899/1900 date prefix are tolerated here as well.
            valid = (
                text.str.match(time_pattern, na=False)
                | text.str.contains("1899", na=False, regex=False)
                | text.str.contains("1900", na=False, regex=False)
                | missing
            )
        else:
            # Literal "." removal (regex=False) keeps the digit test independent
            # of the pandas version's default for the regex parameter.
            digits_only = (
                text.str.replace(".", "", regex=False).str.isdigit().eq(True)
            )
            # Additionally require that the value really parses as a number, so
            # strings such as "1.2.3" are reported instead of crashing the float cast.
            parsed_numbers = pd.to_numeric(values, errors="coerce")
            valid = (digits_only & parsed_numbers.notna()) | missing

        valid = valid.astype(bool)
        rejected_frames.append(df[~valid])

        if col_type == "numerical":
            # Work on a copy so the float conversion never writes into a view.
            df = df[valid].copy()
            df[col] = parsed_numbers[valid].to_numpy(dtype=float)
        else:
            df = df[valid]

    non_empty = [frame for frame in rejected_frames if not frame.empty]
    if non_empty:
        df_inconsistencies = pd.concat(non_empty, axis=0, ignore_index=True)
    else:
        df_inconsistencies = pd.DataFrame(columns=original_columns)

    return (df, df_inconsistencies)


In [6]:
# Expected format per column, consumed by check_format():
#   "numerical" -> digits with optional "." (converted to float),
#   "timestamp" -> date followed by H:M:S,
#   "time"      -> H:M:S duration/time-of-day.
# check_format() walks this mapping in insertion order.
columns = {
        "distance": "numerical",
        "number_of_passenger": "numerical",
        "created_at": "timestamp",
        "scheduled_to": "timestamp",
        "dispatched_at": "timestamp",
        "pickup_arrival_time": "time",
        "arriving_push": "timestamp",
        "vehicle_arrived_at": "timestamp",
        "earliest_pickup_expectation": "timestamp",
        "pickup_first_eta": "timestamp",
        "pickup_eta": "timestamp",
        "pickup_at": "timestamp",
        "dropoff_first_eta": "timestamp",
        "dropoff_eta": "timestamp",
        "dropoff_at": "timestamp",
        "waiting_time": "time",
        "boarding_time": "time",
        "ride_time": "time",
        "trip_time": "time",
        "shortest_ridetime": "time",
        "delay": "time",
    }

In [7]:
# Validate the raw columns; rows that fail are removed from df and, if there are
# any, written to a timestamped Excel report for manual review.
df, df_inconsistencies = check_format(df, columns)
if not df_inconsistencies.empty:
    report_path = f"{repo}/data/cleaning/inconsistencies_{int(time.time())}.xlsx"
    df_inconsistencies.to_excel(report_path)

In [8]:
# clean free ride: boolean array, True exactly where the free_ride flag equals 1
free_ride = (df["free_ride"] == 1).to_numpy(dtype=bool)

In [9]:
# clean ride id
# Rides without an id fall back to the number of the offer they were created from.
# NOTE: the name `id` shadows the Python builtin; it is kept unchanged because
# other cells may refer to it.
id = pd.DataFrame(data=df.loc[:, "id"], columns=["id"])
# Assign the filled column back explicitly: calling fillna(inplace=True) on the
# `id.id` accessor is not guaranteed to write through to the frame.
id["id"] = id["id"].fillna(df["created_from_offer"].astype("Int64"))

In [10]:
# clean distance: discard rides whose pickup and dropoff address are identical
same_stop = df["pickup_address"] == df["dropoff_address"]
df = df[~same_stop]

In [11]:
# Attributes: ['pickup_address', 'dropoff_address']
def get_stop_id(address, df_stops):
    """Translate a ride address into the matching ``MoDStop Id``.

    Two address forms are supported:

    * ``"<lat>|<long>"`` (starts with a digit and contains ``"|"``): matched
      against the string form of the ``MoDStop Lat`` / ``MoDStop Long`` columns.
    * a stop name: matched against ``MoDStop Name`` after applying the known
      spelling differences between the rides table and the stop table.

    Parameters
    ----------
    address : str
        Address value from the rides table. Missing (non-string) or empty
        values are treated as unmatched.
    df_stops : pandas.DataFrame
        Stop table with the columns ``MoDStop Id``, ``MoDStop Name``,
        ``MoDStop Lat`` and ``MoDStop Long``.

    Returns
    -------
    The id of the first matching stop, or ``0`` if nothing matches.
    """
    # NaN / None / "" cannot be indexed or matched -> report as unmatched.
    if not isinstance(address, str) or not address:
        return 0

    if address[0].isdigit() and "|" in address:
        lat, long = address.split("|")[:2]
        stop_coordinates = zip(
            df_stops["MoDStop Id"], df_stops["MoDStop Lat"], df_stops["MoDStop Long"]
        )
        for stop_id, stop_lat, stop_long in stop_coordinates:
            # Coordinates are compared as text, exactly as written in the address.
            if str(stop_lat) == lat and str(stop_long) == long:
                return stop_id
        return 0

    # fix different namings between MoDStop table and rides table
    name_aliases = {
        "Rewe Mußbach": "Rewe Mußbach (Shoppenwiese)",
        "Lachener Straße": "Laachener Straße",
    }
    address = name_aliases.get(address, address)

    for stop_id, stop_name in zip(df_stops["MoDStop Id"], df_stops["MoDStop Name"]):
        if stop_name == address:
            return stop_id

    # Hard-coded id for a stop name that may be absent from the stop table;
    # applied only after the table lookup failed (also for an empty table).
    if address == "Würzmühle":
        return 11009
    return 0

In [12]:
def clean_addresses(df, df_stops):
    """Map the pickup and dropoff address of every ride to a stop id.

    Each address is resolved with ``get_stop_id``; unresolved addresses get the
    id ``0``. As a side effect, all rides with at least one unresolved address
    are written to a timestamped Excel file under ``data/cleaning`` of the
    git repository.

    Returns a frame indexed like ``df`` with the columns ``pickup_id`` and
    ``dropoff_id``.
    """
    addresses = pd.DataFrame(
        {
            "pickup_id": [
                get_stop_id(address, df_stops) for address in df["pickup_address"]
            ],
            "dropoff_id": [
                get_stop_id(address, df_stops) for address in df["dropoff_address"]
            ],
        },
        index=df.index,
        dtype=object,
    )

    # export list of unmatched addresses
    repo = git.Repo(".", search_parent_directories=True).git.rev_parse(
        "--show-toplevel"
    )
    file = f"{repo}/data/cleaning/unmatched_addresses_{int(time.time())}.xlsx"
    unmatched = (addresses["pickup_id"] == 0) | (addresses["dropoff_id"] == 0)
    df[unmatched].to_excel(file)

    return addresses

In [13]:
# Replace the textual pickup/dropoff addresses with their stop ids
# (columns are assigned by position: pickup_id -> pickup_address, dropoff_id -> dropoff_address).
stop_ids = clean_addresses(df, df_stops)
df[["pickup_address", "dropoff_address"]] = stop_ids

In [14]:
# clean created_at: parse the raw values into datetimes
created_at_parsed = pd.to_datetime(df["created_at"])
df["created_at"] = created_at_parsed

In [15]:
# clean scheduled_to
# Parse to datetime; rides without a schedule are treated as scheduled at creation time.
df["scheduled_to"] = pd.to_datetime(df["scheduled_to"]).fillna(df["created_at"])

# scheduled_to must not lie before created_at -> clamp it to created_at
scheduled_too_early = df["scheduled_to"] < df["created_at"]
df["scheduled_to"] = pd.to_datetime(
    df["scheduled_to"].mask(scheduled_too_early, df["created_at"])
)

In [16]:
# clean dispatched at
# Cast to correct dtype
df["dispatched_at"] = pd.to_datetime(df["dispatched_at"])
df["scheduled_to"] = pd.to_datetime(df["scheduled_to"])

completed = df["state"] == "completed"
# A ride counts as pre-booked when its schedule differs from its creation time.
prebooked = df["scheduled_to"] != df["created_at"]

# Completed pre-booked rides: dispatch 8 minutes before the schedule, but never
# earlier than the ride was created.
lead_dispatch = df["scheduled_to"] - pd.Timedelta(minutes=8)
prebooked_dispatch = np.where(
    lead_dispatch < df["created_at"], df["created_at"], lead_dispatch
)

# Completed immediate rides: dispatch at creation. Everything else keeps its value.
other_dispatch = np.where(
    completed & ~prebooked, df["created_at"], df["dispatched_at"]
)

df["dispatched_at"] = pd.to_datetime(
    np.where(completed & prebooked, prebooked_dispatch, other_dispatch)
)

In [17]:
# get the average pickup arrival time
#
# pickup_arrival_time is converted to whole seconds. Rows that are missing or
# larger than 3 hours (assumption) are treated as invalid.
#
# Fix: the previous version first replaced invalid rows by the string "-9" and
# then converted every row with a `len(row) == 8` filter, which turned "-9"
# into 0. The later `x != -9` exclusion therefore never applied and all
# invalid rows entered the average as zeros. The average is now computed from
# the valid rows only.
times = [3600, 60, 1]
arrival_clock = pd.to_datetime(df["pickup_arrival_time"])
arrival_seconds = (
    arrival_clock.dt.hour * times[0]
    + arrival_clock.dt.minute * times[1]
    + arrival_clock.dt.second * times[2]
)
valid_arrival = arrival_clock.notna() & (arrival_seconds <= 3 * 3600)

# Invalid rows are stored as 0 seconds, which is what this cell has always
# written to the column; only the average below changes.
df["pickup_arrival_time"] = arrival_seconds.where(valid_arrival, 0).astype(int)

valid_seconds = arrival_seconds[valid_arrival]
# Guard against an empty selection instead of dividing by zero.
avg_pickup_arrival_time = (
    int(round(float(valid_seconds.mean()))) if not valid_seconds.empty else 0
)

In [18]:
# clean vehicle_arrived_at
# Overview of this cell:
#   1. parse arriving_push, vehicle_arrived_at and pickup_at as datetimes,
#   2. estimate missing arrival times of completed rides,
#   3. replace implausible arrival times,
#   4. store the result, floored to whole seconds, in df['vehicle_arrived_at'].
df["arriving_push"] = pd.to_datetime(df["arriving_push"])
vehicle_arrived_at = pd.to_datetime(df['vehicle_arrived_at'])
df["pickup_at"] = pd.to_datetime(df["pickup_at"])
# Step 2: fill the NaN values of completed rides with dispatched_at plus the average
# pickup arrival time, since pickup_arrival_time = vehicle_arrived_at - dispatched_at
vehicle_arrived_at = np.where(
    (vehicle_arrived_at.isna()) & (df["state"] == "completed"),
    # use dispatched_at + average only if that is earlier than pickup_at (or pickup_at
    # is missing); otherwise take pickup_at itself
    np.where(
            (
                df["dispatched_at"] + pd.Timedelta(seconds=avg_pickup_arrival_time)
                < df["pickup_at"]
            )
            | (df["pickup_at"].isna() == True),
            df["dispatched_at"] + pd.Timedelta(seconds=avg_pickup_arrival_time),
            df["pickup_at"],
        ),
        # rows that already have a value, or are not completed, stay unchanged
        vehicle_arrived_at,
    )
# np.where returns a plain NumPy array; convert it back to pandas datetimes
vehicle_arrived_at = pd.to_datetime(vehicle_arrived_at)

# Step 3a: an arrival more than one day after scheduled_to is replaced by
# dispatched_at + average pickup arrival time
vehicle_arrived_at = np.where(
    vehicle_arrived_at - df['scheduled_to'] > pd.Timedelta(days=1),
    df["dispatched_at"] + pd.Timedelta(seconds=avg_pickup_arrival_time),
    # Step 3b: the arrival is also treated as implausible when it is before arriving_push,
    # more than 60 minutes before or after scheduled_to, or before dispatched_at
    np.where(
    (vehicle_arrived_at < df["arriving_push"])
    | (vehicle_arrived_at + pd.Timedelta(minutes=60) < df["scheduled_to"])
    | (vehicle_arrived_at - pd.Timedelta(minutes=60) > df["scheduled_to"])
    | (vehicle_arrived_at < df['dispatched_at']),
        # choose the replacement: without a usable arriving_push (missing or before the
        # dispatch) fall back to the dispatch-based estimate ...
        np.where(
            (df["arriving_push"].isna()) | (df["arriving_push"] < df['dispatched_at']),
            # ... i.e. dispatched_at + average, unless that would be at/after a valid
            # pickup_at, in which case pickup_at is used
            np.where(
                (
                    df["dispatched_at"] + pd.Timedelta(seconds=avg_pickup_arrival_time)
                    < df["pickup_at"]
                )
                | (df["pickup_at"].isna() == True)
                | (df['pickup_at'] < df['dispatched_at']),
                df["dispatched_at"] + pd.Timedelta(seconds=avg_pickup_arrival_time),
                df["pickup_at"],
            ),
            # arriving_push is the system's announcement that the vehicle reaches the pickup
            # in less than 3 minutes: use arriving_push + 3 min if that is still before
            # pickup_at, otherwise arriving_push itself
            np.where(
                ((df["arriving_push"] + pd.Timedelta(minutes=3)) < df["pickup_at"]),
                df["arriving_push"] + pd.Timedelta(minutes=3),
                df["arriving_push"],
        ),
        ),
        # plausible values are kept as they are
        vehicle_arrived_at,
    )
)
# Step 4: back to pandas datetimes, truncated to whole seconds
vehicle_arrived_at = pd.to_datetime(vehicle_arrived_at)
df['vehicle_arrived_at'] = vehicle_arrived_at.floor("s")

In [19]:
# Sanity check: completed rides whose vehicle arrival lies before the dispatch.
df[(df["vehicle_arrived_at"] < df["dispatched_at"]) & (df["state"] == "completed")]

Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,pickup_address,dropoff_address,state,created_from_offer,created_at,scheduled_to,dispatched_at,pickup_arrival_time,arriving_push,vehicle_arrived_at,earliest_pickup_expectation,pickup_first_eta,pickup_eta,pickup_at,dropoff_first_eta,dropoff_eta,dropoff_at,updated_at,arrival_deviation,waiting_time,boarding_time,ride_time,trip_time,shortest_ridetime,delay,longer_route_factor,arrival_indicator,rating,rating_puenktlichkeit,rating_sauberkeit,rating_fahrer,rating_find_modstop,rating_other_comments,cancellation_reason,cancellation_comment,bahn_card_number,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,index


In [20]:
# Attribute: 'arriving_push'
df["arriving_push"] = pd.to_datetime(df["arriving_push"])

# Default estimate: the push is sent 3 minutes before the vehicle arrives.
push_estimate = df["vehicle_arrived_at"] - pd.Timedelta(minutes=3)
arriving_push = df["arriving_push"].fillna(push_estimate)

# Check ordering: a push more than 0.8 days after scheduled_to is too far away
# from the schedule and is replaced by the estimate.
push_too_late = (arriving_push - df["scheduled_to"]) > pd.Timedelta(days=0.8)
arriving_push = np.where(push_too_late, push_estimate, arriving_push)

df["arriving_push"] = pd.to_datetime(arriving_push)

In [21]:
# Attribute: 'earliest_pickup_expectation'
# The parsed original values are not used further: the column is recomputed
# from the rules below for every row.
earlierst_pickup_expectation = pd.to_datetime(df["earliest_pickup_expectation"])

# Immediate rides (scheduled_to == created_at) and rides booked less than
# 8 minutes ahead: dispatched_at + 3 minutes.
immediate_ride = df["scheduled_to"] == df["created_at"]
short_notice = (df["scheduled_to"] - pd.Timedelta(minutes=8)) < df["created_at"]
from_dispatch = df["dispatched_at"] + pd.Timedelta(minutes=3)
# All other (scheduled) rides: scheduled_to - 5 minutes.
from_schedule = df["scheduled_to"] - pd.Timedelta(minutes=5)
earlierst_pickup_expectation = np.where(
    immediate_ride | short_notice, from_dispatch, from_schedule
)

# Check ordering: values more than one day after scheduled_to are replaced by
# vehicle_arrived_at - 3 minutes.
expectation_too_late = (
    earlierst_pickup_expectation - df["scheduled_to"]
) > pd.Timedelta(days=1)
earlierst_pickup_expectation = np.where(
    expectation_too_late,
    df["vehicle_arrived_at"] - pd.Timedelta(minutes=3),
    earlierst_pickup_expectation,
)
df["earliest_pickup_expectation"] = pd.to_datetime(earlierst_pickup_expectation)

In [22]:
# Sanity check: completed rides whose earliest pickup expectation is more than
# 3 minutes after the dispatch.
df[
    (df["earliest_pickup_expectation"] - pd.Timedelta(minutes=3) > df["dispatched_at"])
    & (df["state"] == "completed")
]

Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,pickup_address,dropoff_address,state,created_from_offer,created_at,scheduled_to,dispatched_at,pickup_arrival_time,arriving_push,vehicle_arrived_at,earliest_pickup_expectation,pickup_first_eta,pickup_eta,pickup_at,dropoff_first_eta,dropoff_eta,dropoff_at,updated_at,arrival_deviation,waiting_time,boarding_time,ride_time,trip_time,shortest_ridetime,delay,longer_route_factor,arrival_indicator,rating,rating_puenktlichkeit,rating_sauberkeit,rating_fahrer,rating_find_modstop,rating_other_comments,cancellation_reason,cancellation_comment,bahn_card_number,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,index


In [23]:
# clean pickup_at
# Overview of this cell:
#   1. derive the average boarding time from rides with a usable pickup_at,
#   2. estimate missing pickup_at values of completed rides,
#   3. replace pickup_at values that violate the expected ordering,
#   4. store the result in df['pickup_at'].
pickup_at = pd.to_datetime(df["pickup_at"])
pickup_eta = pd.to_datetime(df["pickup_eta"])
# calculate the average boarding time because boarding_time = pickup_at - vehicle_arrived_at
# Rows where the vehicle did not arrive before pickup_at get the sentinel -9.
# NOTE(review): `.dt.seconds` is the seconds component of the timedelta, not the total
# number of seconds, so differences of a day or more would be truncated - confirm that
# such rows cannot occur here.
boarding_time = pd.Series(
    np.where(
        df["vehicle_arrived_at"] < pickup_at,
        (pickup_at - df["vehicle_arrived_at"]).dt.seconds,
        -9,
    )
)
boarding_time = boarding_time.fillna(-9)

# mean over all rows that do not carry the -9 sentinel, rounded to whole seconds
avg_boarding_time = sum(x for x in boarding_time if x != -9) / len(
    list(x for x in boarding_time if x != -9)
)
avg_boarding_time = round(avg_boarding_time)

# fill NaN values (completed rides only; all other rows keep their value)
pickup_at = np.where(
    (pickup_at.isna()) & (df["state"] == "completed"),
    # if pickup_eta is NaN or a day or more after scheduled_to, use
    # vehicle_arrived_at + average boarding time; otherwise use pickup_eta
    np.where(
        (df["pickup_eta"].isna()) | (pickup_eta - df['scheduled_to'] >= pd.Timedelta(days=1)),
        df["vehicle_arrived_at"] + pd.Timedelta(seconds=avg_boarding_time),
        df["pickup_eta"],
        ),
    pickup_at,
)
# np.where returns a plain NumPy array; convert it back to pandas datetimes
pickup_at = pd.to_datetime(pickup_at)

# Check ordering
pickup_at = np.where(
    # pickup_at must be after or at the same time as vehicle_arrived_at
    # pickup_at must not be more than one day after scheduled_to
    (pickup_at < df["vehicle_arrived_at"]) | (pickup_at - df['scheduled_to'] > pd.Timedelta(days=1)),
    # replacement: pickup_eta if it is usable, otherwise an estimate based on the arrival
    np.where(
        (df["pickup_eta"].isna()) | (df["pickup_eta"] < df["vehicle_arrived_at"]) | (pickup_at - df['scheduled_to'] >= pd.Timedelta(days=1)),
        # vehicle_arrived_at + average boarding time, as long as that stays before
        # dropoff_at (or dropoff_at is missing); otherwise vehicle_arrived_at itself
        np.where(
            (
                df["vehicle_arrived_at"] + pd.Timedelta(seconds=avg_boarding_time)
                < df["dropoff_at"]
            )
            | (df["dropoff_at"].isna()),
            df["vehicle_arrived_at"] + pd.Timedelta(seconds=avg_boarding_time),
            df["vehicle_arrived_at"],
        ),
        df["pickup_eta"],
    ),
    # values that respect the ordering are kept
    pickup_at,
)

df['pickup_at'] = pd.to_datetime(pickup_at)

In [24]:
# Expected to be empty: completed rides picked up before the vehicle arrived.
df[(df["pickup_at"] < df["vehicle_arrived_at"]) & (df["state"] == "completed")]
# Completed rides where arrival and pickup fall on different days of the month;
# the listed rides crossed midnight in between, so they are fine.
df[
    (df["vehicle_arrived_at"].dt.day != df["pickup_at"].dt.day)
    & (df["state"] == "completed")
]

Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,pickup_address,dropoff_address,state,created_from_offer,created_at,scheduled_to,dispatched_at,pickup_arrival_time,arriving_push,vehicle_arrived_at,earliest_pickup_expectation,pickup_first_eta,pickup_eta,pickup_at,dropoff_first_eta,dropoff_eta,dropoff_at,updated_at,arrival_deviation,waiting_time,boarding_time,ride_time,trip_time,shortest_ridetime,delay,longer_route_factor,arrival_indicator,rating,rating_puenktlichkeit,rating_sauberkeit,rating_fahrer,rating_find_modstop,rating_other_comments,cancellation_reason,cancellation_comment,bahn_card_number,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,index
2871,87131775-38f2-4eb1-8986-3c2e4296f61c,b204438a-ca02-4a7b-8251-c4ad74f917d4,6150.0,2.0,8.1,8.1,8.1,0.0,STANDARD,1005,5006,completed,,2021-11-12 23:11:07,2021-11-12 23:50:00,2021-11-12 23:42:00,1066,2021-11-12 23:56:45,2021-11-12 23:59:46,2021-11-12 23:45:00,2021-11-12 23:57:18,2021-11-13 00:00:49,2021-11-13 00:00:02,2021-11-13 00:06:53,2021-11-13 00:10:28,2021-11-13 00:28:40,2021-11-15 11:24:38,1.0,00:14:46,00:00:16,00:28:38,00:43:24,00:12:18,00:31:06,2.33,,,,,,,,,,,,,,,,
6279,ab32bf94-da46-483d-b183-cd282bee8792,70c71e66-7210-48e5-9aa1-9098065659ea,2635.0,3.0,8.31,8.31,8.31,0.0,STANDARD,9009,3010,completed,,2022-03-05 22:53:58,2022-03-05 23:30:00,2022-03-05 23:22:00,2238,2022-03-05 23:52:20,2022-03-05 23:59:18,2022-03-05 23:25:00,2022-03-05 23:58:31,2022-03-06 00:04:28,2022-03-06 00:00:44,2022-03-06 00:04:18,2022-03-06 00:09:59,2022-03-06 00:08:18,2022-03-06 00:08:20,238.0,00:34:18,00:01:26,00:07:34,00:41:52,00:05:16.200000,00:36:36,1.44,,,,,,,,,,,,,,,,


In [25]:
# Attribute: 'pickup_eta'
# Missing ETAs take the actual pickup time.
pickup_eta = pd.to_datetime(df["pickup_eta"]).fillna(df["pickup_at"])

# Check ordering: an ETA before the dispatch or more than one day after
# scheduled_to is replaced by pickup_at.
eta_before_dispatch = pickup_eta < df["dispatched_at"]
eta_too_late = (pickup_eta - df["scheduled_to"]) > pd.Timedelta(days=1)
pickup_eta = np.where(eta_before_dispatch | eta_too_late, df["pickup_at"], pickup_eta)

df["pickup_eta"] = pd.to_datetime(pickup_eta)

In [26]:
# Completed rides whose pickup_eta falls on a later calendar date than pickup_at.
# Whole dates are compared (dt.normalize) instead of day-of-month numbers, so a
# change across a month or year boundary (e.g. 31st -> 1st) is detected as well.
df.loc[
    (df.pickup_eta.dt.normalize() > df.pickup_at.dt.normalize())
    & (df.state == 'completed')
]

Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,pickup_address,dropoff_address,state,created_from_offer,created_at,scheduled_to,dispatched_at,pickup_arrival_time,arriving_push,vehicle_arrived_at,earliest_pickup_expectation,pickup_first_eta,pickup_eta,pickup_at,dropoff_first_eta,dropoff_eta,dropoff_at,updated_at,arrival_deviation,waiting_time,boarding_time,ride_time,trip_time,shortest_ridetime,delay,longer_route_factor,arrival_indicator,rating,rating_puenktlichkeit,rating_sauberkeit,rating_fahrer,rating_find_modstop,rating_other_comments,cancellation_reason,cancellation_comment,bahn_card_number,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,index
7222,de002fb9-4c44-46ef-8c1b-376ca3e372bd,7921da2a-0e07-466b-8a3e-2833d147c8ce,8736.0,2.0,12.23,12.23,12.23,0.0,STANDARD,2002,14003,completed,,2022-03-18 22:51:05,2022-03-18 23:40:00,2022-03-18 23:32:00,1466,2022-03-18 23:52:33,2022-03-18 23:56:26,2022-03-18 23:35:00,2022-03-18 23:49:06,2022-03-19 00:00:20,2022-03-18 23:56:27,2022-03-19 00:07:28,2022-03-19 00:15:22,2022-03-19 00:11:43,2022-03-19 00:11:45,53.0,00:21:26,00:00:01,00:15:16,00:36:42,00:17:28.320000,00:19:14,0.87,,,,,,,,,,,,,,,,
7853,066dec2d-33fc-4960-95cd-f750fa6f29bb,46559279-5d2f-4862-9611-937b1ebcd7c8,3145.0,4.0,12.19,9.59,9.59,0.0,VRN,15013,8001,completed,,2022-03-26 23:20:04,2022-03-26 23:50:00,2022-03-26 23:42:00,1000,2022-03-26 23:57:13,2022-03-26 23:58:40,2022-03-26 23:45:00,2022-03-26 23:57:55,2022-03-27 00:00:27,2022-03-26 23:59:58,2022-03-27 00:04:12,2022-03-27 00:06:50,2022-03-27 00:07:45,2022-03-27 06:19:32,-93.0,00:13:40,00:01:18,00:07:47,00:21:27,00:06:17.400000,00:15:10,1.24,,5.0,,,,,,,,7081411188154906.0,Maax Ticket,536582001.0,,,Mein Auto ist nicht verfügbar,
8997,ab3fd7d4-ba70-45a3-8aee-a9671094150e,33821735-a982-4e60-9a16-1ecfaaf04f7e,6563.0,2.0,9.94,9.94,9.94,0.0,STANDARD,4041,15010,completed,,2022-02-11 23:16:37,2022-02-11 23:50:00,2022-02-11 23:42:00,1055,2022-02-11 23:56:08,2022-02-11 23:59:35,2022-02-11 23:45:00,2022-02-11 23:49:14,2022-02-12 00:00:34,2022-02-11 23:59:38,2022-02-12 00:00:39,2022-02-12 00:11:51,2022-02-12 00:08:51,2022-02-12 00:10:14,27.0,00:14:35,00:00:03,00:09:13,00:23:48,00:13:07.560000,00:10:40,0.7,,,,,,,,,,,,,,,,
9987,2c7d129b-aa03-49b1-b201-4bc8dc4c7e79,7ce10780-2608-4af5-a850-9618650acfde,1969.0,1.0,3.1,3.1,3.1,0.0,STANDARD,15008,15014,completed,,2022-02-25 21:39:35,2022-02-25 23:50:00,2022-02-25 23:42:00,834,2022-02-25 23:43:49,2022-02-25 23:55:54,2022-02-25 23:45:00,2022-02-25 23:47:43,2022-02-26 00:03:11,2022-02-25 23:55:56,2022-02-25 23:53:22,2022-02-26 00:08:56,2022-02-25 23:56:04,2022-02-25 23:56:05,545.0,00:10:54,00:00:02,00:00:08,00:11:02,00:03:56.280000,00:07:06,0.03,,,,,,,,,,,,,,,,
14833,bd046765-0d9d-433f-af58-b2fde3488a6f,1a6b4e7f-6766-4d39-beef-9df24e641128,4296.0,2.0,7.53,5.13,5.13,0.0,VRN,15002,1009,completed,16522.0,2022-05-21 23:42:47,2022-05-21 23:42:47,2022-05-21 23:42:47,979,2022-05-21 23:54:59,2022-05-21 23:59:06,2022-05-21 23:45:47,2022-05-21 23:58:49,2022-05-22 00:00:06,2022-05-21 23:59:08,2022-05-22 00:17:21,2022-05-22 00:18:45,2022-05-22 00:20:58,2022-05-22 00:20:59,67.0,00:13:19,00:00:02,00:21:50,00:35:09,00:08:35.520000,00:26:33,2.54,Kein Signalschild,,,,,,,,,,,,,,,
18706,f5df0493-0110-4dde-8a73-ac7f87b519f7,41c95331-14bc-4433-a0ce-46017b56aec2,5947.0,4.0,17.23,17.23,17.23,0.0,STANDARD,4038,16027,completed,,2022-04-29 18:42:33,2022-04-29 23:50:00,2022-04-29 23:42:00,1017,2022-04-29 23:54:40,2022-04-29 23:58:57,2022-04-29 23:45:00,2022-04-29 23:56:12,2022-04-30 00:00:05,2022-04-29 23:59:41,2022-04-30 00:04:51,2022-04-30 00:08:26,2022-04-30 00:11:09,2022-04-30 00:12:34,77.0,00:13:57,00:00:44,00:11:28,00:25:25,00:11:53.640000,00:13:31,0.96,,,,,,,,,,,,,,,,


In [27]:
# Attribute: 'pickup_first_eta'
# Missing first ETAs take the (already cleaned) pickup_eta.
pickup_first_eta = pd.to_datetime(df["pickup_first_eta"]).fillna(df["pickup_eta"])

# Check ordering: a first ETA before the dispatch or more than one day after
# scheduled_to is replaced by pickup_eta.
first_eta_before_dispatch = pickup_first_eta < df["dispatched_at"]
first_eta_too_late = (pickup_first_eta - df["scheduled_to"]) > pd.Timedelta(days=1)
pickup_first_eta = np.where(
    first_eta_before_dispatch | first_eta_too_late,
    df["pickup_eta"],
    pickup_first_eta,
)

df["pickup_first_eta"] = pd.to_datetime(pickup_first_eta)

In [28]:
# Attribute: 'dropoff_at'
def _hms_to_seconds(value, factors=(3600, 60, 1)):
    """Convert the leading "HH:MM:SS" of ``value`` into seconds (NaN for non-strings)."""
    if not isinstance(value, str):
        # Missing ride times must not crash the conversion.
        return np.nan
    return sum(f * int(part) for f, part in zip(factors, value[0:8].split(":")))


df['dropoff_at'] = pd.to_datetime(df["dropoff_at"])
dropoff_eta = pd.to_datetime(df["dropoff_eta"])
ftr = [3600, 60, 1]
# Shortest possible ride time in seconds (fractions of a second are cut off).
shortest_ridetime = df["shortest_ridetime"].apply(_hms_to_seconds)

# Estimated dropoff: pickup time plus the shortest possible ride time.
estimated_dropoff = df["pickup_at"] + pd.to_timedelta(shortest_ridetime, unit="s")

# fill NaN values of completed rides
# Fix: the fallback used to be dropoff_at + shortest ride time, but dropoff_at is
# missing in exactly these rows, so the result stayed missing. The estimate now
# starts from pickup_at, consistent with the ordering check below. The parsed
# dropoff_eta is used as value so that only datetimes are mixed in np.where.
missing_completed = df["dropoff_at"].isna() & (df["state"] == "completed")
eta_unusable = dropoff_eta.isna() | (
    dropoff_eta - df["scheduled_to"] >= pd.Timedelta(days=1)
)
df["dropoff_at"] = pd.to_datetime(
    np.where(
        missing_completed,
        np.where(eta_unusable, estimated_dropoff, dropoff_eta),
        df["dropoff_at"],
    )
)

# Check ordering: dropoff_at must be after pickup_at and not more than one day
# after scheduled_to; otherwise it is replaced by the estimate.
out_of_order = (df["dropoff_at"] <= df["pickup_at"]) | (
    df["dropoff_at"] - df["scheduled_to"] > pd.Timedelta(days=1)
)
df["dropoff_at"] = pd.to_datetime(
    np.where(out_of_order, estimated_dropoff, df["dropoff_at"])
)

In [29]:
# Sanity check: completed rides dropped off more than one day after scheduled_to.
df[
    (df["dropoff_at"] - df["scheduled_to"] > pd.Timedelta(days=1))
    & (df["state"] == "completed")
]

Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,pickup_address,dropoff_address,state,created_from_offer,created_at,scheduled_to,dispatched_at,pickup_arrival_time,arriving_push,vehicle_arrived_at,earliest_pickup_expectation,pickup_first_eta,pickup_eta,pickup_at,dropoff_first_eta,dropoff_eta,dropoff_at,updated_at,arrival_deviation,waiting_time,boarding_time,ride_time,trip_time,shortest_ridetime,delay,longer_route_factor,arrival_indicator,rating,rating_puenktlichkeit,rating_sauberkeit,rating_fahrer,rating_find_modstop,rating_other_comments,cancellation_reason,cancellation_comment,bahn_card_number,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,index


In [30]:
# Sanity check: completed rides whose pickup lies after the dropoff.
df[(df["pickup_at"] > df["dropoff_at"]) & (df["state"] == "completed")]

Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,pickup_address,dropoff_address,state,created_from_offer,created_at,scheduled_to,dispatched_at,pickup_arrival_time,arriving_push,vehicle_arrived_at,earliest_pickup_expectation,pickup_first_eta,pickup_eta,pickup_at,dropoff_first_eta,dropoff_eta,dropoff_at,updated_at,arrival_deviation,waiting_time,boarding_time,ride_time,trip_time,shortest_ridetime,delay,longer_route_factor,arrival_indicator,rating,rating_puenktlichkeit,rating_sauberkeit,rating_fahrer,rating_find_modstop,rating_other_comments,cancellation_reason,cancellation_comment,bahn_card_number,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,index


In [31]:
# Day change: completed rides whose dropoff falls on a later calendar date than
# the pickup. Whole dates are compared (dt.normalize) instead of day-of-month
# numbers, so a change across a month or year boundary is detected as well.
df.loc[
    (df.dropoff_at.dt.normalize() > df.pickup_at.dt.normalize())
    & (df.state == 'completed')
]

Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,pickup_address,dropoff_address,state,created_from_offer,created_at,scheduled_to,dispatched_at,pickup_arrival_time,arriving_push,vehicle_arrived_at,earliest_pickup_expectation,pickup_first_eta,pickup_eta,pickup_at,dropoff_first_eta,dropoff_eta,dropoff_at,updated_at,arrival_deviation,waiting_time,boarding_time,ride_time,trip_time,shortest_ridetime,delay,longer_route_factor,arrival_indicator,rating,rating_puenktlichkeit,rating_sauberkeit,rating_fahrer,rating_find_modstop,rating_other_comments,cancellation_reason,cancellation_comment,bahn_card_number,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,index
2118,53a70562-72f3-4382-bcb4-a704f1af5b92,0e82a6ae-7d07-45b2-9c0c-10a03dfa37ff,5101.0,1.0,4.86,4.33,4.33,0.0,BAHN_CARD,1005,4013,completed,,2022-01-28 22:50:23,2022-01-28 23:50:00,2022-01-28 23:42:00,643,2022-01-28 23:49:35,2022-01-28 23:52:43,2022-01-28 23:45:00,2022-01-28 23:55:34,2022-01-28 23:54:04,2022-01-28 23:52:45,2022-01-29 00:04:58,2022-01-29 00:13:46,2022-01-29 00:07:47,2022-01-29 00:07:49,8.0,00:07:43,00:00:02,00:15:02,00:22:45,00:10:12.120000,00:12:33,1.47,,,,,,,,,,,,,,,,
2119,4063b52e-0697-47ae-9539-801f78ff308d,b204438a-ca02-4a7b-8251-c4ad74f917d4,6152.0,1.0,5.41,5.41,5.41,0.0,STANDARD,1005,5006,completed,,2022-01-28 22:48:41,2022-01-28 23:50:00,2022-01-28 23:42:00,655,2022-01-28 23:50:41,2022-01-28 23:52:55,2022-01-28 23:45:00,2022-01-28 23:56:31,2022-01-28 23:54:16,2022-01-28 23:52:59,2022-01-29 00:07:51,2022-01-29 00:05:34,2022-01-29 00:01:26,2022-01-29 00:01:27,-46.0,00:07:55,00:00:04,00:08:27,00:16:22,00:12:18.240000,00:04:04,0.69,,,,,,,,,,,,,,,,
3360,ddc83291-a782-41c3-b655-bf2254354119,7921da2a-0e07-466b-8a3e-2833d147c8ce,3913.0,3.0,7.9,7.9,7.9,0.0,STANDARD,15013,14003,completed,,2021-11-20 23:16:25,2021-11-20 23:50:00,2021-11-20 23:42:00,773,2021-11-20 23:53:53,2021-11-20 23:54:53,2021-11-20 23:45:00,2021-11-20 23:54:51,2021-11-20 23:55:54,2021-11-20 23:55:51,2021-11-21 00:01:16,2021-11-21 00:02:46,2021-11-21 00:00:58,2021-11-21 00:26:15,-120.0,00:09:53,00:00:58,00:05:07,00:15:00,00:07:49.560000,00:07:10,0.65,,5.0,,,,,,,,,,,,,,
3688,87d1e70e-379a-4d6e-be73-de99294c95f7,24a96def-50d6-4267-8377-da7540a59627,5414.0,1.0,4.65,4.65,4.65,0.0,STANDARD,1007,5002,completed,,2021-11-27 21:42:03,2021-11-27 23:40:00,2021-11-27 23:32:00,1061,2021-11-27 23:47:04,2021-11-27 23:49:41,2021-11-27 23:35:00,2021-11-27 23:51:20,2021-11-27 23:50:42,2021-11-27 23:49:45,2021-11-27 23:59:54,2021-11-27 23:59:56,2021-11-28 00:02:26,2021-11-28 00:02:29,-23.0,00:14:41,00:00:04,00:12:41,00:27:22,00:10:49.680000,00:16:32,1.17,,,,,,,,,,,,,,,,
6759,dd332976-deaa-4235-a327-27413b87ed63,6a4736a2-09af-438b-b25d-62ac941dc27d,4681.0,1.0,4.59,2.19,2.19,0.0,VRN,3020,15014,completed,,2022-03-12 18:12:46,2022-03-12 23:50:00,2022-03-12 23:42:00,547,2022-03-12 23:45:32,2022-03-12 23:51:07,2022-03-12 23:45:00,2022-03-12 23:48:02,2022-03-12 23:52:23,2022-03-12 23:51:09,2022-03-12 23:57:19,2022-03-13 00:01:36,2022-03-13 00:01:16,2022-03-13 00:01:18,155.0,00:06:07,00:00:02,00:10:07,00:16:14,00:09:21.720000,00:06:52,1.08,,,,,,,,,,,Job-Ticket,400289417.0,,,,
6760,8a5c8afb-57b2-40ab-a105-868e2368a7e4,7ce10780-2608-4af5-a850-9618650acfde,4681.0,1.0,4.59,4.59,4.59,0.0,STANDARD,3020,15014,completed,,2022-03-12 18:50:11,2022-03-12 23:50:00,2022-03-12 23:42:00,543,2022-03-12 23:42:55,2022-03-12 23:51:03,2022-03-12 23:45:00,2022-03-12 23:46:37,2022-03-12 23:52:19,2022-03-12 23:51:05,2022-03-12 23:55:54,2022-03-13 00:02:47,2022-03-13 00:01:09,2022-03-13 00:01:10,308.0,00:06:03,00:00:02,00:10:04,00:16:07,00:09:21.720000,00:06:45,1.08,,,,,,,,,,,,,,,,
7222,de002fb9-4c44-46ef-8c1b-376ca3e372bd,7921da2a-0e07-466b-8a3e-2833d147c8ce,8736.0,2.0,12.23,12.23,12.23,0.0,STANDARD,2002,14003,completed,,2022-03-18 22:51:05,2022-03-18 23:40:00,2022-03-18 23:32:00,1466,2022-03-18 23:52:33,2022-03-18 23:56:26,2022-03-18 23:35:00,2022-03-18 23:49:06,2022-03-19 00:00:20,2022-03-18 23:56:27,2022-03-19 00:07:28,2022-03-19 00:15:22,2022-03-19 00:11:43,2022-03-19 00:11:45,53.0,00:21:26,00:00:01,00:15:16,00:36:42,00:17:28.320000,00:19:14,0.87,,,,,,,,,,,,,,,,
7224,e8520b00-0527-4054-9925-22a772ba4f9a,d89f854a-be6b-4c5c-8d0f-953122ad23e4,7052.0,2.0,10.36,10.36,10.36,0.0,STANDARD,4013,13002,completed,,2022-03-18 20:23:26,2022-03-18 23:50:00,2022-03-18 23:42:00,880,2022-03-18 23:53:24,2022-03-18 23:56:40,2022-03-18 23:45:00,2022-03-18 23:56:01,2022-03-18 23:57:40,2022-03-18 23:56:44,2022-03-19 00:11:54,2022-03-19 00:21:32,2022-03-19 00:17:33,2022-03-19 00:17:35,16.0,00:11:40,00:00:04,00:20:49,00:32:29,00:14:06.240000,00:18:23,1.48,,,,,,,,,,,,,,,,
7720,962e6a61-c755-4fbb-a1a3-2d81925f219e,34909629-44b7-4fbd-a6eb-1f7158aabb5d,2304.0,1.0,3.32,3.32,3.32,0.0,STANDARD,9001,1004,completed,12452.0,2022-03-25 23:49:45,2022-03-25 23:49:45,2022-03-25 23:49:45,465,2022-03-25 23:54:14,2022-03-25 23:57:30,2022-03-25 23:52:45,2022-03-25 23:57:31,2022-03-25 23:58:39,2022-03-25 23:57:57,2022-03-26 00:03:02,2022-03-26 00:04:44,2022-03-26 00:02:45,2022-03-26 00:03:32,16.0,00:04:45,00:00:27,00:04:48,00:09:33,00:04:36.480000,00:04:57,1.04,Kein Signalschild,,,,,,,,,,,,,,,
7850,83cc534a-124c-49ec-8050-830c63825221,0a370a67-8b77-46da-b4a4-b07b78f4ef3c,3145.0,4.0,12.19,11.02,11.02,0.0,BAHN_CARD,15013,8001,completed,12548.0,2022-03-26 23:24:56,2022-03-26 23:24:56,2022-03-26 23:24:56,1319,2022-03-26 23:44:11,2022-03-26 23:46:55,2022-03-26 23:27:56,2022-03-26 23:38:18,2022-03-26 23:55:09,2022-03-26 23:54:56,2022-03-26 23:44:34,2022-03-27 00:01:32,2022-03-27 00:00:36,2022-03-27 00:01:20,-16.0,00:18:59,00:08:01,00:05:40,00:24:39,00:06:17.400000,00:18:22,0.9,Kein Signalschild,,,,,,,,,7081411107919504.0,,,,,,


In [32]:
# Attribute: 'dropoff_eta'
# Parse the ETA and fall back to the actual drop-off time where no ETA exists.
dropoff_eta = pd.to_datetime(df["dropoff_eta"]).fillna(df["dropoff_at"])

# Check ordering: an ETA that lies before dispatch, or more than one day after
# the scheduled time, is replaced by the actual drop-off time.
eta_before_dispatch = dropoff_eta < df["dispatched_at"]
eta_over_a_day_late = (dropoff_eta - df["scheduled_to"]) > pd.Timedelta(days=1)
dropoff_eta = np.where(
    eta_before_dispatch | eta_over_a_day_late,
    df["dropoff_at"],
    dropoff_eta,
)
df["dropoff_eta"] = pd.to_datetime(dropoff_eta)

In [33]:
# Sanity check: completed rides whose drop-off ETA still lies more than one day
# after the scheduled time.
df[
    (df["state"] == "completed")
    & ((df["dropoff_eta"] - df["scheduled_to"]) > pd.Timedelta(days=1))
]

Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,pickup_address,dropoff_address,state,created_from_offer,created_at,scheduled_to,dispatched_at,pickup_arrival_time,arriving_push,vehicle_arrived_at,earliest_pickup_expectation,pickup_first_eta,pickup_eta,pickup_at,dropoff_first_eta,dropoff_eta,dropoff_at,updated_at,arrival_deviation,waiting_time,boarding_time,ride_time,trip_time,shortest_ridetime,delay,longer_route_factor,arrival_indicator,rating,rating_puenktlichkeit,rating_sauberkeit,rating_fahrer,rating_find_modstop,rating_other_comments,cancellation_reason,cancellation_comment,bahn_card_number,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,index


In [34]:
# Attribute: 'dropoff_first_eta'
dropoff_first_eta = pd.to_datetime(df["dropoff_first_eta"])

# Shortest ride time in seconds, parsed from the leading "HH:MM:SS" part of the
# text column. pd.to_timedelta turns a missing value into NaT (NaN seconds),
# whereas splitting the text by hand raises AttributeError on a float NaN.
shortest_ridetime = pd.to_timedelta(
    df["shortest_ridetime"].str[0:8]
).dt.total_seconds()

# Estimated first drop-off ETA: first pickup ETA plus the shortest ride time.
# Computed once because it serves both as fill value and as replacement below.
estimated_dropoff = df["pickup_first_eta"] + pd.to_timedelta(
    shortest_ridetime, unit="s"
)
dropoff_first_eta = dropoff_first_eta.fillna(estimated_dropoff)

# Check ordering: a value that lies before dispatch, or more than one day after
# the scheduled time, is replaced by the estimate.
# NOTE(review): the estimate is built from 'pickup_first_eta', so a row whose
# 'pickup_first_eta' is itself more than a day off is not repaired here; the
# "> 1 day" check on 'dropoff_first_eta' still reports such a row. Confirm
# whether 'pickup_first_eta' should be corrected before this step.
dropoff_first_eta = np.where(
    (dropoff_first_eta < df["dispatched_at"])
    | (dropoff_first_eta - df["scheduled_to"] > pd.Timedelta(days=1)),
    estimated_dropoff,
    dropoff_first_eta,
)
df["dropoff_first_eta"] = pd.to_datetime(dropoff_first_eta)

In [35]:
# Sanity check: completed rides whose first drop-off ETA lies before dispatch.
df[
    (df["state"] == "completed")
    & (df["dropoff_first_eta"] < df["dispatched_at"])
]

Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,pickup_address,dropoff_address,state,created_from_offer,created_at,scheduled_to,dispatched_at,pickup_arrival_time,arriving_push,vehicle_arrived_at,earliest_pickup_expectation,pickup_first_eta,pickup_eta,pickup_at,dropoff_first_eta,dropoff_eta,dropoff_at,updated_at,arrival_deviation,waiting_time,boarding_time,ride_time,trip_time,shortest_ridetime,delay,longer_route_factor,arrival_indicator,rating,rating_puenktlichkeit,rating_sauberkeit,rating_fahrer,rating_find_modstop,rating_other_comments,cancellation_reason,cancellation_comment,bahn_card_number,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,index


In [36]:
# Sanity check: completed rides whose first drop-off ETA lies more than one day
# after the scheduled time.
df[
    (df["state"] == "completed")
    & ((df["dropoff_first_eta"] - df["scheduled_to"]) > pd.Timedelta(days=1))
]

Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,pickup_address,dropoff_address,state,created_from_offer,created_at,scheduled_to,dispatched_at,pickup_arrival_time,arriving_push,vehicle_arrived_at,earliest_pickup_expectation,pickup_first_eta,pickup_eta,pickup_at,dropoff_first_eta,dropoff_eta,dropoff_at,updated_at,arrival_deviation,waiting_time,boarding_time,ride_time,trip_time,shortest_ridetime,delay,longer_route_factor,arrival_indicator,rating,rating_puenktlichkeit,rating_sauberkeit,rating_fahrer,rating_find_modstop,rating_other_comments,cancellation_reason,cancellation_comment,bahn_card_number,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,index
15121,7d24a9b7-1294-44e9-8eb5-f82cd5604181,5da3eee2-350a-4588-8687-524182c16c0a,1145.0,1.0,2.66,2.66,0.0,1.0,STANDARD,12001,1007,completed,,2022-05-20 09:20:18,2022-05-23 08:20:00,2022-05-23 08:12:00,818,2022-05-23 08:12:54,2022-05-23 08:15:54,2022-05-23 08:15:00,2022-05-24 08:19:49,2022-05-23 08:17:05,2022-05-23 08:17:05,2022-05-24 08:22:06,2022-05-23 08:19:22,2022-05-23 08:19:22,2022-05-24 08:31:40,166.0,1900-01-01 00:10:38,00:00:01,00:06:00,1900-01-01 00:16:38,00:02:17.400000,1900-01-01 00:14:21,2.62,,,,,,,,,,,,,,,,


In [37]:
# Attributes: ['pickup_arrival_time', 'arrival_deviation', 'waiting_time', 'boarding_time', 'ride_time', 'trip_time', 'shortest_ridetime', 'delay', 'longer_route_factor']
def clean_time_periods(df):
    """Recompute the duration columns of ``df`` from its timestamp columns.

    Every duration is expressed in seconds (float). A missing timestamp yields
    NaN in each duration that depends on it.

    Columns read: 'dispatched_at', 'arriving_push', 'vehicle_arrived_at',
    'earliest_pickup_expectation', 'pickup_at', 'dropoff_at' and
    'shortest_ridetime' (duration text such as "00:09:21.720000").

    Columns written: 'pickup_arrival_time', 'arrival_deviation',
    'waiting_time', 'boarding_time', 'ride_time', 'trip_time',
    'shortest_ridetime', 'delay' and 'longer_route_factor'. After the call
    'shortest_ridetime' holds float seconds instead of the duration text.

    Parameters
    ----------
    df : pandas.DataFrame
        Rides table containing the columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.
    """

    def interval(end_col, start_col):
        # Element-wise ``end - start``. NaT wherever either timestamp is
        # missing, which later becomes NaN seconds.
        return pd.to_datetime(df[end_col]) - pd.to_datetime(df[start_col])

    def rounded_seconds(delta):
        # Interval rounded to the nearest whole second, as float seconds.
        return delta.dt.round("s").dt.total_seconds()

    # Attribute: 'pickup_arrival_time'
    # total_seconds() keeps the day component, so an arrival before dispatch is
    # negative and an interval beyond 24 h is not wrapped (``.dt.seconds`` only
    # reports the 0..86399 component). Flooring keeps whole seconds.
    df["pickup_arrival_time"] = (
        interval("vehicle_arrived_at", "dispatched_at")
        .dt.floor("s")
        .dt.total_seconds()
    )

    # Attribute: 'arrival_deviation'
    # 180 s are subtracted from the push-to-arrival interval
    # (presumably the push announces an arrival three minutes ahead; confirm).
    df["arrival_deviation"] = (
        rounded_seconds(interval("vehicle_arrived_at", "arriving_push")) - 180
    )

    # Attribute: 'waiting_time'
    df["waiting_time"] = rounded_seconds(
        interval("vehicle_arrived_at", "earliest_pickup_expectation")
    )

    # Attribute: 'boarding_time'
    df["boarding_time"] = rounded_seconds(interval("pickup_at", "vehicle_arrived_at"))

    # Attribute: 'ride_time'
    df["ride_time"] = rounded_seconds(interval("dropoff_at", "pickup_at"))

    # Attribute: 'trip_time'
    df["trip_time"] = df["ride_time"] + df["waiting_time"]

    # Attribute: 'shortest_ridetime'
    # Missing text becomes NaT and therefore NaN seconds.
    df["shortest_ridetime"] = rounded_seconds(pd.to_timedelta(df["shortest_ridetime"]))

    # Attribute: 'delay'
    df["delay"] = df["trip_time"] - df["shortest_ridetime"]

    # Attribute: 'longer_route_factor'
    # A zero shortest ride time is masked to NaN so the ratio is NaN, not inf.
    nonzero_shortest = df["shortest_ridetime"].where(df["shortest_ridetime"] != 0)
    df["longer_route_factor"] = (df["ride_time"] / nonzero_shortest).round(2)

    return df

In [38]:
# Attribute: 'rating'
def clean_rating(df):
    """Recover numeric ratings that are stored as timestamp-like text.

    A 'rating' value that matches the date-time pattern below is replaced by
    the character at index 9 converted to float. For text laid out as
    "YYYY-MM-DD hh:mm:ss" that character is the last digit of the day field.
    (Presumably the rating was turned into a date somewhere upstream; confirm.)
    Every other value is passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a text 'rating' column. It is not modified.

    Returns
    -------
    numpy.ndarray
        One entry per row of ``df``, in row order.

    Raises
    ------
    ValueError
        If a timestamp-like value has a non-numeric character at index 9.
    """
    rating = df["rating"]
    # na=False: missing or non-text entries count as "not timestamp-like".
    looks_like_timestamp = rating.str.match(
        r"[0-9]{1,4}.[0-9]{1,2}.[0-9]{1,2} [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}",
        na=False,
    )
    # Convert only the matched rows. np.where evaluates both branches for every
    # row, so an unmasked conversion would fail on any unrelated long string
    # whose tenth character is not a digit.
    recovered = rating.str[9].where(looks_like_timestamp).astype(float)
    return np.where(looks_like_timestamp, recovered, rating)

In [39]:
# Recompute the duration columns in place, then replace the rating column with
# its cleaned values. The two steps touch different columns.
df = clean_time_periods(df)
df["rating"] = clean_rating(df)

In [40]:
# Export the cleaned rides into the repository's data directory, using the
# `repo` root resolved in the data-loading cell, so the notebook does not
# depend on one user's home directory.
# To export completed rides only, filter first:
# df = df.loc[df.state == 'completed']
df.to_excel(f"{repo}/data/test_cleaned.xlsx")