# Walkthrough

In [2]:
from vehicle_stream_pipeline import utils, data_cleaning
import pandas as pd
import git
import os
import warnings
import time

# Suppress all Python warnings so they do not clutter the walkthrough output.
warnings.filterwarnings(action="ignore")

# Path of the git top-level directory, located by searching upwards from the
# current working directory. Every data path in this notebook is built from it.
repo = git.Repo(".", search_parent_directories=True).git.rev_parse("--show-toplevel")

## Combining the data
First, we combine the monthly Excel sheets into multiple big CSV files and store them. We create three separate files.
1. kpi_combined.csv: The monthly KPI statistics combined. We rarely use this data.

2. mtd_combined.csv: This should contain all rides for each day of the month, combined according to the Excel sheets.

3. rides_combined.csv: Here we iterated over each day (Excel sheet) and collected the data for each day ourselves. Surprisingly, this differs from mtd_combined.csv and appears to be more accurate, so we use this dataframe for further analysis.

With this function you can also build a new dataframe when MoD uploads data for upcoming months. Just store the new files in the data/normal_rides folder.

(Takes about 20 seconds)



In [4]:
# Combine the monthly files found in data/normal_rides. The result is indexed
# by the keys "df_kpi", "df_mtd" and "df_rides".
all_rides = utils.create_overall_dataframes(f"{repo}/data/normal_rides")

# Persist each combined dataframe as a CSV file in data/other.
for frame_key, csv_name in (
    ("df_kpi", "kpi_combined.csv"),
    ("df_mtd", "mtd_combined.csv"),
    ("df_rides", "rides_combined.csv"),
):
    all_rides[frame_key].to_csv(f"{repo}/data/other/{csv_name}")

## Cleaning the data

In [10]:
# Read in all files needed for cleaning the data.
# The combined rides are read from data/other, the location the "Combining the
# data" cell writes rides_combined.csv to, so a fresh run uses the fresh file.
df = pd.read_csv(f"{repo}/data/other/rides_combined.csv", index_col=0)
df_stops = pd.read_excel(
    f"{repo}/data/other/MoDstops+Preismodell.xlsx", sheet_name="MoDstops"
)
vehicle_usage_df = pd.read_excel(
    f"{repo}/data/vehicle_data/MoD_Vehicle Usage_2021+2022-05-15.xlsx"
)
external_df = pd.read_excel(
    f"{repo}/data/vehicle_data/Autofleet_Rides with External ID_2021+2022-05-15.xlsx"
)

# Eliminate duplicates
df = data_cleaning.clean_duplicates(df)

# Clean the data using our cleaning functions
df = data_cleaning.data_cleaning(df, df_stops)

# Add shared rides to our data pool
df = data_cleaning.add_shared_rides(df, vehicle_usage_df, external_df)

# Final check of the cleaned data. Rows reported as incorrect are stored in a
# timestamped Excel file so they can be analysed and the cleaning script adapted.
print("check cleaned data")
df, df_incorrect = data_cleaning.data_check(df)
if not df_incorrect.empty:
    df_incorrect.to_excel(f"{repo}/data/cleaning/incorrect{int(time.time())}.xlsx")

# Save the cleaned data. This file is the input for the later use cases.
df.to_csv(f"{repo}/data/cleaning/data_cleaned.csv", index=False)

clean id
clean distance
clean addresses
clean free_rides
clean created_at
clean scheduled_to
clean dispatched_at
clean vehicle_arrived_at
clean arriving_push
clean earliest_pickup_expectation
clean pickup_at
clean pickup_eta
clean pickup_first_eta
clean dropoff_at
clean dropoff_eta
clean dropoff_first_eta
clean time periods
clean rating
add shared rides
check cleaned data


## Ride Simulation

## Use case 1: Probabilistic graph model - shortest path, drones

In [24]:
# Load the cleaned rides and parse the scheduling timestamps.
cleaned_drives = pd.read_csv(f"{repo}/data/cleaning/data_cleaned.csv")
cleaned_drives["scheduled_to"] = pd.to_datetime(cleaned_drives["scheduled_to"])

# Use the full time span covered by the data as the aggregation window.
start_date, end_date = (
    cleaned_drives["scheduled_to"].min(),
    cleaned_drives["scheduled_to"].max(),
)

# Aggregate the drives per route so that every route gets an average time to
# destination, then preview the first rows.
aggregated_drives = utils.calculate_drives(cleaned_drives, start_date, end_date)
aggregated_drives.head()

Unnamed: 0,pickup_address,dropoff_address,number_of_drives,waiting_time,avg_ride_time,avg_time_to_destination
0,1001,1004,4,83.75,0.001704,83.751704
1,1001,1005,19,17.631579,0.001543,17.633122
2,1001,1007,2,167.5,0.002882,167.502882
3,1001,1008,3,111.666667,0.003538,111.670204
4,1001,1012,2,167.5,0.005694,167.505694


In [16]:
# Based on our aggregated drives we build a graph. The weight of an edge
# between two spots is defined as the avg_time_to_destination.
graph = utils.calculate_graph(aggregated_drives)

<networkx.classes.digraph.DiGraph at 0x11f692c10>

In [23]:
# Look up the shortest ride from stop 1001 to stop 1004 in the graph; the call
# returns the stops along the path and the time to destination.
# NOTE(review): the message reports the value in days — confirm that this is
# the unit of avg_time_to_destination.
path, time_to_destination = utils.get_shortest_ride(1001, 1004, graph)
print(f"The shortest path uses the following stops {path} and takes {int(time_to_destination)} days")

The shortest path uses the following stops [1001, 1005, 4025, 1008, 11017, 1004] and takes 62 days


In [25]:
# Load the "Liste 2022" sheet of the stops workbook; it is used as `edges` below.
edges = pd.read_excel(
    f"{repo}/data/other/MoDstops+Preismodell.xlsx",
    sheet_name="Liste 2022",
)

# Determine the hotspots of the graph (n=10) and report them.
hotspots = utils.get_hotspots(edges, aggregated_drives, n=10)
print(f"The following spots are the hotspots of our graph: {hotspots}")

In [27]:
# Stops at which drones are made available.
drone_spots = [
    1008,
    4025,
    6004,
    12007,
    11017,
    15013,
    3021,
    8001,
    5001,
    11003,
    4016,
]

# Add drone flights for the listed stops. The original call left the third
# argument empty (a syntax error) while `drone_spots` was defined but unused,
# so the list is passed here.
# NOTE(review): confirm the parameter order and the unit of `radius` against
# utils.add_drone_flights.
utils.add_drone_flights(edges, aggregated_drives, drone_spots, radius=500)

'The following spots are the hotspots of our graph: [1008, 4025, 1005, 1009, 1007, 12007, 7001, 6004, 1010, 11017]'