# Walkthrough

In [52]:
from vehicle_stream_pipeline import utils, data_cleaning
import pandas as pd
import git
import os
import warnings
import time
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Silence all warnings so the rendered cell outputs stay readable.
warnings.filterwarnings("ignore")

# Resolve the repository root once; every data path below is built from it.
repo_handle = git.Repo(".", search_parent_directories=True)
repo = repo_handle.git.rev_parse("--show-toplevel")

## Combining the normal rides from MoD

In [4]:
# Build the combined frames from everything under data/normal_rides.
all_rides = utils.create_overall_dataframes(f"{repo}/data/normal_rides")

# Persist each combined frame as its own CSV under data/other.
for frame_key, csv_name in (
    ("df_kpi", "kpi_combined.csv"),
    ("df_mtd", "mtd_combined.csv"),
    ("df_rides", "rides_combined.csv"),
):
    all_rides[frame_key].to_csv(f"{repo}/data/other/{csv_name}")

## Cleaning the data

In [10]:
# Read in all files needed for cleaning the data.
# The combined rides are read from data/other, which is where the
# "Combining the normal rides from MoD" cell writes rides_combined.csv.
df = pd.read_csv(f"{repo}/data/other/rides_combined.csv", index_col=0)
df_stops = pd.read_excel(
    f"{repo}/data/other/MoDstops+Preismodell.xlsx", sheet_name="MoDstops"
)
vehicle_usage_df = pd.read_excel(
    f"{repo}/data/vehicle_data/MoD_Vehicle Usage_2021+2022-05-15.xlsx"
)
external_df = pd.read_excel(
    f"{repo}/data/vehicle_data/Autofleet_Rides with External ID_2021+2022-05-15.xlsx"
)

# Eliminate duplicates
df = data_cleaning.clean_duplicates(df)

# Clean the data using our cleaning functions
df = data_cleaning.data_cleaning(df, df_stops)

# Add shared rides to our data pool
df = data_cleaning.add_shared_rides(df, vehicle_usage_df, external_df)

# Final check of the cleaned data. Rows flagged as incorrect are written to a
# timestamped Excel file so they can be analysed and the cleaning script adapted.
print("check cleaned data")
df, df_incorrect = data_cleaning.data_check(df)
if not df_incorrect.empty:
    df_incorrect.to_excel(f"{repo}/data/cleaning/incorrect{int(time.time())}.xlsx")

# Save the cleaned data. This file is the input for the later use cases.
df.to_csv(f"{repo}/data/cleaning/data_cleaned.csv", index=False)

clean id
clean distance
clean addresses
clean free_rides
clean created_at
clean scheduled_to
clean dispatched_at
clean vehicle_arrived_at
clean arriving_push
clean earliest_pickup_expectation
clean pickup_at
clean pickup_eta
clean pickup_first_eta
clean dropoff_at
clean dropoff_eta
clean dropoff_first_eta
clean time periods
clean rating
add shared rides
check cleaned data


## Ride Simulation

## Use case 1: Probabilistic graph model - shortest path, drones

In [24]:
# Load the cleaned rides and parse the scheduling timestamp into datetimes.
cleaned_drives = pd.read_csv(f"{repo}/data/cleaning/data_cleaned.csv").assign(
    scheduled_to=lambda rides: pd.to_datetime(rides["scheduled_to"])
)

# Aggregate over the whole period covered by the data, so that every route
# gets its average time to destination.
start_date, end_date = (
    cleaned_drives["scheduled_to"].min(),
    cleaned_drives["scheduled_to"].max(),
)
aggregated_drives = utils.calculate_drives(cleaned_drives, start_date, end_date)
aggregated_drives.head()

Unnamed: 0,pickup_address,dropoff_address,number_of_drives,waiting_time,avg_ride_time,avg_time_to_destination
0,1001,1004,4,83.75,0.001704,83.751704
1,1001,1005,19,17.631579,0.001543,17.633122
2,1001,1007,2,167.5,0.002882,167.502882
3,1001,1008,3,111.666667,0.003538,111.670204
4,1001,1012,2,167.5,0.005694,167.505694


In [16]:
# Build a graph from the aggregated drives. The weight of an edge between two
# stops is defined as the avg_time_to_destination of that route.
graph = utils.calculate_graph(aggregated_drives)

<networkx.classes.digraph.DiGraph at 0x11f692c10>

In [33]:
# Shortest connection from stop 1001 to stop 4018 in the graph built from the
# aggregated drives (no drone flights added).
path, time_to_destination = utils.get_shortest_ride(1001, 4018, graph)
print(f"The shortest path uses the following stops {path} and takes {int(time_to_destination)} days")

The shortest path uses the following stops [1001, 1005, 4018] and takes 185 days


In [25]:
# Load the "Liste 2022" sheet of the MoD stops workbook.
edges = pd.read_excel(
    f"{repo}/data/other/MoDstops+Preismodell.xlsx",
    sheet_name="Liste 2022",
)

# Ask for the hotspots with n=10 (presumably the ten most central stops;
# verify against utils.get_hotspots).
hotspots = utils.get_hotspots(edges, aggregated_drives, n=10)
print(f"The following spots are the hotspots of our graph: {hotspots}")

In [30]:
# Stops that get a drone connection.
drone_spots = [
    1008,
    4025,
    6004,
    12007,
    11017,
    15013,
    3021,
    8001,
    5001,
    11003,
    4016,
]

# Extend the aggregated drives with drone flights between these stops.
# NOTE(review): the unit of radius=500 is defined in utils.add_drone_flights; confirm there.
aggregated_drives_with_drone_flights = utils.add_drone_flights(
    edges, aggregated_drives, drone_spots, radius=500
)
aggregated_drives_with_drone_flights.tail()

Unnamed: 0,pickup_address,dropoff_address,avg_time_to_destination
38793,15012,15013,0.000451
38991,15013,15010,0.000536
38993,15013,15012,0.000451
38994,15013,15014,0.000747
39194,15014,15013,0.000747


In [31]:
# Build a second graph from the drives that include the added drone flights.
graph_with_drone_flights = utils.calculate_graph(aggregated_drives_with_drone_flights)

In [34]:
# Same query as before (stop 1001 to stop 4018), now on the graph that includes
# drone flights. Note that this overwrites path and time_to_destination.
path, time_to_destination = utils.get_shortest_ride(1001, 4018, graph_with_drone_flights)
print(f"The shortest path uses the following stops {path} and takes {int(time_to_destination)} days")

The shortest path uses the following stops [1001, 1005, 4025, 1010, 4017, 4016, 4018] and takes 110 days


## Use case 2: Feasibility Analysis using our ride simulation

In [36]:
# Load the pre-generated simulated rides (sim_rides_500k.csv) and derive the
# regression metrics from them; edges comes from the use case 1 cells above.
simulated_rides = pd.read_csv(f"{repo}/data/simulated/sim_rides_500k.csv")
regression_metrics = utils.getRegressionMetrics(simulated_rides, edges)
regression_metrics

In [41]:
# Number of rides derived from the "avg_w/o_drones" metric for a target value of 5
# (presumably a maximum delivery time in days; verify against utils.get_rides_num).
needed_rides = utils.get_rides_num(5, regression_metrics, "avg_w/o_drones")

In [46]:
# Build regression_metrics for the main routes only, i.e. simulated rides whose
# pickup AND dropoff are both main spots.
main_spots = [1008, 4025, 6004, 12007, 11017, 15013, 3021, 8001, 5001, 11003, 4016]
starts_at_main_spot = simulated_rides["pickup_address"].isin(main_spots)
ends_at_main_spot = simulated_rides["dropoff_address"].isin(main_spots)
rides_main_routes = simulated_rides[starts_at_main_spot & ends_at_main_spot]

# NOTE(review): the meaning of the positional arguments 100 and 1000 is defined
# in utils.getRegressionMetrics; confirm there.
regression_metrics_main_routes = utils.getRegressionMetrics(
    rides_main_routes, edges, 100, 1000
)
regression_metrics_main_routes

Unnamed: 0,#_simulated_rides,diameter_w/o_drones,avg_w/o_drones,diameter_with_drones,avg_with_drones
0,90.909091,226.640917,49.882300,226.640917,49.882300
1,100.000000,403.072699,64.040244,403.072699,64.040244
2,109.090909,169.103503,43.955881,169.103503,43.955881
3,118.181818,151.193304,36.526191,151.193304,36.526191
4,127.272727,211.471644,39.533122,211.471644,39.533122
...,...,...,...,...,...
154,1490.909091,15.739725,3.376965,15.739725,3.376965
155,1500.000000,15.629014,3.335916,15.629014,3.335916
156,1509.090909,15.627056,3.327226,15.627056,3.327226
157,1518.181818,15.627214,3.316836,15.627214,3.316836


In [47]:
# Same computation as needed_rides, restricted to the main-route metrics
# (target value 5 on "avg_w/o_drones").
needed_rides_main_routes = utils.get_rides_num(5, regression_metrics_main_routes, "avg_w/o_drones")

In [48]:
# Break-even chart for the main routes: simulated data points, the fitted
# regression curve, and two guide lines marking the delivery-time threshold and
# the matching number of rides.

# Threshold marked on the chart. Keep in sync with the first argument of the
# utils.get_rides_num(5, ...) call that produced needed_rides_main_routes.
max_days = 5
x_range = [0, 20]

# Scatter plot of the simulated data
needed_rides_fig1 = px.scatter(
    regression_metrics_main_routes,
    x="avg_w/o_drones",
    y="#_simulated_rides",
    color_discrete_sequence=["DarkKhaki"],
    title="Break Even of Rides",
    range_x=x_range,
)
needed_rides_fig1["data"][0]["name"] = "Simulated Rides Data"
needed_rides_fig1["data"][0]["showlegend"] = False

# Line plot of the regressed data
regressed_rides = utils.regression_function(
    regression_metrics_main_routes["avg_w/o_drones"],
    *utils.get_opt_parameter(regression_metrics_main_routes, "avg_w/o_drones"),
)
needed_rides_fig2 = px.line(
    x=regression_metrics_main_routes["avg_w/o_drones"],
    y=regressed_rides,
    color_discrete_sequence=["DarkCyan"],
    range_x=x_range,
)
needed_rides_fig2["data"][0]["name"] = "Regression of Rides Data"
needed_rides_fig2["data"][0]["showlegend"] = False

# Vertical guide line working as cursor for the current max days
needed_rides_fig3 = px.line(
    x=[max_days, max_days],
    y=[0, needed_rides_main_routes],
    color_discrete_sequence=["tomato"],
)
needed_rides_fig3["data"][0]["name"] = "Max Days for Delivery"
needed_rides_fig3["data"][0]["showlegend"] = False

# Horizontal guide line at the number of rides needed for max_days.
# Named and hidden from the legend like the other three traces.
needed_rides_fig4 = px.line(
    x=[0, max_days],
    y=[needed_rides_main_routes, needed_rides_main_routes],
    color_discrete_sequence=["tomato"],
)
needed_rides_fig4["data"][0]["name"] = "Needed Rides"
needed_rides_fig4["data"][0]["showlegend"] = False

# Combine all traces into one figure, reusing the scatter plot's layout
# (title, axis labels, x-range).
needed_rides_fig = go.Figure(
    data=needed_rides_fig1.data
    + needed_rides_fig2.data
    + needed_rides_fig3.data
    + needed_rides_fig4.data,
    layout=needed_rides_fig1.layout,
)
needed_rides_fig

In [55]:
# Calculate needed drivers

# Fixed seed so the sampled ride pool, and therefore the charts below, are
# identical on every Restart & Run All.
sample_seed = 42

# Build a ride pool of needed_rides rows: top up the real rides with simulated
# ones when there are too few, otherwise draw a subset of the real rides.
if needed_rides - len(cleaned_drives) > 0:
    simulated_rides_1 = simulated_rides.sample(
        int(needed_rides - len(cleaned_drives)), random_state=sample_seed
    )
    total_rides = pd.concat([cleaned_drives, simulated_rides_1])
else:
    total_rides = cleaned_drives.sample(int(needed_rides), random_state=sample_seed)

hours = list(range(24))
numb_drivers_per_hour = []
avg_drives_per_hour = []
for hour in hours:
    # One call per hour; element 0 is used as the driver count and element 1 as
    # the average number of drives.
    # NOTE(review): the meaning of the 0.3 argument is defined in
    # utils.calculate_number_drivers; confirm there.
    drivers_result = utils.calculate_number_drivers(total_rides, hour, 0.3)
    numb_drivers_per_hour.append(drivers_result[0])
    avg_drives_per_hour.append(drivers_result[1])

df_drivers_per_hour = pd.DataFrame({"hour": hours, "drivers": numb_drivers_per_hour})
df_drives_per_hour = pd.DataFrame({"hour": hours, "drives": avg_drives_per_hour})

fig_drivers = px.bar(df_drivers_per_hour, x="hour", y="drivers")
fig_drives = px.bar(df_drives_per_hour, x="hour", y="drives")
fig_drives.show()
fig_drivers.show()