# Walkthrough

In [4]:
#from vehicle_stream_pipeline import utils
from vehicle_stream_pipeline.utils import data_cleaning as dc
from vehicle_stream_pipeline.utils import feasibility_analysis as fa
from vehicle_stream_pipeline.utils import prob_model as pm
from vehicle_stream_pipeline.utils import ride_simulation as rs

import pandas as pd
import git
import os
import warnings
import time
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Suppress all warnings for the rest of the notebook session.
warnings.filterwarnings("ignore")

# Resolve the top-level directory of the enclosing git repository; every data
# path used below is built relative to this string.
git_repo = git.Repo(".", search_parent_directories=True)
repo = git_repo.git.rev_parse("--show-toplevel")

## Combining the normal rides from MoD

In [5]:
# Build the combined frames from the raw ride exports in data/normal_rides.
all_rides = dc.create_overall_dataframes(f"{repo}/data/normal_rides")

# Write each combined frame to its own CSV under data/other.
# Keys are the entries read from `all_rides`; values are the target files.
combined_outputs = {
    "df_kpi": f"{repo}/data/other/kpi_combined.csv",
    "df_mtd": f"{repo}/data/other/mtd_combined.csv",
    "df_rides": f"{repo}/data/other/rides_combined.csv",
}
for frame_key, csv_path in combined_outputs.items():
    all_rides[frame_key].to_csv(csv_path)

## Cleaning the data

In [7]:
# Read in all files needed for cleaning the data:
# the combined rides, the stop list, and the two vehicle/ride exports that are
# passed to dc.add_shared_rides below.
df = pd.read_csv(f"{repo}/data/other/rides_combined.csv", index_col=0)
df_stops = pd.read_excel(
    f"{repo}/data/other/MoDstops+Preismodell.xlsx", sheet_name="MoDstops"
)
vehicle_usage_df = pd.read_excel(
    f"{repo}/data/vehicle_data/MoD_Vehicle Usage_2021+2022-05-15.xlsx"
)
external_df = pd.read_excel(
    f"{repo}/data/vehicle_data/Autofleet_Rides with External ID_2021+2022-05-15.xlsx"
)

# Eliminate duplicates
df = dc.clean_duplicates(df)

# Clean the data using our cleaning functions
df = dc.data_cleaning(df, df_stops)

# Add shared rides to our data pool
df = dc.add_shared_rides(df, vehicle_usage_df, external_df)

# Final check of the cleaned data. dc.data_check returns the checked frame and
# a second frame of incorrect rows. If there are any, we store them in a
# timestamped Excel file so they can be analysed and the cleaning script adapted.
print("check cleaned data")
df, df_incorrect = dc.data_check(df)
if not df_incorrect.empty:
    df_incorrect.to_excel(f"{repo}/data/cleaning/incorrect{int(time.time())}.xlsx")

# Save the cleaned data. This file is the input for the later use cases.
df.to_csv(f"{repo}/data/cleaning/data_cleaned.csv", index=False)

clean id
clean distance
clean addresses


KeyboardInterrupt: 

## Ride Simulation

## Use case 1: Probabilistic graph model - shortest path, drones

In [8]:
# First we create our aggregated drives, so that the average time to
# destination is calculated for every route.
cleaned_drives = pd.read_csv(f"{repo}/data/cleaning/data_cleaned.csv")

# Parse the schedule column so its earliest and latest values are timestamps.
cleaned_drives["scheduled_to"] = pd.to_datetime(cleaned_drives["scheduled_to"])
scheduled_to = cleaned_drives["scheduled_to"]
start_date, end_date = scheduled_to.min(), scheduled_to.max()

# Aggregate over the full date range covered by the cleaned data.
aggregated_drives = pm.calculate_drives(cleaned_drives, start_date, end_date)
aggregated_drives.head()

Unnamed: 0,pickup_address,dropoff_address,number_of_drives,waiting_time,avg_ride_time,avg_time_to_destination
0,1001,1004,4,83.75,0.001704,83.751704
1,1001,1005,19,17.631579,0.001543,17.633122
2,1001,1007,2,167.5,0.002882,167.502882
3,1001,1008,3,111.666667,0.003538,111.670204
4,1001,1012,2,167.5,0.005694,167.505694


In [9]:
# Based on our aggregated drives we build a graph. The weight of an edge between two stops is that route's avg_time_to_destination.
graph = pm.calculate_graph(aggregated_drives)

In [10]:
# Ask for the shortest ride between two stops on the graph built from the
# aggregated drives (no drone flights).
origin_stop, destination_stop = 1001, 4018
path, time_to_destination = pm.get_shortest_ride(origin_stop, destination_stop, graph)
print(f"The shortest path uses the following stops {path} and takes {int(time_to_destination)} days")

The shortest path uses the following stops [1001, 1005, 4018] and takes 185 days


In [12]:
# Load the "Liste 2022" sheet of the stops workbook. The resulting `edges`
# frame is reused by later cells.
stops_workbook = f"{repo}/data/other/MoDstops+Preismodell.xlsx"
edges = pd.read_excel(stops_workbook, sheet_name="Liste 2022")

# Request 10 hotspots from the probabilistic model.
hotspots = pm.get_hotspots(edges, aggregated_drives, n=10)
print(f"The following spots are the hotspots of our graph: {hotspots}")

The following spots are the hotspots of our graph: [1008, 4025, 1005, 1009, 1007, 12007, 7001, 6004, 1010, 11017]


In [13]:
# Stops passed to pm.add_drone_flights as drone spots.
drone_spots = [
    1008,
    4025,
    6004,
    12007,
    11017,
    15013,
    3021,
    8001,
    5001,
    11003,
    4016,
]

# Extend the aggregated drives with drone flights.
# NOTE(review): the unit of `radius` is defined by pm.add_drone_flights — confirm.
aggregated_drives_with_drone_flights = pm.add_drone_flights(
    edges,
    aggregated_drives,
    drone_spots,
    radius=500,
)
aggregated_drives_with_drone_flights.tail()

Unnamed: 0,pickup_address,dropoff_address,avg_time_to_destination
38793,15012,15013,0.000451
38991,15013,15010,0.000536
38993,15013,15012,0.000451
38994,15013,15014,0.000747
39194,15014,15013,0.000747


In [14]:
# Build a second graph with the same pm.calculate_graph call, this time from the drives table that includes the drone flights.
graph_with_drone_flights = pm.calculate_graph(aggregated_drives_with_drone_flights)

In [15]:
# Repeat the shortest-ride query for the same pair of stops, now on the graph
# that includes drone flights.
origin_stop, destination_stop = 1001, 4018
path, time_to_destination = pm.get_shortest_ride(
    origin_stop, destination_stop, graph_with_drone_flights
)
print(f"The shortest path uses the following stops {path} and takes {int(time_to_destination)} days")

The shortest path uses the following stops [1001, 1005, 4025, 1010, 4017, 4016, 4018] and takes 110 days


## Use case 2: Feasibility Analysis using our ride simulation

In [16]:
# Load the pre-generated simulated rides from data/simulated.
simulated_rides_csv = f"{repo}/data/simulated/sim_rides_500k.csv"
simulated_rides = pd.read_csv(simulated_rides_csv)

# Compute the regression metrics table from the simulated rides and the edge
# list, then display it.
regression_metrics = fa.getRegressionMetrics(simulated_rides, edges)
regression_metrics

Unnamed: 0,#_simulated_rides,diameter_w/o_drones,avg_w/o_drones,diameter_with_drones,avg_with_drones
0,909.090909,788.286237,240.121443,788.286237,240.121443
1,2272.727273,566.82432,114.041593,566.82432,114.041593
2,3636.363636,351.428939,78.554808,351.428939,78.554808
3,5000.0,346.182608,57.883003,346.182608,57.883003
4,6363.636364,311.140444,46.528476,311.140444,46.528476
5,7727.272727,236.965661,39.925927,236.965661,39.925927
6,9090.909091,231.169627,34.328291,231.169627,34.328291
7,10454.545455,210.173713,30.096059,210.173713,30.096059
8,11818.181818,199.637274,27.19874,199.637274,27.19874
9,13181.818182,168.480334,24.882718,168.480334,24.882718


In [18]:
# Query fa.get_rides_num with a target of 5 against the "avg_w/o_drones" column of the regression metrics.
# NOTE(review): presumably this is the number of rides needed to reach an average of 5 days without drones — confirm against fa.get_rides_num.
needed_rides = fa.get_rides_num(5, regression_metrics, "avg_w/o_drones")

In [19]:
# Build regression_metrics for the main routes only
main_spots = [
    1008,
    4025,
    6004,
    12007,
    11017,
    15013,
    3021,
    8001,
    5001,
    11003,
    4016,
]

# A ride is on a main route when both its pickup and its dropoff are main spots.
pickup_is_main = simulated_rides["pickup_address"].isin(main_spots)
dropoff_is_main = simulated_rides["dropoff_address"].isin(main_spots)
rides_main_routes = simulated_rides[pickup_is_main & dropoff_is_main]

# NOTE(review): the meaning of the positional arguments 100 and 1000 is defined
# by fa.getRegressionMetrics — confirm before changing them.
regression_metrics_main_routes = fa.getRegressionMetrics(
    rides_main_routes, edges, 100, 1000
)
regression_metrics_main_routes

Unnamed: 0,#_simulated_rides,diameter_w/o_drones,avg_w/o_drones,diameter_with_drones,avg_with_drones
0,90.909091,140.520376,42.478167,140.520376,42.478167
1,100.000000,394.381531,67.565746,394.381531,67.565746
2,109.090909,418.758542,62.769783,418.758542,62.769783
3,118.181818,127.413133,37.196231,127.413133,37.196231
4,127.272727,347.423449,54.158921,347.423449,54.158921
...,...,...,...,...,...
154,1490.909091,16.429578,3.429338,16.429578,3.429338
155,1500.000000,16.125935,3.397624,16.125935,3.397624
156,1509.090909,16.013559,3.358964,16.013559,3.358964
157,1518.181818,15.626391,3.315844,15.626391,3.315844


In [20]:
# Query fa.get_rides_num with a target of 2 against the main-route regression metrics ("avg_w/o_drones" column).
# The plot in the next cell draws this same value 2 as the "Max Days for Delivery" cursor.
needed_rides_main_routes = fa.get_rides_num(2, regression_metrics_main_routes, "avg_w/o_drones")

In [21]:
def _label_hidden_trace(fig, trace_name):
    """Give the first trace of ``fig`` a name and keep it out of the legend."""
    first_trace = fig["data"][0]
    first_trace["name"] = trace_name
    first_trace["showlegend"] = False


# Target value on the x axis; the same 2 was passed to fa.get_rides_num when
# needed_rides_main_routes was computed.
max_days = 2
avg_days = regression_metrics_main_routes["avg_w/o_drones"]
fitted_params = fa.get_opt_parameter(regression_metrics_main_routes, "avg_w/o_drones")

# Scatter plot of the simulated data
needed_rides_fig1 = px.scatter(
    regression_metrics_main_routes,
    x="avg_w/o_drones",
    y="#_simulated_rides",
    color_discrete_sequence=["DarkKhaki"],
    title="Break Even of Rides",
    range_x=[0, 20],
)
_label_hidden_trace(needed_rides_fig1, "Simulated Rides Data")

# Line plot of the regressed data
needed_rides_fig2 = px.line(
    x=avg_days,
    y=fa.regression_function(avg_days, *fitted_params),
    color_discrete_sequence=["DarkCyan"],
    range_x=[0, 20],
)
_label_hidden_trace(needed_rides_fig2, "Regression of Rides Data")

# Vertical cursor line at the current max days
needed_rides_fig3 = px.line(
    x=[max_days, max_days],
    y=[0, needed_rides_main_routes],
    color_discrete_sequence=["tomato"],
)
_label_hidden_trace(needed_rides_fig3, "Max Days for Delivery")

# Horizontal cursor line at the needed number of rides
needed_rides_fig4 = px.line(
    x=[0, max_days],
    y=[needed_rides_main_routes, needed_rides_main_routes],
    color_discrete_sequence=["tomato"],
)

# Stack all traces into one figure, reusing the scatter plot's layout.
partial_figures = (
    needed_rides_fig1,
    needed_rides_fig2,
    needed_rides_fig3,
    needed_rides_fig4,
)
needed_rides_fig = go.Figure(
    data=tuple(trace for partial in partial_figures for trace in partial.data),
    layout=needed_rides_fig1.layout,
)
needed_rides_fig

In [22]:
# Calculate needed drivers
# Build a pool of needed_rides_main_routes rides: if the cleaned data has fewer
# rides than needed, top it up with sampled simulated rides; otherwise sample
# the needed number from the cleaned rides.
# NOTE: .sample() is called without random_state, so the pool (and the charts
# below) can differ between runs.
missing_rides = needed_rides_main_routes - len(cleaned_drives)
if missing_rides > 0:
    simulated_rides_1 = simulated_rides.sample(int(missing_rides))
    total_rides = pd.concat([cleaned_drives, simulated_rides_1])
else:
    total_rides = cleaned_drives.sample(int(needed_rides_main_routes))

hours = list(range(24))
numb_drivers_per_hour = []
avg_drives_per_hour = []
for hour in hours:
    # Call the helper once per hour and reuse the result: element 0 goes into
    # the drivers chart, element 1 into the drives chart.
    # NOTE(review): the meaning of the 0.3 argument is defined by
    # fa.calculate_number_drivers — confirm before tuning it.
    drivers_and_drives = fa.calculate_number_drivers(total_rides, hour, 0.3)
    numb_drivers_per_hour.append(drivers_and_drives[0])
    avg_drives_per_hour.append(drivers_and_drives[1])

df_drivers_per_hour = pd.DataFrame(
    list(zip(hours, numb_drivers_per_hour)), columns=["hour", "drivers"]
)

df_drives_per_hour = pd.DataFrame(
    list(zip(hours, avg_drives_per_hour)), columns=["hour", "drives"]
)

# One bar chart per metric, hour of day on the x axis.
fig_drivers = px.bar(df_drivers_per_hour, x="hour", y="drivers")
fig_drives = px.bar(df_drives_per_hour, x="hour", y="drives")
fig_drives.show()
fig_drivers.show()