In [1]:
import pandas as pd 
import git
import math
import numpy as np
import geopandas as gpd
import plotly.express as px
import networkx as nx
from datetime import datetime as dt
from vehicle_stream_pipeline.utils import get_shortest_ride, get_hotspots, test, calculate_drives, calculate_graph, generateRideSpecs, add_drone_flights, create_circle, generateRideSpecs, find_id_for_name, getRegressionMetrics

repo = git.Repo(".", search_parent_directories=True).git.rev_parse(
    "--show-toplevel")

ImportError: cannot import name 'test' from 'vehicle_stream_pipeline.utils' (/usr/local/lib/python3.9/site-packages/vehicle_stream_pipeline/utils.py)

In [48]:
drives = pd.read_csv(f"{repo}/data/cleaning/data_cleaned.csv")
drives = drives[(drives["state"] == "completed")]
edges = pd.read_excel(f"{repo}/data/other/MoDstops+Preismodell.xlsx", sheet_name= 'Liste 2022')

In [49]:
edges.rename(columns={"Start #": "start_id",
                "Ende #": "end_id"}, inplace=True)

In [43]:
df_stops = pd.read_excel(
    f"{repo}/data/other/MoDstops+Preismodell.xlsx", sheet_name='MoDstops')

In [52]:
startdate = drives.scheduled_to.min()
startdate = dt.strptime(startdate, '%Y-%m-%d %H:%M:%S')
startdate = startdate.strftime('%m/%d/%Y')
start_date = dt.strptime(startdate, "%m/%d/%Y")
enddate = drives.scheduled_to.max()
enddate = dt.strptime(enddate, '%Y-%m-%d %H:%M:%S')
enddate = enddate.strftime('%m/%d/%Y')
end_date = dt.strptime(enddate, "%m/%d/%Y")

In [54]:
aggregated_drives = calculate_drives(drives, start_date, end_date)

In [56]:
hotspots = get_hotspots(edges, aggregated_drives)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_edges_filtered = df_edges_filtered[df_edges_filtered.Spots != "Not in graph"]


In [36]:
sim_df_large = pd.read_csv(f"{repo}/data/simulated/sim_rides_500k.csv")


In [123]:
main_spots = [1008, 4025, 6004, 12007, 11017, 15013, 3021, 8001, 5001, 11003, 4016]

In [124]:
rides_main_routes = sim_df_large[(sim_df_large["pickup_address"].isin(main_spots)) & (sim_df_large["dropoff_address"].isin(main_spots))]

In [125]:
rides_main_routes_normal = drives[(drives["pickup_address"].isin(main_spots)) & (drives["dropoff_address"].isin(main_spots))]

In [115]:
all_main_routes = pd.concat([rides_main_routes, rides_main_routes_normal])

In [86]:
len(all_main_routes)

8104

In [121]:
find_id_for_name("Birkenweg", df_stops)

4016

In [30]:
all_main_routes.to_csv(f"{repo}/data/rides_main_routes.csv")

In [129]:
all_main_routes[["pickup_address", "dropoff_address"]].drop_duplicates().sort_values("pickup_address")

Unnamed: 0,pickup_address,dropoff_address
171,1008,6004
181,1008,11017
338,1008,4025
3866,1008,15013
3164,1008,8001
1064,1008,3021
1964,1008,12007
492176,3021,15013
106209,3021,12007
29335,3021,1008


In [134]:
def getDeliveryTimes(rides_simulated, df_edges, month_diff, drone_radius=500, only_main_routes = False):
    """Takes an input dateframe of rides (simulated or not simulated), transforms the data into graphs and applies diameter and average shortest path calculations.
        The output is used for regression of diameter and average shortest path
    Args:
        rides_simulated (DataFrame): DataFrame containing Rides in format provided by MoD
        df_edges (DataFrame): DataFrame containing stop combinations in format provided by MoD
        month_diff (int): No of original data, to divide # of rides
        drone_radius (int, optional): Radius in meter where drone can connect stops directly. Defaults to 500.
    Returns:
        List: A list with the values ["#_simulated_rides", "diameter_w/o_drones", "avg_w/o_drones", "diameter_with_drones", "avg_with_drones"] for the given dataframe
    """

    rides_simulated["scheduled_to"] = pd.to_datetime(
        rides_simulated["scheduled_to"])
    start_date = rides_simulated["scheduled_to"].min()
    end_date = rides_simulated["scheduled_to"].max()

    # rides without drones: calculate graph, diameter and average_shortest_path
    drives_without_drones = calculate_drives(
        rides_simulated, start_date, end_date)
    graph_without_drones = calculate_graph(drives_without_drones)
    # graph needs to be strongly connected to calcutate diameter
    if nx.is_strongly_connected(graph_without_drones):
        diameter = nx.diameter(graph_without_drones,
                               weight="avg_time_to_destination")
    else:
        diameter = 0
    
    if nx.is_weakly_connected(graph_without_drones):
        avg = nx.average_shortest_path_length(
            graph_without_drones, weight="avg_time_to_destination"
        )
    else:
        avg = 0

    # rides with drones: calculate graph, diameter and average_shortest_path
    if only_main_routes:
        drone_spots = []
    else:
        drone_spots = [15011, 13001, 2002, 11007,
                        4016, 1009, 3020, 9019, 9005, 4025]
                        
    drives_with_drones = add_drone_flights(
        df_edges, drives_without_drones, drone_spots=drone_spots, radius=drone_radius
    )
    # graph needs to be strongly connected to calcutate diameter
    graph_with_drones = calculate_graph(drives_with_drones)
    if nx.is_strongly_connected(graph_with_drones):
        diameter_with_drones = nx.diameter(
            graph_with_drones, weight="avg_time_to_destination"
        )
    else:
        diameter = 0

    if nx.is_weakly_connected(graph_without_drones):
        avg_with_drones = nx.average_shortest_path_length(
            graph_with_drones, weight="avg_time_to_destination"
        )

    return [
        len(rides_simulated) / month_diff,
        diameter,
        avg,
        diameter_with_drones,
        avg_with_drones,
    ]

In [135]:
def getRegressionMetrics(rides_simulated, df_edges, stepsize=15000, lower_boundary = 10000, only_main_routes = False):
    """Takes an input dateframe of rides (simulated or not simulated), applies getDeliveryTimes() for increasing sample sizes of simulated rides to built one dataframe for regression
        The output is a dataframe of increasing sample sizes of simulated rides and used for regression of diameter and average shortest path
    Args:
        rides_simulated (DataFrame): DataFrame containing Rides in format provided by MoD
        df_edges (DataFrame): DataFrame containing stop combinations in format provided by MoD
        stepsize (int, optional): Stepsize determining in which increasing order samples of the original df will be created. Defaults to 15000.
    Returns:
        DataFrame: DataFrame containing the metrics ["#_simulated_rides", "diameter_w/o_drones", "avg_w/o_drones", "diameter_with_drones", "avg_with_drones"] for increasing number of simulated rides
    """
    # month_diff used to nomalize number of rides simulated to 1 month
    rides_simulated["scheduled_to"] = pd.to_datetime(
        rides_simulated["scheduled_to"])
    start_date = rides_simulated["scheduled_to"].min()
    end_date = rides_simulated["scheduled_to"].max()
    month_diff = (
        (end_date.year - start_date.year) *
        12 + end_date.month - start_date.month
    )

    upper_boundary = len(rides_simulated)
    results_df = pd.DataFrame(
        columns=[
            "#_simulated_rides",
            "diameter_w/o_drones",
            "avg_w/o_drones",
            "diameter_with_drones",
            "avg_with_drones",
        ]
    )

    for n in list(range(lower_boundary, upper_boundary, stepsize)):
        current_sample_df = rides_simulated.sample(n=n)
        results_df.loc[len(results_df)] = getDeliveryTimes(
            current_sample_df, df_edges, month_diff, only_main_routes
        )

    return results_df

In [136]:
# execute regression metrics funciton to diameter and average_shortes_path for graphs with and without drones
regression_metrics = getRegressionMetrics(all_main_routes, edges, 100, 1000)


In [137]:
regression_metrics

Unnamed: 0,#_simulated_rides,diameter_w/o_drones,avg_w/o_drones,diameter_with_drones,avg_with_drones
0,90.909091,72.653850,23.682071,72.653850,23.682071
1,100.000000,124.521863,29.487144,124.521863,29.487144
2,109.090909,85.473142,23.166745,85.473142,23.166745
3,118.181818,87.986405,21.783757,87.986405,21.783757
4,127.272727,51.368019,17.584886,51.368019,17.584886
...,...,...,...,...,...
129,1263.636364,7.878918,2.010328,7.878918,2.010328
130,1272.727273,7.818969,2.004634,7.818969,2.004634
131,1281.818182,7.827969,1.992421,7.827969,1.992421
132,1290.909091,7.755559,1.975658,7.755559,1.975658


In [138]:
regression_metrics.to_csv(
    f"{repo}/data/regression/graph_metrics_main_routes.csv")

In [107]:
generateRideSpecs(drives, df_stops, edges, 5000, 9, 2022)

Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,pickup_address,...,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,index,Vehicle Id,shared_rides_1,shared_rides_2,shared_rides_3
0,1662578256-0,0-1662578256,3035,1.0,,,,False,VRN,1008,...,,,,,,,,,,
1,1662578256-1,1-1662578256,6874,1.0,,,,False,STANDARD,3017,...,,,,,,,,,,
2,1662578256-2,2-1662578256,6567,1.0,,,,False,VRN,1014,...,,,,,,,,,,
3,1662578256-3,3-1662578256,5041,1.0,,,,False,VRN,1010,...,,,,,,,,,,
4,1662578256-4,4-1662578256,4409,1.0,,,,True,VRN,1008,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1662578256-4995,4995-1662578256,4002,1.0,,,,False,STANDARD,4041,...,,,,,,,,,,
4996,1662578256-4996,4996-1662578256,1491,1.0,,,,False,VRN,4020,...,,,,,,,,,,
4997,1662578256-4997,4997-1662578256,3943,1.0,,,,True,STANDARD,5006,...,,,,,,,,,,
4998,1662578256-4998,4998-1662578256,3231,1.0,,,,False,STANDARD,12009,...,,,,,,,,,,
