# Data preprocessing

In [1]:
import os
import random
import matplotlib # type: ignore
import numpy as np # type: ignore
import pandas as pd # type: ignore
import networkx as nx # type: ignore
import geopandas as gpd # type: ignore
import matplotlib.pyplot as plt # type: ignore
from shapely.geometry import Point, Polygon, MultiPolygon # type: ignore
from typing import Tuple

# Set working directory: climb towards the filesystem root until the current folder is
# the project folder. At the root os.chdir('..') is a no-op, so stop there with a clear
# error instead of looping forever.
while os.path.basename(os.getcwd()).lower() != 'carsharingmodelcasestudy':
    parent_dir = os.path.dirname(os.getcwd())
    if parent_dir == os.getcwd():
        raise FileNotFoundError(
            "Could not find a 'carsharingmodelcasestudy' directory above the start directory"
        )
    os.chdir(parent_dir)
assert os.path.basename(os.getcwd()).lower() == 'carsharingmodelcasestudy', os.getcwd()

# Pandas settings
pd.options.mode.chained_assignment = None  # Suppress SettingWithCopyWarning
pd.set_option('display.max_columns', None) # Show all columns

In [2]:
import pyrosm # type: ignore
import osmnx as ox # type: ignore
# NOTE(review): this binds the function pyrosm.OSM.__init__ itself to a global named
# `self`; it does not create an OSM object, and `self` is not read anywhere in the
# visible notebook.
self = pyrosm.OSM.__init__
# Open the Copenhagen OSM extract; `osm` is the reader used for networks and POIs below
osm = pyrosm.OSM("./data/OSM/Copenhagen.osm.pbf")
# Read the station table (';'-separated, first column used as the index);
# its 'lng' and 'lat' columns are used further down
data_folder = 'data/'
stations = pd.read_csv(data_folder + '20_css_cop_latlng.csv', sep=';', index_col=0)

In [3]:
# Get walking network; the result is unpacked into a nodes table and an edges table
walk_net = osm.get_network(network_type="walking", nodes=True)
walk_nodes, walk_edges = walk_net
# Build the NetworkX graph used for all nearest-node and shortest-path queries below
G_walk = osm.to_graph(walk_nodes, walk_edges, graph_type="networkx")
# Get POIs (points of interest) from the same OSM extract
pois = osm.get_pois()

## Categorizing POIs based on purpose

We analyzed the OSM tags of the POIs and grouped them by trip purpose (commute, errands, leisure) as follows:

In [4]:
# Tag values that define each trip-purpose group. Names follow
# <purpose>_<number>[_<POI column>]; the suffix is the POI column the values are matched
# against. Empty lists are purposes for which no tag values are defined here.
# The "ammenity" spelling is part of the variable names and of the group labels used
# later in the notebook, so it is left unchanged.
commute_1_shop = pois["shop"].value_counts().index.to_list() # Work: every distinct value of the 'shop' column
commute_2_amenity = ["school", "college", "university"] # Education
errands_1 = [] # Home, perm. residence
errands_2_amenity = ["bus_station", ] # Escorting to/from transport
# Key/value pair looked up in the raw tags; this variable itself is not read anywhere
# in the visible notebook (the pair is hard-coded where the tags are parsed)
errands_2_tags = [{"public_transport": "station"}]
errands_3 = [] # Collect/bring objects
errands_4_shop = pois["shop"].value_counts().index.to_list() # Shopping: same value list as commute_1_shop
errands_5_amenity = ["hospital", "pharmacy"] # Health
leisure_1 = [] # Home
leisure_2_ammenity = ['school', 'community_centre', 'library', 'arts_centre', 'university', 'college'] # After-school, youth club
leisure_3_ammenity = ['kindergarten', 'community_centre', 'social_facility'] # Nursery, crèche, day care
leisure_4 = [] # Visit family/friends
leisure_5_ammenity = ['park'] # Sports
leisure_6_ammenity = ['place_of_worship', 'theatre', 'cinema', 'nightclub', 'casino', 'music_venue', 'hookah_lounge', 'gambling'] # Entertainment
leisure_6_religion = ['christian', 'muslim', 'jewish', 'buddhist', 'scientologist'] # Entertainment group, matched on the 'religion' column
leisure_6_museum = ['art', 'history', 'local', 'military'] # Entertainment group, matched on the 'museum' column
leisure_7 = [] # Allotment/summer cottage
leisure_8 = [] # Leisure round trip
leisure_8_tourism = ['museum', 'attraction', 'viewpoint'] # Leisure round trip, matched on the 'tourism' column
leisure_9_ammenity = ['restaurant', 'cafe', 'pub'] # Holiday, excursion

In [5]:
# Create the column that will hold, per POI, the list of trip-purpose groups it belongs
# to. It starts as None everywhere; rows still None later on are treated as "no group".
pois['relevant_groups'] = None

In [6]:
def update_relevant_group_column(pois_df, column_name, list_values, append_group):
    """Tag every POI whose ``column_name`` value is in ``list_values`` with ``append_group``.

    Parameters
    ----------
    pois_df : DataFrame
        POI table. It is not modified; a 'relevant_groups' column is created on the
        returned copy if it is missing.
    column_name : str
        Column whose values are tested for membership in ``list_values``.
    list_values : list
        Values that qualify a row for the group.
    append_group : str
        Group label added to the 'relevant_groups' list of every qualifying row.

    Returns
    -------
    DataFrame
        Copy of ``pois_df`` in which the 'relevant_groups' entry of each qualifying row
        is a sorted, duplicate-free list containing ``append_group``. Other rows keep
        their previous value (e.g. None).
    """
    # Work on a copy so that no scratch column or partial update leaks into the caller's frame
    result = pois_df.copy()
    if 'relevant_groups' not in result.columns:
        result['relevant_groups'] = None

    matches = result[column_name].isin(list_values)

    def _with_group(current):
        # Anything that is not already a list (None/NaN) counts as "no groups yet"
        existing = current if isinstance(current, list) else []
        # sorted() makes the order reproducible; a plain set order varies between runs
        return sorted(set(existing + [append_group]))

    result['relevant_groups'] = [
        _with_group(current) if hit else current
        for current, hit in zip(result['relevant_groups'], matches)
    ]
    return result

# One rule per attribute-based group: (POI column, accepted values, group label).
# The rules are applied in the order listed.
_group_rules = [
    ('shop', commute_1_shop, 'commute_1_shop'),
    ('amenity', commute_2_amenity, 'commute_2_amenity'),
    ('amenity', errands_2_amenity, 'errands_2_amenity'),
    ('shop', errands_4_shop, 'errands_4_shop'),
    ('amenity', errands_5_amenity, 'errands_5_amenity'),
    ('amenity', leisure_2_ammenity, 'leisure_2_ammenity'),
    ('amenity', leisure_3_ammenity, 'leisure_3_ammenity'),
    ('amenity', leisure_5_ammenity, 'leisure_5_ammenity'),
    ('amenity', leisure_6_ammenity, 'leisure_6_ammenity'),
    ('religion', leisure_6_religion, 'leisure_6_religion'),
    ('museum', leisure_6_museum, 'leisure_6_museum'),
    ('tourism', leisure_8_tourism, 'leisure_8_tourism'),
    ('amenity', leisure_9_ammenity, 'leisure_9_ammenity'),
]
for _rule_column, _rule_values, _rule_label in _group_rules:
    pois = update_relevant_group_column(pois, _rule_column, _rule_values, _rule_label)


In [None]:
# To handle errands_2_tags we need to parse the tags column and then update the relevant_groups
# Parse tags to dictionary
def parse_tags(x):
    """Turn a raw ``tags`` value into a dictionary.

    Parameters
    ----------
    x : dict, str or anything else
        A dictionary is returned unchanged. A string is parsed as JSON first and, if
        that fails, as a Python literal (e.g. a single-quoted dict repr).

    Returns
    -------
    dict or None
        The parsed dictionary, or None when ``x`` is neither a dict nor a string, cannot
        be parsed, or does not parse to a dictionary.
    """
    # Local imports keep this cell self-contained
    import ast
    import json

    if isinstance(x, dict):
        return x
    if not isinstance(x, str):
        return None
    # The strings come from a data file, so they are parsed as data and never executed
    # with eval(); JSON also covers true/false/null, which eval() rejects with NameError.
    try:
        parsed = json.loads(x)
    except ValueError:
        try:
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError):
            return None
    return parsed if isinstance(parsed, dict) else None

# Parse the raw 'tags' value of every POI into a dictionary (or None)
pois['parsed_tags'] = pois['tags'].apply(parse_tags)


def _tag_public_transport_station(row):
    """Return the row's groups, extended by "errands_2_tags" for public-transport stations."""
    tags = row['parsed_tags']
    if not (isinstance(tags, dict) and tags.get('public_transport') == 'station'):
        # Not a station: leave the groups exactly as they are
        return row['relevant_groups']
    current = row['relevant_groups'] if isinstance(row['relevant_groups'], list) else []
    return list(set(current + ["errands_2_tags"]))


# Add the "errands_2_tags" group to rows whose parsed tags contain public_transport == station
pois['relevant_groups'] = pois.apply(_tag_public_transport_station, axis=1)
# Number of POIs that belong to at least one group
print(len(pois[pois['relevant_groups'].notnull()]['relevant_groups']))

In [8]:
# Keep only the rows where 'relevant_groups' is not None. The explicit copy makes
# pois_filtered an independent frame, so the columns added to it below are not
# assignments on a slice of `pois`.
pois_filtered = pois[pois['relevant_groups'].notnull()].copy()

# Create centroids for POIs
def get_centroid(geometry):
    """Return a representative point for a POI geometry.

    A Point is returned as is, a Polygon or MultiPolygon is reduced to its centroid,
    and every other input yields None.
    """
    if isinstance(geometry, Point):
        return geometry
    has_area = isinstance(geometry, (Polygon, MultiPolygon))
    return geometry.centroid if has_area else None

# Compute the centroid of every POI geometry; geometries that get_centroid does not
# handle produce None here and are filtered out in the next cell
pois_filtered['centroid'] = pois_filtered['geometry'].apply(get_centroid)

In [None]:
# Drop POIs without a centroid; the two counts show how many rows were removed
print(len(pois_filtered))
pois_filtered = pois_filtered[pois_filtered['centroid'].notnull()]
print(len(pois_filtered))

In [10]:
# Split each centroid point into separate longitude (x) and latitude (y) columns
pois_filtered['centroid_lon'] = [centroid.x for centroid in pois_filtered['centroid']]
pois_filtered['centroid_lat'] = [centroid.y for centroid in pois_filtered['centroid']]

In [11]:
# GeoDataFrame of the candidate POIs, positioned at their centroids (EPSG:4326 lon/lat)
_poi_points = gpd.points_from_xy(pois_filtered['centroid_lon'], pois_filtered['centroid_lat'])
pois_gdf = gpd.GeoDataFrame(pois_filtered, geometry=_poi_points, crs="EPSG:4326")

# GeoDataFrame of the stations, positioned at their 'lng'/'lat' coordinates
_station_points = gpd.points_from_xy(stations['lng'], stations['lat'])
stations_gdf = gpd.GeoDataFrame(stations, geometry=_station_points, crs="EPSG:4326")

# (lon, lat) pairs in row order
poi_coords = list(zip(pois_gdf.geometry.x, pois_gdf.geometry.y))
station_coords = list(zip(stations_gdf.geometry.x, stations_gdf.geometry.y))

# Snap every POI and every station to its nearest node of the walking graph
poi_nodes = ox.distance.nearest_nodes(
    G_walk,
    X=[lon for lon, _lat in poi_coords],
    Y=[lat for _lon, lat in poi_coords],
)
station_nodes = ox.distance.nearest_nodes(
    G_walk,
    X=[lon for lon, _lat in station_coords],
    Y=[lat for _lon, lat in station_coords],
)

# Inputs of the distance filter in the next cell
station_node_set = set(station_nodes)
distance_threshold = 1000
filtered_pois = []

In [12]:
# Keep the POIs whose walking distance to at least one station is below the threshold.
#
# One multi-source Dijkstra search started from all station nodes gives, for every node
# within the cutoff, the distance to its closest station. This replaces one
# single-source search per POI. On a directed graph the search runs on the reversed
# graph, so the distances are still measured from the POI towards the station.
_search_graph = G_walk.reverse(copy=False) if G_walk.is_directed() else G_walk
if station_node_set:
    _distance_to_nearest_station = nx.multi_source_dijkstra_path_length(
        _search_graph, station_node_set, cutoff=distance_threshold, weight='length'
    )
else:
    # No stations: nothing is reachable, and the multi-source search rejects empty sources
    _distance_to_nearest_station = {}

# The list is rebuilt from scratch so that re-running this cell does not append
# duplicates to the result of an earlier run.
filtered_pois = [
    pois_filtered.iloc[idx]
    for idx, poi_node in enumerate(poi_nodes)
    if _distance_to_nearest_station.get(poi_node, float('inf')) < distance_threshold
]

In [13]:
# Build a DataFrame from the list of kept POI rows (one row per entry)
filtered_pois_df = pd.DataFrame(filtered_pois)

In [14]:
# Replace leisure_6_ammenity, leisure_6_religion and leisure_6_museum with a single
# 'leisure_6' label. Only these three labels are replaced: every other group the POI
# belongs to is kept (previously the whole list was overwritten with ['leisure_6']).
# dict.fromkeys removes the duplicates created by the merge while keeping the order.
filtered_pois_df['relevant_groups'] = filtered_pois_df['relevant_groups'].apply(
    lambda groups: list(dict.fromkeys(
        'leisure_6'
        if group in ('leisure_6_ammenity', 'leisure_6_religion', 'leisure_6_museum')
        else group
        for group in groups
    ))
)

In [None]:
# Define a function to plot stations and POIs
def plot_stations_pois(stations, pois_filtered, output_path='LaTeX/images/stations_with_pois_near.png'):
    """Plot stations and category-coloured POIs over the driving network and save the figure.

    Parameters
    ----------
    stations : DataFrame
        Must contain 'lng' and 'lat' columns.
    pois_filtered : DataFrame
        Must contain 'centroid_lon', 'centroid_lat' and 'relevant_groups' columns; the
        first entry of each 'relevant_groups' value decides the POI's colour.
    output_path : str, optional
        Where the PNG is written. Defaults to the path used so far.

    Notes
    -----
    Reads the driving network from the module-level ``osm`` reader. Both inputs are
    copied first, so the caller's frames are not modified by the plotting code.
    """
    # Work on copies: building a GeoDataFrame directly on the input and then assigning
    # columns can write through to the caller's frame, depending on the pandas version.
    stations_gdf = gpd.GeoDataFrame(
        stations.copy(),
        geometry=gpd.points_from_xy(stations['lng'], stations['lat']),
        crs="EPSG:4326",
    )
    pois_filtered_gdf = gpd.GeoDataFrame(
        pois_filtered.copy(),
        geometry=gpd.points_from_xy(pois_filtered['centroid_lon'], pois_filtered['centroid_lat']),
        crs="EPSG:4326",
    )

    # The first group of each POI is its plotting category
    pois_filtered_gdf['primary_category'] = pois_filtered_gdf['relevant_groups'].apply(
        lambda groups: groups[0]
    )
    categories = pois_filtered_gdf['primary_category'].unique()

    # One colour per category; cycle through the palette if there are more categories than colours
    cmap = matplotlib.colormaps["Paired"]
    if hasattr(cmap, 'colors'):
        palette = list(cmap.colors)
    else:
        palette = [cmap(i) for i in range(cmap.N)]
    category_to_color = {
        category: palette[i % len(palette)] for i, category in enumerate(categories)
    }
    pois_filtered_gdf['color'] = pois_filtered_gdf['primary_category'].map(category_to_color)

    fig, ax = plt.subplots(figsize=(12, 10))

    # Road network as background
    drive_nodes, drive_edges = osm.get_network(network_type="driving", nodes=True)
    drive_edges.plot(ax=ax, linewidth=0.5, color="gray", alpha=0.4)

    # POIs coloured by category, stations as larger blue markers on top
    pois_filtered_gdf.plot(ax=ax, color=pois_filtered_gdf['color'], marker="o", markersize=5, alpha=0.5, label="POIs")
    stations_gdf.plot(ax=ax, color="blue", marker="o", markersize=45, label="Stations")

    # Fixed map extent (longitude / latitude bounds)
    x_min, x_max = 12.35, 12.75
    y_min, y_max = 55.585, 55.785
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)

    ax.legend()
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")

    # Light axes background; the figure patch itself is fully transparent
    background_color = "#FCFAFC"
    ax.set_facecolor(background_color)
    fig.patch.set_facecolor((0, 0, 0, 0))

    fig.savefig(output_path, format='png', dpi=300, transparent=False)
    # Show the plot
    plt.show()

# Plot the stations together with the POIs kept by the station-distance filter
plot_stations_pois(stations, filtered_pois_df)

## Create synthetic traveller origin-destination data

In [52]:
# Work on a copy of the filtered POIs so the trip-generation steps leave filtered_pois_df untouched
nodes_df = filtered_pois_df.copy()

In [53]:
# Snap every POI centroid to its nearest node of the walking graph
nodes_df_lons = nodes_df['centroid_lon'].values
nodes_df_lats = nodes_df['centroid_lat'].values
nodes_df['node'] = ox.distance.nearest_nodes(G_walk, X=nodes_df_lons, Y=nodes_df_lats)

In [None]:
# Count how often each combination of groups occurs. Lists are unhashable, which makes
# value_counts() fail on them, so each entry is converted to a tuple for counting only.
nodes_df["relevant_groups"].apply(tuple).value_counts()

In [55]:
# Normalise 'relevant_groups' to plain lists. Tuples are converted rather than discarded
# (the plotting function turns the groups into tuples on a frame built from the same
# data, which may carry over depending on the pandas version); any other value becomes
# an empty list.
nodes_df['relevant_groups'] = nodes_df['relevant_groups'].apply(
    lambda groups: list(groups) if isinstance(groups, (list, tuple)) else [])

In [56]:
# Replace leisure_6_ammenity, leisure_6_religion and leisure_6_museum with a single
# 'leisure_6' label. Only these three labels are replaced: every other group the POI
# belongs to is kept (previously the whole list was overwritten with ['leisure_6']).
# dict.fromkeys removes the duplicates created by the merge while keeping the order.
nodes_df['relevant_groups'] = nodes_df['relevant_groups'].apply(
    lambda groups: list(dict.fromkeys(
        'leisure_6'
        if group in ('leisure_6_ammenity', 'leisure_6_religion', 'leisure_6_museum')
        else group
        for group in groups
    ))
)

In [57]:
# Relative weight of each trip-purpose group when drawing trip origins and destinations.
# The values add up to 0.761, not 1; they are normalised where the cumulative
# probabilities are built.
# NOTE(review): 'errands_2_tags' is assigned to POIs earlier in the notebook but has no
# entry here, so POIs carrying only that label can never be drawn - confirm this is intended.
group_probabilities = {
    'commute_1_shop': 0.134,
    'commute_2_amenity': 0.049,
    'errands_2_amenity': 0.01,
    'errands_4_shop': 0.196,
    'errands_5_amenity': 0.034,
    'leisure_2_ammenity': 0.03,
    'leisure_3_ammenity': 0.02,
    'leisure_5_ammenity': 0.034,
    'leisure_6': 0.058,
    'leisure_8_tourism': 0.16,
    'leisure_9_ammenity': 0.036
}

In [73]:
# Map every group name to the POIs in that group, each as [node, centroid_lon, centroid_lat]
group_to_nodes = {
    group: nodes_df[nodes_df['relevant_groups'].apply(lambda x: group in x)]
              [['node', 'centroid_lon', 'centroid_lat']]
              .values.tolist()
    for group in group_probabilities
}
# Remove empty groups
group_to_nodes = {k: v for k, v in group_to_nodes.items() if len(v) > 0}
# Remaining group names, in the order of group_probabilities
groups = [name for name in group_probabilities if name in group_to_nodes]
if not groups:
    raise ValueError("No POIs found for any group in group_probabilities")

# Probability of selecting each remaining group, renormalised to sum to 1
probabilities = np.array([group_probabilities[group] for group in groups], dtype=float)
probabilities = probabilities / probabilities.sum()  # Normalize
cumulative_probabilities = np.cumsum(probabilities)
# Rounding can leave the last cumulative value marginally below 1.0; a random draw above
# it would make searchsorted return len(groups) and index past the end of `groups`.
cumulative_probabilities[-1] = 1.0

# As an example, a single trip destination can be drawn as follows
trip_group = groups[np.searchsorted(cumulative_probabilities, random.random())]
trip_destination = random.choice(group_to_nodes.get(trip_group, []))

In [None]:
# Print the number of POIs in each non-empty group
for group, nodes in group_to_nodes.items():
    print(f"{group}: {len(nodes)}")

In [None]:
# Inspect the sampling tables: number of groups, their names, the normalised
# probabilities and the cumulative probabilities used for the draws
print(len(groups))
print(groups)
print(probabilities)
print(cumulative_probabilities)

In [27]:
def walking_distance(origin_node, destination_node):
    """Return the 'length'-weighted shortest-path distance between two nodes of G_walk.

    When the graph holds no path from ``origin_node`` to ``destination_node`` the result
    is 0, the same value as for identical nodes.
    """
    try:
        return nx.shortest_path_length(G_walk, origin_node, destination_node, weight='length')
    except nx.NetworkXNoPath:
        return 0

In [None]:
num_trips = 2500
min_trip_distance = 1000   # walking_distance units; closer origin-destination pairs are rejected
max_pair_attempts = 100    # node pairs tried for one pair of groups before new groups are drawn
max_group_redraws = 100    # pairs of groups tried for one trip before giving up
progress_every = 250       # print a progress line every this many trips
origins = []
destinations = []
trip_groups = []
# NOTE(review): the `random` module is not seeded anywhere in the visible notebook, so
# every run generates a different set of trips.


def _draw_group():
    """Draw one group name according to cumulative_probabilities."""
    index = np.searchsorted(cumulative_probabilities, random.random())
    # Clamp: rounding in the cumulative sum could otherwise yield index == len(groups)
    return groups[min(index, len(groups) - 1)]


for trip_number in range(1, num_trips + 1):
    origin = destination = None
    for _redraw in range(max_group_redraws):
        # Get groups for origin and destination nodes
        origin_group = _draw_group()
        destination_group = _draw_group()

        # Get nodes based on groups
        origin_nodes = group_to_nodes.get(origin_group, [])
        dest_nodes = group_to_nodes.get(destination_group, [])

        # Ensure nodes are available in both groups
        if not origin_nodes:
            print(f"No nodes available for group {origin_group}")
            raise ValueError("No nodes available for origin group")
        if not dest_nodes:
            print(f"No nodes available for group {destination_group}")
            raise ValueError("No nodes available for destination group")

        # Sample node pairs until one is far enough apart. The number of attempts is
        # bounded: a pair of groups without any sufficiently distant pair would
        # otherwise keep this loop running forever.
        for _attempt in range(max_pair_attempts):
            candidate_origin = random.choice(origin_nodes)
            candidate_destination = random.choice(dest_nodes)
            # walking_distance returns 0 for unreachable pairs, so those are rejected too
            if walking_distance(candidate_origin[0], candidate_destination[0]) >= min_trip_distance:
                origin, destination = candidate_origin, candidate_destination
                break
        if origin is not None:
            break
    else:
        raise RuntimeError(
            f"Could not find an origin-destination pair at least {min_trip_distance} apart "
            f"for trip {trip_number} after {max_group_redraws} group draws"
        )

    # Record the trip data
    origins.append(origin)
    destinations.append(destination)
    trip_groups.append((origin_group, destination_group))

    if trip_number % progress_every == 0 or trip_number == num_trips:
        print(f"Generated {trip_number} of {num_trips} trips")

In [36]:
# Assemble the origin-destination table: one row per generated trip, with ids starting at 1.
# Every origin/destination entry is a [node, lon, lat] triple.
od_data = pd.DataFrame({
    "origin_lon": [lon for _node, lon, _lat in origins],
    "origin_lat": [lat for _node, _lon, lat in origins],
    "destination_lon": [lon for _node, lon, _lat in destinations],
    "destination_lat": [lat for _node, _lon, lat in destinations],
    "trip_group": trip_groups[:len(origins)],
    "trip_id": np.arange(1, len(origins) + 1),
})

In [37]:
# Write the OD table to a ';'-separated, UTF-8 encoded CSV file.
# NOTE(review): the file name hard-codes 2500 instead of using num_trips, and the
# 'requests/' directory is not created here - confirm it exists before running.
od_data.to_csv('requests/od_data_2500.csv', sep=';', encoding='utf-8')