In [1]:
from utils import *
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

%load_ext autoreload
%autoreload 2

In [2]:
# Gather all the data about the nodes and key every stop by "<stop_I>_<city>".
# The key is built with vectorized string concatenation instead of a row-wise
# apply, which yields the same strings without a Python call per row.
df_nodes = pd.read_csv("all_metrics_digraph.csv")
df_nodes['stop_city'] = df_nodes['stop_I'].astype(str) + '_' + df_nodes['city'].astype(str)
df_nodes = df_nodes.set_index('stop_city')

# Gather all data about links and build the same composite key for both endpoints
df_links = gather_all_data('data', nodes=False)
link_city = df_links['city'].astype(str)
df_links['from_stop_city'] = df_links['from_stop_I'].astype(str) + '_' + link_city
df_links['to_stop_city'] = df_links['to_stop_I'].astype(str) + '_' + link_city

# Attach the city_center value of the origin and of the destination stop to each link.
# Left joins keep every link, even when an endpoint is missing from df_nodes.
city_center = df_nodes['city_center']
df_l = (
    df_links
    .join(city_center.rename('city_center_from'), on='from_stop_city', how='left')
    .join(city_center.rename('city_center_to'), on='to_stop_city', how='left')
)

In [3]:
def create_histogram_of_transport(group):
    """
    Function that will create the normalized histogram of transport for one stop.

    Route types 0..4 are counted as tram, subway, rail, bus and ferry (in this
    precise order). Any other value (5 and above, negative, NaN or non-integer)
    is ignored, so a negative code can no longer wrap around and be counted in
    a bin at the end of the histogram.

    :param group: a group of data for the same stop in a city; it must expose a
        ``route_type`` column and a ``name`` attribute (as set by
        ``DataFrame.groupby(...).apply``)
    :return: a list whose first element is the stop identifier (``group.name``)
        and 5 next elements are the histogram of transports for tram, subway,
        rail, bus, ferry (in this precise order); the five values are all 0
        when no row has a route type in 0..4
    """
    n_modes = 5
    route_types = np.asarray(group["route_type"], dtype=float)

    # Keep only whole-number codes inside [0, n_modes). NaN fails every
    # comparison, so missing route types are dropped here as well.
    in_range = (
        (route_types >= 0)
        & (route_types < n_modes)
        & (route_types == np.floor(route_types))
    )
    counts = np.bincount(route_types[in_range].astype(int), minlength=n_modes).astype(float)

    total = counts.sum()
    if total == 0:
        # Nothing to normalize: return the all-zero histogram as is
        return [group.name] + list(counts)
    # Normalize histogram
    return [group.name] + list(counts / total)

In [4]:
#Create a dataframe with the normalized histogram
test = pd.DataFrame(df_l.groupby("from_stop_city").apply(create_histogram_of_transport).tolist(),columns=["from_stop_city",'tram','subway','rail','bus','ferry']).set_index("from_stop_city")
test.head(5)

Unnamed: 0_level_0,tram,subway,rail,bus,ferry
from_stop_city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10000_brisbane,0.0,0.0,0.0,1.0,0.0
10000_melbourne,0.0,0.0,0.0,1.0,0.0
10000_paris,1.0,0.0,0.0,0.0,0.0
10000_sydney,0.0,0.0,0.0,1.0,0.0
10001_brisbane,0.0,0.0,0.0,1.0,0.0


In [5]:
# Merge the dataframe with the other one
df_nodes = df_nodes.merge(test,left_index=True,right_index=True)
df_nodes.head(5)

Unnamed: 0,stop_I,name,city_center,city,in_degree_distribution,out_degree_distribution,clustering,betweeness_centrality,eigenvector_centrality,katz_centrality,closeness_centrality,tram,subway,rail,bus,ferry
1_adelaide,1,Gawler Central Railway Station,0,adelaide,0,1,0.0,0.0,4.779277000000001e-22,0.010055,0.0,0.0,0.0,1.0,0.0,0.0
2_adelaide,2,Gawler Oval Railway Station,0,adelaide,1,1,0.0,0.0,1.816125e-20,0.011061,0.000133,0.0,0.0,1.0,0.0,0.0
3_adelaide,3,Gawler Railway Station,0,adelaide,1,3,0.166667,0.0,3.3646109999999995e-19,0.011161,0.000177,0.0,0.0,1.0,0.0,0.0
4_adelaide,4,Evanston Railway Station,0,adelaide,1,1,0.5,0.0,4.049959e-18,0.011171,0.000199,0.0,0.0,1.0,0.0,0.0
5_adelaide,5,Tambelin Railway Station,0,adelaide,2,2,0.166667,0.0,3.9664180000000006e-17,0.012288,0.000303,0.0,0.0,1.0,0.0,0.0


In [6]:
# Compute the incoming and outgoing statistics
df_stats_from = df_l.groupby("from_stop_city")[["d","n_vehicles","duration_avg"]].agg("mean").rename(columns={"d":"d_out","n_vehicles":"n_vehicles_out","duration_avg":"duration_avg_out"}).reset_index().rename(columns={"from_stop_city":"stop_city"})
df_stats_to = df_l.groupby("to_stop_city")[["d","n_vehicles","duration_avg"]].agg("mean").rename(columns={"d":"d_in","n_vehicles":"n_vehicles_in","duration_avg":"duration_avg_in"}).reset_index().rename(columns={"to_stop_city":"stop_city"})
df_stats_to = df_stats_to.merge(df_stats_from).set_index("stop_city")
df_stats_to.head(5)

Unnamed: 0_level_0,d_in,n_vehicles_in,duration_avg_in,d_out,n_vehicles_out,duration_avg_out
stop_city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000_brisbane,116.0,19.0,9.473684,153.0,19.0,37.894737
10000_melbourne,187.0,88.0,19.602273,265.0,88.0,26.488636
10000_paris,942.0,198.0,120.30303,1001.0,198.0,180.0
10000_sydney,243.0,132.0,74.090909,191.0,132.0,54.545455
10001_brisbane,122.0,18.0,30.0,193.0,18.0,30.0


In [7]:
# Merge it with the other dataframe
df_nodes = df_nodes.merge(df_stats_to,left_index=True,right_index=True)
# Our final features for each stop in the city
df_nodes.head(5)

Unnamed: 0,stop_I,name,city_center,city,in_degree_distribution,out_degree_distribution,clustering,betweeness_centrality,eigenvector_centrality,katz_centrality,...,subway,rail,bus,ferry,d_in,n_vehicles_in,duration_avg_in,d_out,n_vehicles_out,duration_avg_out
2_adelaide,2,Gawler Oval Railway Station,0,adelaide,1,1,0.0,0.0,1.816125e-20,0.011061,...,0.0,1.0,0.0,0.0,616.0,36.0,60.0,1456.0,36.0,203.333333
3_adelaide,3,Gawler Railway Station,0,adelaide,1,3,0.166667,0.0,3.3646109999999995e-19,0.011161,...,0.0,1.0,0.0,0.0,1456.0,36.0,203.333333,4528.333333,20.0,318.888889
4_adelaide,4,Evanston Railway Station,0,adelaide,1,1,0.5,0.0,4.049959e-18,0.011171,...,0.0,1.0,0.0,0.0,1506.0,36.0,176.666667,1059.0,36.0,120.0
5_adelaide,5,Tambelin Railway Station,0,adelaide,2,2,0.166667,0.0,3.9664180000000006e-17,0.012288,...,0.0,1.0,0.0,0.0,1808.0,28.5,180.0,5055.0,28.5,300.0
6_adelaide,6,Kudla Railway Station,0,adelaide,1,1,0.0,0.0,2.795385e-16,0.011284,...,0.0,1.0,0.0,0.0,3145.0,34.0,180.0,1805.0,34.0,120.0


In [8]:
import os

# Write the assembled per-stop feature table as CSV inside the data folder
features_path = os.path.join("data", "dhandcrafted_features.csv")
df_nodes.to_csv(features_path)