In [None]:
import networkx as nx
import matplotlib.pyplot as plt

def open_line_file(filename):
    """Read one line file and return its raw lines.

    The file is looked up as ``./lines_from_gtfs_static_data/<filename>.txt``
    relative to the current working directory. Lines are returned exactly as
    read, so each one still carries its trailing newline (if it had one).
    """
    path = f'./lines_from_gtfs_static_data/{filename}.txt'
    with open(path, 'r') as handle:
        # Iterating a text file yields the same items as readlines().
        return list(handle)

# Stems (file names without ".txt") of the line files to load. Every prefix
# below exists in two variants, suffixed "-0" and "-1".
_line_file_prefixes = (
    'Red-C1',
    'Red-C2',
    'Green-B-C1',
    'Green-C-C1',
    'Green-D-C1',
    'Green-E-C1',
    'Orange-C1',
    'Blue-C1',
)

line_files = [
    f'{prefix}-{suffix}'
    for prefix in _line_file_prefixes
    for suffix in (0, 1)
]

# Load every line file eagerly. A list (unlike the lazy ``map`` object) is
# not exhausted after one pass, so it can still be inspected afterwards.
line_stations = [open_line_file(name) for name in line_files]

stations = set()
edges = set()

for line in line_stations:
    # Strip trailing whitespace/newlines and drop blank lines, so an empty
    # file or a stray trailing newline can neither raise IndexError nor
    # introduce a phantom '' station into the graph.
    stop_names = [stop for stop in (raw.rstrip() for raw in line) if stop]
    stations.update(stop_names)
    # Consecutive stops in a file are joined by an edge.
    edges.update(zip(stop_names, stop_names[1:]))

network = nx.Graph()
# Insert in sorted order: iteration order of a set of strings changes between
# interpreter runs (hash randomization), which would otherwise make the node
# order of the graph differ from one kernel restart to the next.
network.add_nodes_from(sorted(stations))
network.add_edges_from(sorted(edges))

# Draw the bare network using a Kamada-Kawai layout.
fig = plt.figure(figsize=(10, 10))
layout = nx.kamada_kawai_layout(network)
nx.draw(network, layout, node_size=50)

In [None]:
import json

# Load the delay mapping from JSON. The context manager guarantees the file
# handle is closed even when reading or parsing raises.
with open("./delay_data/named_rapid_transit_delays.json", 'r') as delays_file:
    delays = json.load(delays_file)

# Histogram of the raw delay values (outliers still included).
fig, ax = plt.subplots()
# Materialize the dict view so matplotlib receives a plain sequence.
hist = ax.hist(list(delays.values()), bins=100)
ax.set_xlabel("delay")
ax.set_ylabel("count")
ax.set_title("Distribution of raw delays")

We see that most of the delays are clustered around zero, with a handful of extremely negative outliers. We remove these outliers (delays of -5000 or less) to get a more balanced histogram:

In [None]:
# Drop the extreme negative outliers: keep only the entries whose delay is
# strictly greater than -5000.
delays = {name: delay for name, delay in delays.items() if delay > -5000}

# Histogram of the values that survived the filter.
fig, ax = plt.subplots()
hist = ax.hist(delays.values(), bins=100)

### Now we add the cleaned delay data as node attributes in NetworkX, and see the results

In [None]:
# Store each delay on its node under the "avg_delay" attribute, then read the
# attribute back once as a {node: delay} mapping.
nx.set_node_attributes(network, delays, "avg_delay")
delay_by_node = nx.get_node_attributes(network, "avg_delay")

# Proof of concept for coloring nodes: for now they are only split into
# non-negative and negative delays; finer bins can come later.
nonnegative_delay_nodes = []
negative_delay_nodes = []
for node, delay in delay_by_node.items():
    if delay >= 0:
        nonnegative_delay_nodes.append(node)
    elif delay < 0:
        negative_delay_nodes.append(node)

# Every node that landed in neither list above.
remaining_nodes = list(
    set(network.nodes()).difference(nonnegative_delay_nodes, negative_delay_nodes)
)

fig = plt.figure(figsize=(10, 10))
layout = nx.kamada_kawai_layout(network)
# One draw call per group: red = non-negative, green = negative, blue = rest.
for group_color, group_nodes in (
    ('red', nonnegative_delay_nodes),
    ('green', negative_delay_nodes),
    ('blue', remaining_nodes),
):
    nx.draw_networkx_nodes(network, layout, nodelist=group_nodes, node_color=group_color, node_size=50)
nx.draw_networkx_edges(network, layout)

In [None]:
# {node: delay} for every node that carries the "avg_delay" attribute.
avg_delay = nx.get_node_attributes(network, "avg_delay")

# Katz centrality of every node in the network.
katz_centrality = nx.katz_centrality(network, alpha=0.1)

# Now compare the average delay with the Katz centrality, node by node.
for node in avg_delay:
    delay = avg_delay[node]
    print(f"Node {node}:")
    print(f"  Average delay: {delay}")
    print(f"  Katz centrality: {katz_centrality[node]}")

print(avg_delay.values())

In [None]:
import pandas as pd

# Build one row per node that has a delay, looking its centrality up by node.
# Zipping the two dicts' values would pair entries purely by position, but
# katz_centrality has an entry for every node in the graph while avg_delay
# only covers nodes carrying the attribute, so positional pairs would mix
# values that belong to different nodes.
df = pd.DataFrame(
    [(delay, katz_centrality[node]) for node, delay in avg_delay.items()],
    columns=["avg_delay", "katz_centrality"],
)

# Calculate the correlation coefficient between the two variables
correlation = df["avg_delay"].corr(df["katz_centrality"])

# Print the correlation coefficient
print("Correlation Coefficient of avg_delay and katz centrality: " + str(correlation))

# Create a scatter plot of the two variables
fig, ax = plt.subplots()
ax.scatter(df["avg_delay"], df["katz_centrality"])
ax.set_xlabel("avg_delay")
ax.set_ylabel("katz_centrality")
ax.set_title("Average delay vs. Katz centrality")

# Show the plot
plt.show()