In [None]:
!pip install torch torchvision
!pip install torch-geometric
!pip install osmnx

Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp (from torch-geometric)
  Downloading aiohttp-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
Collecting aiosignal>=1.1.2 (from aiohttp->torch-geometric)
  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->torch-geometric)
  Downloading frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (239 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.5/239.5 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multidict<7.0,>=4.5 (from aiohttp->torch-geometric)
  Downloading multidict-6.0.5

In [None]:
import osmnx as ox
from shapely.geometry import Polygon, Point
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive; every data, map and model path used
# in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Load the map.

In [None]:
# Read the saved road network from its GraphML file.
G = ox.load_graphml('/content/drive/MyDrive/RAC Project/map/perth-drive-con-unprojected.graphml')

# Split the graph into a node GeoDataFrame and an edge GeoDataFrame; both are
# used by the spatial matching and joins further down.
nodes, edges = ox.graph_to_gdfs(G=G,nodes=True,edges=True)

Load data.

In [None]:
# Road-link lookup table. LINK_ID is duplicated under the name M_Link_ID,
# which is the key the traffic volumes are joined on.
road_df = pd.read_csv(
    '/content/drive/MyDrive/RAC Project/traffic_data/M-Links_Road_Network.csv',
    usecols=['LINK_DESCR', 'LINK_TO', 'LINK_ID'],
).assign(M_Link_ID=lambda links: links['LINK_ID'])

In [None]:
# Traffic volume records; joined to the road links on M_Link_ID in the next cell.
traffic_df = pd.read_csv('/content/drive/MyDrive/RAC Project/traffic_data/Total_Traffic_Volume.csv')

In [None]:
# Attach the road descriptions to every volume record (a left join keeps
# volumes that have no matching link) and keep only the fields used later.
link_names = road_df[['M_Link_ID', 'LINK_DESCR', 'LINK_TO']]
volume_df = traffic_df.merge(link_names, on='M_Link_ID', how='left')[['LINK_DESCR', 'LINK_TO', 'Volumes']]

In [None]:
def explode_link_to(df):
  """Return a new frame with one row per '&'-separated entry of ``LINK_TO``.

  String values are split on '&' and every piece gets its own row (pieces are
  not stripped of surrounding whitespace). Non-string values such as NaN are
  kept as a single unchanged row. The caller's frame is left untouched and the
  result carries a fresh 0..n-1 index.
  """
  split_links = df['LINK_TO'].apply(lambda x: x.split('&') if isinstance(x, str) else x)
  # assign() builds a new frame, so the list-valued column never leaks into `df`.
  return df.assign(LINK_TO=split_links).explode('LINK_TO').reset_index(drop=True)

# One row per '&'-separated LINK_TO entry; the helper is handed a copy of the merged frame.
volume_df = explode_link_to(volume_df.copy())

In [None]:
def process_dataframe(df, f1, f2):
  """Return a copy of ``df`` with columns ``f1`` and ``f2`` cut to two words.

  Each string value is split on whitespace and only its first two tokens are
  kept, re-joined with single spaces (which also trims padding). Non-string
  values such as NaN are left as they are. The caller's frame is not modified.
  """
  def first_two_words(value):
    return ' '.join(value.split()[:2]) if isinstance(value, str) else value

  # Copy first so the truncation is applied to the result only.
  result = df.copy()
  for column in (f1, f2):
    result[column] = result[column].apply(first_two_words)
  return result

# Reduce both road-name columns to their first two words so they can be
# compared with the crash road names when the two tables are merged.
volume_df = process_dataframe(volume_df, 'LINK_DESCR', 'LINK_TO')

In [None]:
# Load the crash records. CRASH_DATE and CRASH_TIME are read as well because
# later cells drop / group by them; without them those cells raise KeyError
# on a fresh run.
# NOTE(review): assumes the CSV exposes columns with exactly these names — confirm.
crash_df = pd.read_csv('/content/drive/MyDrive/RAC Project/crash_data/Crash_Information.csv',
                       usecols=['X', 'Y', 'INTERSECTION_DESC', 'SEVERITY',
                                'CRASH_DATE', 'CRASH_TIME'])
# Crashes without an intersection description cannot be matched to roads.
crash_df = crash_df.dropna(subset=['INTERSECTION_DESC'])

# Bounding box of the road graph, taken from the node 'x' / 'y' attributes.
lon_values = [G.nodes[node]['x'] for node in G.nodes()]
lat_values = [G.nodes[node]['y'] for node in G.nodes()]

min_lon = min(lon_values)
max_lon = max(lon_values)
min_lat = min(lat_values)
max_lat = max(lat_values)

# Keep only crashes inside that box (bounds inclusive). The explicit copy makes
# the result an independent frame rather than a view-like slice of crash_df.
inside_bbox = (crash_df['X'].between(min_lon, max_lon) &
               crash_df['Y'].between(min_lat, max_lat))
filtered_df = crash_df[inside_bbox].copy()

In [None]:
def calculate_ksi(group):
    """Compute the KSI metric for one group of crash rows.

    Counts rows whose SEVERITY is 'Fatal' or 'Hospital' (KSI crashes) and rows
    whose SEVERITY is 'Medical'. Returns 0 when the group has neither;
    otherwise returns the KSI count plus the medical count weighted by the
    KSI share of all casualty crashes.
    """
    severity = group['SEVERITY']
    n_ksi = int(severity.isin(['Fatal', 'Hospital']).sum())
    n_medical = int((severity == 'Medical').sum())
    n_casualty = n_ksi + n_medical

    # Guard the division below: no casualty crashes means no metric.
    if n_casualty == 0:
        return 0
    return n_ksi + n_ksi / n_casualty * n_medical

# One KSI metric per intersection description.
ksi_metrics = (
    filtered_df
    .groupby('INTERSECTION_DESC')
    .apply(calculate_ksi)
    .reset_index(name='KSI_metric')
)

# Copy the intersection-level metric onto every crash row of that
# intersection; SEVERITY is no longer needed once the metric exists.
filtered_df = (
    filtered_df
    .merge(ksi_metrics, how='left', on='INTERSECTION_DESC')
    .drop(columns=['SEVERITY'])
)

In [None]:
# The first '&' separates the major road from everything after it.
road_parts = filtered_df['INTERSECTION_DESC'].str.split('&', n=1, expand=True)
filtered_df[['MAJOR_ROAD', 'MINOR_ROAD']] = road_parts

# The remainder may itself list several minor roads joined by '&': split it
# again, emit one row per minor road, and renumber the rows consecutively.
expanded_df = (
    filtered_df
    .assign(MINOR_ROAD=filtered_df['MINOR_ROAD'].str.split('&'))
    .explode('MINOR_ROAD')
    .reset_index(drop=True)
)

In [None]:
# Reduce the crash road names to their first two words, the same
# normalisation applied to the traffic-volume road names.
expanded_df = process_dataframe(expanded_df, 'MAJOR_ROAD', 'MINOR_ROAD')

In [None]:
# Remove fields that play no part in the volume join.
# NOTE(review): this requires CRASH_DATE to be among the columns read from the crash CSV.
expanded_df = expanded_df.drop(columns=['CRASH_DATE', 'INTERSECTION_DESC'])

In [None]:
# Look up the traffic volume for each (major road, minor road) pair. The left
# join keeps crashes with no volume match; the link columns are join keys
# only and are removed afterwards.
merged_df = (
    expanded_df
    .merge(volume_df, how='left',
           left_on=['MAJOR_ROAD', 'MINOR_ROAD'],
           right_on=['LINK_DESCR', 'LINK_TO'])
    .drop(columns=['LINK_DESCR', 'LINK_TO'])
)

In [None]:
# Crashes with no matching traffic record get a volume of 0.
# Assign the result back: an inplace fillna on a column selection is a chained
# assignment, which pandas deprecates and which leaves merged_df unchanged
# once Copy-on-Write is active.
merged_df['Volumes'] = merged_df['Volumes'].fillna(0)

In [None]:
# Collapse rows that share location, metric, road pair and crash time,
# averaging the traffic volume over the matched links.
group_keys = ['X', 'Y', 'KSI_metric', 'MAJOR_ROAD', 'MINOR_ROAD', 'CRASH_TIME']
compressed_df = merged_df.groupby(group_keys)['Volumes'].mean().reset_index()

In [None]:
# Crash rate: the KSI metric scaled by 10**8, divided by the traffic volume and
# then by 1.7. Rows whose Volumes is 0 divide by zero here and come out
# non-finite.
# NOTE(review): the meaning of the constants 10**8 and 1.7 is not stated in
# this notebook — confirm and give them names.
compressed_df['CRASH_RATE'] = compressed_df['KSI_metric']*10**8/compressed_df['Volumes']/1.7

In [None]:
# Missing rates become 0. Assigned back rather than filled with inplace=True
# on a column selection, which pandas deprecates and which is a no-op on the
# parent frame under Copy-on-Write.
compressed_df['CRASH_RATE'] = compressed_df['CRASH_RATE'].fillna(0)
# A zero volume makes the rate non-finite; treat those rows as rate 0.
compressed_df.loc[compressed_df['Volumes'] == 0, 'CRASH_RATE'] = 0

In [None]:
# The road names and the volume were only needed to derive CRASH_RATE.
compressed_df = compressed_df.drop(['MAJOR_ROAD', 'MINOR_ROAD', 'Volumes'], axis=1)

In [None]:
# Re-expose the coordinates under lower-case names 'x' / 'y' (appended as the
# last columns) and drop the upper-case originals.
compressed_df = (
    compressed_df
    .assign(x=compressed_df['X'], y=compressed_df['Y'])
    .drop(columns=['X', 'Y'])
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# Rescale 'KSI_metric' and 'CRASH_RATE' column-wise with the scaler, fitted on
# the full table, so the two can be added together in the next step.
compressed_df[['KSI_metric', 'CRASH_RATE']] = scaler.fit_transform(compressed_df[['KSI_metric', 'CRASH_RATE']])

In [None]:
# Accident score: the sum of the two scaled measures, capped at 1.
# (Dividing the sum by max(sum, 1) is the same as clipping it from above at 1.)
combined_score = compressed_df['KSI_metric'] + compressed_df['CRASH_RATE']
compressed_df['acc_prob'] = combined_score.clip(upper=1)

In [None]:
# Only the combined acc_prob score is carried forward.
compressed_df = compressed_df.drop(['KSI_metric', 'CRASH_RATE'], axis=1)

In [None]:
# Drop the crash time, then keep a single row (the first encountered) for
# each distinct (x, y) location.
compressed_df = (
    compressed_df
    .drop(columns=['CRASH_TIME'])
    .drop_duplicates(subset=['x', 'y'])
)

In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# Turn each crash location into a point geometry and wrap the table in a
# GeoDataFrame declared as EPSG:4326.
accident_geometry = [Point(lon, lat) for lon, lat in zip(compressed_df['x'], compressed_df['y'])]
accident_gdf = gpd.GeoDataFrame(compressed_df, crs='EPSG:4326', geometry=accident_geometry)

In [None]:
import geopandas as gpd
from shapely.geometry import Point
from sklearn.neighbors import BallTree

# Node coordinates as an (n_nodes, 2) array of [x, y], in the row order of `nodes`.
nodes_array = np.column_stack((nodes['x'], nodes['y']))

# Build a BallTree for nearest neighbor search.
# NOTE(review): no metric is passed, so the tree's default metric is applied
# to the raw x / y values — confirm that is acceptable for these coordinates.
tree = BallTree(nodes_array, leaf_size=15)

matched_nodes = accident_gdf.copy()

def find_nearest_geometry(row):
    """Return the geometry of the graph node nearest to ``(row['x'], row['y'])``.

    Kept for single-row lookups; the bulk matching below queries the tree once
    for all rows instead of calling this per row.
    """
    point = np.array([[row['x'], row['y']]])
    dist, ind = tree.query(point, k=1)
    nearest_index = ind[0][0]
    return nodes.iloc[nearest_index]['geometry']

# One batched query for every crash location (instead of one tree query per
# row through DataFrame.apply). Column 0 holds the position of the nearest node.
nearest_dist, nearest_positions = tree.query(matched_nodes[['x', 'y']].to_numpy(), k=1)
# to_numpy() strips the node index so the geometries are assigned by position
# rather than aligned on index labels.
matched_nodes['geometry'] = nodes['geometry'].iloc[nearest_positions[:, 0]].to_numpy()
accident_nodes = gpd.GeoDataFrame(matched_nodes, geometry=matched_nodes['geometry'], crs='EPSG:4326')

In [None]:
from sklearn.model_selection import train_test_split
# Features are every column except the target; the target is acc_prob, kept as
# a one-column DataFrame.
X = accident_nodes.drop(columns=['acc_prob'])
y = accident_nodes[['acc_prob']]
# 80 / 20 split with a fixed seed so the partition is reproducible.
train_nodes, test_nodes, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

In [None]:
# For each split, keep the road edges that intersect one of its snapped crash
# points (an edge appears once per point it intersects), reduced to the two
# columns used afterwards.
edge_columns = ['oneway', 'geometry']
train_edges = gpd.sjoin(edges, train_nodes, predicate="intersects", how="inner")[edge_columns]
test_edges = gpd.sjoin(edges, test_nodes, predicate="intersects", how="inner")[edge_columns]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv
import networkx as nx
import geopandas as gpd

# Build the training graph.
#
# Graph nodes are the crash rows, keyed by their DataFrame index label. The
# road edges are keyed by road-network node ids (first two levels of the edge
# index), which is a different id space. Each edge endpoint is therefore
# translated to the crash rows that were snapped onto that road node, instead
# of wrapping the foreign ids into range with a modulo, which connected
# arbitrary rows.
# NOTE(review): assumes `nodes` is indexed by the same ids that appear as the
# edge endpoints — confirm.
osm_id_by_coord = {(geom.x, geom.y): osm_id for osm_id, geom in nodes['geometry'].items()}

G_train = nx.Graph()
train_rows_by_osm_id = {}
for idx, row in train_nodes.iterrows():
    G_train.add_node(idx, x=row['x'], y=row['y'])
    snapped = row['geometry']
    osm_id = osm_id_by_coord.get((snapped.x, snapped.y))
    if osm_id is not None:
        train_rows_by_osm_id.setdefault(osm_id, []).append(idx)

# Keep a road edge only when both of its endpoints carry a training crash row.
for edge_key, row in train_edges.iterrows():
    for src in train_rows_by_osm_id.get(edge_key[0], ()):
        for dst in train_rows_by_osm_id.get(edge_key[1], ()):
            if src != dst:
                G_train.add_edge(src, dst, oneway=row['oneway'])

x = torch.tensor(train_nodes[['x', 'y']].values, dtype=torch.float)

# Translate index labels into row positions of `x` and list every undirected
# edge in both directions; edge_attr gets exactly one entry per edge column.
train_position_of = {label: pos for pos, label in enumerate(train_nodes.index)}
train_edge_pairs, train_edge_oneway = [], []
for src, dst, oneway in G_train.edges(data='oneway'):
    i, j = train_position_of[src], train_position_of[dst]
    train_edge_pairs += [(i, j), (j, i)]
    train_edge_oneway += [float(oneway), float(oneway)]

# reshape(-1, 2) keeps the (2, 0) shape valid when no edge survives.
edge_index = torch.tensor(train_edge_pairs, dtype=torch.long).reshape(-1, 2).t().contiguous()
edge_attr = torch.tensor(train_edge_oneway, dtype=torch.float)

In [None]:
# Define GNN model
class GCN(nn.Module):
    """Four stacked GCNConv layers mapping 2 node features to one score per node.

    Channel widths are 2 -> 128 -> 64 -> 32 -> 1. The first three layers are
    each followed by ReLU and dropout (p=0.1); the last layer's single channel
    is passed through a sigmoid, so every node gets a value in (0, 1).
    """

    def __init__(self):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(2, 128)
        self.conv2 = GCNConv(128, 64)
        self.conv3 = GCNConv(64, 32)
        self.conv4 = GCNConv(32, 1)

    def forward(self, data):
        """Score every node of ``data``.

        Args:
            data: object exposing ``x`` (node features) and ``edge_index``.
                ``edge_attr`` is not used by this model.

        Returns:
            Tensor with one column holding a value in (0, 1) per node.
        """
        x, edge_index = data.x, data.edge_index
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(conv(x, edge_index))
            # Tie dropout to the module mode; F.dropout defaults to
            # training=True and would otherwise stay active after .eval().
            x = F.dropout(x, p=0.1, training=self.training)
        x = self.conv4(x, edge_index)
        # log_softmax over a single channel is identically 0 (no signal, no
        # gradient); a sigmoid gives a usable score for the one output channel.
        return torch.sigmoid(x)

In [None]:
# Targets for the MAE loss. y_train / y_test are one-column frames, so .values
# is already (n_rows, 1) — the same shape as the model output — and needs no
# extra unsqueeze (a third axis makes the loss broadcast across rows).
# Kept as float: an int64 cast truncates every acc_prob below 1.0 to 0.
y_train_label = torch.tensor(y_train.values, dtype=torch.float)
y_test_label = torch.tensor(y_test.values, dtype=torch.float)

In [None]:
# Wrap the training tensors in a single graph object and serve it through a
# loader that yields that one graph per iteration.
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
loader = DataLoader([data], batch_size=1)

In [None]:
# Initialize model and optimizer (Adam over all model parameters, learning rate 0.01)
gcn_model = GCN()
optimizer = torch.optim.Adam(gcn_model.parameters(), lr=0.01)

In [None]:
# Training
gcn_model.train()
for epoch in range(1000):
    for batch in loader:
        # Clear gradients per optimisation step, not per epoch, so they can
        # never accumulate across batches.
        optimizer.zero_grad()
        out = gcn_model(batch)

        # MAE loss. The target is aligned to the prediction's dtype and shape
        # so the error is taken element-wise instead of over a broadcast of
        # mismatched shapes.
        target = y_train_label.to(out.dtype).reshape(out.shape)
        loss = F.l1_loss(out, target)

        # Backpropagation
        loss.backward()
        optimizer.step()

    # Checkpoint and report every 100 epochs.
    if (epoch+1)%100 == 0:
        torch.save(gcn_model, '/content/drive/MyDrive/RAC Project/model/prob.pth')
        # .item() logs the plain number rather than the tensor repr with grad_fn.
        print("Epoch "+str(epoch+1)+", loss: "+str(loss.item()))

  loss = F.l1_loss(out, y_train_label)


Epoch 100, loss: tensor(0.2597, grad_fn=<MeanBackward0>)
Epoch 200, loss: tensor(0.2324, grad_fn=<MeanBackward0>)
Epoch 300, loss: tensor(0.2067, grad_fn=<MeanBackward0>)
Epoch 400, loss: tensor(0.1832, grad_fn=<MeanBackward0>)
Epoch 500, loss: tensor(0.1616, grad_fn=<MeanBackward0>)
Epoch 600, loss: tensor(0.1420, grad_fn=<MeanBackward0>)
Epoch 700, loss: tensor(0.1242, grad_fn=<MeanBackward0>)
Epoch 800, loss: tensor(0.1080, grad_fn=<MeanBackward0>)
Epoch 900, loss: tensor(0.0933, grad_fn=<MeanBackward0>)
Epoch 1000, loss: tensor(0.0802, grad_fn=<MeanBackward0>)


In [None]:
# Persist the final model object (the whole module, not just its state_dict)
# to the same path the training loop checkpoints to.
torch.save(gcn_model, '/content/drive/MyDrive/RAC Project/model/prob.pth')

In [None]:
# Build the test graph.
#
# Graph nodes are the crash rows, keyed by their DataFrame index label. The
# road edges are keyed by road-network node ids (first two levels of the edge
# index), which is a different id space. Each edge endpoint is therefore
# translated to the crash rows that were snapped onto that road node, instead
# of wrapping the foreign ids into range with a modulo, which connected
# arbitrary rows.
# NOTE(review): assumes `nodes` is indexed by the same ids that appear as the
# edge endpoints — confirm.
osm_id_by_coord = {(geom.x, geom.y): osm_id for osm_id, geom in nodes['geometry'].items()}

G_test = nx.Graph(crs='EPSG:4326')
test_rows_by_osm_id = {}
for idx, row in test_nodes.iterrows():
    G_test.add_node(idx, x=row['x'], y=row['y'])
    snapped = row['geometry']
    osm_id = osm_id_by_coord.get((snapped.x, snapped.y))
    if osm_id is not None:
        test_rows_by_osm_id.setdefault(osm_id, []).append(idx)

# Keep a road edge only when both of its endpoints carry a test crash row.
for edge_key, row in test_edges.iterrows():
    for src in test_rows_by_osm_id.get(edge_key[0], ()):
        for dst in test_rows_by_osm_id.get(edge_key[1], ()):
            if src != dst:
                G_test.add_edge(src, dst, oneway=row['oneway'])

x_test = torch.tensor(test_nodes[['x', 'y']].values, dtype=torch.float)

# Translate index labels into row positions of `x_test` and list every
# undirected edge in both directions; edge_attr_test gets one entry per edge column.
test_position_of = {label: pos for pos, label in enumerate(test_nodes.index)}
test_edge_pairs, test_edge_oneway = [], []
for src, dst, oneway in G_test.edges(data='oneway'):
    i, j = test_position_of[src], test_position_of[dst]
    test_edge_pairs += [(i, j), (j, i)]
    test_edge_oneway += [float(oneway), float(oneway)]

# reshape(-1, 2) keeps the (2, 0) shape valid when no edge survives.
edge_index_test = torch.tensor(test_edge_pairs, dtype=torch.long).reshape(-1, 2).t().contiguous()
edge_attr_test = torch.tensor(test_edge_oneway, dtype=torch.float)

# Create DataLoader for test data
test_data = Data(x=x_test, edge_index=edge_index_test, edge_attr=edge_attr_test)
test_loader = DataLoader([test_data], batch_size=1)

In [None]:
# Put the model in evaluation mode before scoring the test graph.
gcn_model.eval()

# Score every test batch with gradient tracking switched off and collect the
# raw model outputs.
predictions = []
with torch.no_grad():
    for data in test_loader:
        out = gcn_model(data)
        predictions.append(out)

# Stack the per-batch outputs into one numpy array.
predictions = torch.cat(predictions).numpy()

In [None]:
# acc_prob and the model output are continuous, so exact equality between them
# is meaningless (and comparing an array with the y_test DataFrame yields a
# Series, not a count). Both sides are binarised at a common cut-off first.
# NOTE(review): 0.5 is an evaluation choice made here — confirm the intended
# threshold for calling a location high-risk.
ACC_PROB_THRESHOLD = 0.5
predicted_labels = (np.asarray(predictions).reshape(-1) >= ACC_PROB_THRESHOLD).astype(int)
true_labels = (np.asarray(y_test).reshape(-1) >= ACC_PROB_THRESHOLD).astype(int)

num_correct = int(np.sum(predicted_labels == true_labels))

# Calculate the total number of samples
total_samples = len(predicted_labels)

# Calculate accuracy (NaN rather than a ZeroDivisionError on an empty test set)
accuracy = num_correct / total_samples if total_samples else float('nan')

print("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import f1_score

# f1_score needs discrete class labels, and `y_test_values` was never defined.
# Both the targets and the predictions are binarised at the same cut-off.
# NOTE(review): 0.5 is an evaluation choice made here — confirm the intended threshold.
F1_THRESHOLD = 0.5
y_test_values = (np.asarray(y_test).reshape(-1) >= F1_THRESHOLD).astype(int)
f1_predicted_labels = (np.asarray(predictions).reshape(-1) >= F1_THRESHOLD).astype(int)

f1 = f1_score(y_test_values, f1_predicted_labels, average='weighted')  # 'micro' or 'macro' are the alternatives

print(f'F1-score: {f1}')