In [1]:
!pip install torch torchvision
!pip install torch-geometric
!pip install osmnx

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [2]:
import osmnx as ox
from shapely.geometry import Polygon, Point
import pandas as pd
import numpy as np

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Load the map.

In [4]:
G = ox.load_graphml('/content/drive/MyDrive/RAC/perth-drive-con-unprojected.graphml')

nodes, edges = ox.graph_to_gdfs(G=G,nodes=True,edges=True)

In [5]:
nodes.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 27946 entries, 0 to 27940
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   osmid_original  27946 non-null  object  
 1   y               27946 non-null  float64 
 2   x               27946 non-null  float64 
 3   street_count    27946 non-null  int64   
 4   lon             20903 non-null  float64 
 5   lat             20903 non-null  float64 
 6   highway         185 non-null    object  
 7   geometry        27946 non-null  geometry
dtypes: float64(4), geometry(1), int64(1), object(2)
memory usage: 1.9+ MB


In [6]:
nodes.head(10)

Unnamed: 0_level_0,osmid_original,y,x,street_count,lon,lat,highway,geometry
osmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1577031431,-32.050319,115.733562,3,115.733562,-32.050319,,POINT (115.73356 -32.05032)
1,25587710,-32.048072,115.735056,3,115.735056,-32.048072,,POINT (115.73506 -32.04807)
1078,6207919999,-32.050641,115.734231,3,115.734231,-32.050641,,POINT (115.73423 -32.05064)
2,25587739,-32.046925,115.735934,3,115.735934,-32.046925,,POINT (115.73593 -32.04692)
3,25587759,-32.04686,115.732605,3,115.732605,-32.04686,,POINT (115.73260 -32.04686)
4,"[25587740, 5846127771, 5846127770]",-32.046362,115.736475,0,,,,POINT (115.73647 -32.04636)
5,25587741,-32.045499,115.735516,3,115.735516,-32.045499,,POINT (115.73552 -32.04550)
8,"[31052695, 1577031487]",-32.046694,115.73765,0,,,,POINT (115.73765 -32.04669)
6,"[5368400610, 5368400607, 5368400609, 536840061...",-32.042844,115.737782,0,,,,POINT (115.73778 -32.04284)
1123,5846114891,-32.041602,115.739042,3,115.739042,-32.041602,,POINT (115.73904 -32.04160)


Load data.

In [7]:
crash_df = pd.read_csv('/content/drive/MyDrive/RAC/Crash_Information.csv', usecols=['X', 'Y', 'ACCIDENT_TYPE', 'SEVERITY'])

lon_values = [G.nodes[node]['x'] for node in G.nodes()]
lat_values = [G.nodes[node]['y'] for node in G.nodes()]

min_lon = min(lon_values)
max_lon = max(lon_values)
min_lat = min(lat_values)
max_lat = max(lat_values)

filtered_df = crash_df[(crash_df['X'] >= min_lon) &
                 (crash_df['X'] <= max_lon) &
                 (crash_df['Y'] >= min_lat) &
                 (crash_df['Y'] <= max_lat)]

In [8]:
intersection_df = filtered_df[filtered_df['ACCIDENT_TYPE'] == 'Intersection']

In [9]:
intersection_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53950 entries, 0 to 127198
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   X              53950 non-null  float64
 1   Y              53950 non-null  float64
 2   ACCIDENT_TYPE  53950 non-null  object 
 3   SEVERITY       53950 non-null  object 
dtypes: float64(2), object(2)
memory usage: 2.1+ MB


In [10]:
def calculate_ksi(group):
    ksi_crash = group[(group['SEVERITY'] == 'Fatal') | (group['SEVERITY'] == 'Hospital')].shape[0]
    medical_crash = group[group['SEVERITY'] == 'Medical'].shape[0]
    casualty_crash = ksi_crash + medical_crash
    if casualty_crash == 0:
        return 0
    ksi_metric = ksi_crash + ksi_crash / casualty_crash * medical_crash
    return ksi_metric

# Apply the function to each group
ksi_metrics = intersection_df.groupby(['X', 'Y']).apply(calculate_ksi).reset_index(name='KSI_metric')

# Merge the KSI metric back to the original dataframe
intersection_df = intersection_df.merge(ksi_metrics, on=['X', 'Y'], how='left')
intersection_df = intersection_df.drop(['SEVERITY'], axis=1)

In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# Normalize 'KSI_METRIC'
norm_df = intersection_df.copy()
norm_df[['KSI_metric']] = scaler.fit_transform(norm_df[['KSI_metric']])

In [12]:
norm_df[['x', 'y']] = norm_df[['X', 'Y']].values
norm_df = norm_df.drop(['X', 'Y','ACCIDENT_TYPE'], axis=1)

In [13]:
uni_df = norm_df.drop_duplicates(subset=['x', 'y'])

In [14]:
high = uni_df['KSI_metric'].quantile(0.95)
med = uni_df['KSI_metric'].quantile(0.9)
low = uni_df['KSI_metric'].quantile(0.85)

# Define a function to categorize values as 'high' or 'low'
def categorize(value, high, med, low):
    if value >= high:
        return 'high'
    elif value >= med:
        return 'medium'
    elif value >= low:
        return 'low'
    else:
        return 'safe'

uni_df['risk_lvl'] = uni_df['KSI_metric'].apply(lambda x: categorize(x, high, med, low))

print(uni_df['risk_lvl'].value_counts())

risk_lvl
safe      7092
low        523
medium     501
high       451
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uni_df['risk_lvl'] = uni_df['KSI_metric'].apply(lambda x: categorize(x, high, med, low))


In [15]:
uni_df = uni_df.drop(['KSI_metric'], axis=1)

In [16]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'risk_lvl' column
uni_df['risk_lvl_encoded'] = label_encoder.fit_transform(uni_df['risk_lvl'])

# Define the mapping manually
label_mapping = {'safe': 0, 'low': 1, 'medium': 2, 'high': 3}

# Map the labels to their encoded values
uni_df['risk_lvl_encoded'] = uni_df['risk_lvl'].map(label_mapping)

In [17]:
uni_df = uni_df.drop(['risk_lvl'], axis=1)

In [18]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# Convert accident DataFrame to GeoDataFrame
accident_geometry = [Point(xy) for xy in zip(uni_df['x'], uni_df['y'])]
accident_gdf = gpd.GeoDataFrame(uni_df, geometry=accident_geometry, crs='EPSG:4326')

In [19]:
import geopandas as gpd
from shapely.geometry import Point
from sklearn.neighbors import BallTree

# Convert the x, y coordinates from nodes to a numpy array
nodes_array = np.column_stack((nodes['x'], nodes['y']))

# Build a BallTree for nearest neighbor search
tree = BallTree(nodes_array, leaf_size=15)

matched_nodes = accident_gdf.copy()

# Define a function to find the nearest point and return its geometry
def find_nearest_geometry(row):
    point = np.array([[row['x'], row['y']]])
    dist, ind = tree.query(point, k=1)
    nearest_index = ind[0][0]
    return nodes.iloc[nearest_index]['geometry']

matched_nodes['geometry'] = matched_nodes.apply(find_nearest_geometry, axis=1)
accident_nodes = gpd.GeoDataFrame(matched_nodes, geometry=matched_nodes['geometry'], crs='EPSG:4326')

In [20]:
from sklearn.model_selection import train_test_split
X = accident_nodes.drop(columns=['risk_lvl_encoded'])
y = accident_nodes[['risk_lvl_encoded']]
train_nodes, test_nodes, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

In [21]:
train_edges = gpd.sjoin(edges, train_nodes, how="inner", predicate="intersects")
train_edges = train_edges[['oneway', 'geometry']]
test_edges = gpd.sjoin(edges, test_nodes, how="inner", predicate="intersects")
test_edges = test_edges[['oneway', 'geometry']]

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv
import networkx as nx
import geopandas as gpd

G_train = nx.Graph()
for idx, row in train_nodes.iterrows():
    G_train.add_node(idx, x=row['x'], y=row['y'])
for idx, row in train_edges.iterrows():
    G_train.add_edge(row.name[0], row.name[1], oneway=row['oneway'])

G_train.add_edges_from(nx.selfloop_edges(G_train))

x = torch.tensor(train_nodes[['x', 'y']].values, dtype=torch.float)
# Ensure that node indices in the edge index are within the range of the number of nodes
edge_index = torch.tensor(np.array(list(G_train.edges())).T, dtype=torch.long)
edge_index = edge_index.remainder(len(train_nodes))  # Ensure node indices are within bounds
edge_attr = torch.tensor(train_edges['oneway'].values, dtype=torch.float)

In [27]:
# Define GNN model
class GCN(nn.Module):
    def __init__(self):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(2, 64)
        self.conv2 = GCNConv(64, 32)
        self.conv3 = GCNConv(32, 4)

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, 0.1)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, 0.1)
        x = self.conv3(x, edge_index)
        return F.log_softmax(x, dim=1)  # Apply softmax activation function along the output dimension


In [24]:
y_train_label = torch.tensor(y_train.values, dtype=torch.int64)
y_train_label = torch.unsqueeze(y_train_label, dim=1)
y_test_label = torch.tensor(y_test.values, dtype=torch.int64)
y_test_label = torch.unsqueeze(y_test_label, dim=1)

In [25]:
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
loader = DataLoader([data], batch_size=1)

In [None]:
# # Initialize model and optimizer
gcn_model = GCN()
optimizer = torch.optim.Adam(gcn_model.parameters(), lr=0.01)

In [28]:
gcn_model = torch.load('/content/drive/MyDrive/RAC/model_7.pth')

In [None]:
# Training
gcn_model.train()
for epoch in range(500):
    optimizer.zero_grad()
    for data in loader:
        out = gcn_model(data)
        # Define MAE loss
        loss = F.l1_loss(out, y_train_label)

        # Backpropagation
        loss.backward()
        optimizer.step()

    if (epoch+1)%50 == 0:
        torch.save(gcn_model, '/content/drive/MyDrive/RAC/model_7.pth')
        print("Epoch "+str(epoch+1)+", loss: "+str(loss))

  loss = F.l1_loss(out, y_train_label)


Epoch 50, loss: tensor(1.7294, grad_fn=<MeanBackward0>)
Epoch 100, loss: tensor(1.7231, grad_fn=<MeanBackward0>)
Epoch 150, loss: tensor(1.7229, grad_fn=<MeanBackward0>)
Epoch 200, loss: tensor(1.7223, grad_fn=<MeanBackward0>)
Epoch 250, loss: tensor(1.7230, grad_fn=<MeanBackward0>)
Epoch 300, loss: tensor(1.7220, grad_fn=<MeanBackward0>)
Epoch 350, loss: tensor(1.7218, grad_fn=<MeanBackward0>)
Epoch 400, loss: tensor(1.7218, grad_fn=<MeanBackward0>)
Epoch 450, loss: tensor(1.7218, grad_fn=<MeanBackward0>)
Epoch 500, loss: tensor(1.7218, grad_fn=<MeanBackward0>)


In [None]:
# gcn_model = torch.load('/content/drive/MyDrive/RAC Project/model/model_2000.pth')

In [29]:
G_test = nx.Graph(crs='EPSG:4326')
# G_test.crs
for idx, row in test_nodes.iterrows():
    G_test.add_node(idx, x=row['x'], y=row['y'])
for idx, row in test_edges.iterrows():
    G_test.add_edge(row.name[0], row.name[1], oneway=row['oneway'])

G_test.add_edges_from(nx.selfloop_edges(G_test))

x_test = torch.tensor(test_nodes[['x', 'y']].values, dtype=torch.float)
edge_index_test = torch.tensor(np.array(list(G_test.edges())).T, dtype=torch.long)
edge_index_test = edge_index_test.remainder(len(test_nodes))  # Ensure node indices are within bounds
edge_attr_test = torch.tensor(test_edges['oneway'].values, dtype=torch.float)

# Create DataLoader for test data
test_data = Data(x=x_test, edge_index=edge_index_test, edge_attr=edge_attr_test)
test_loader = DataLoader([test_data], batch_size=1)

In [45]:
x_val = torch.tensor(nodes[['x', 'y']].values, dtype=torch.float)
edge_index_val = torch.tensor(np.array(list(G.edges())).T, dtype=torch.long)
edge_index_val = edge_index_val.remainder(len(nodes))  # Ensure node indices are within bounds
edge_attr_val = torch.tensor(edges['oneway'].values, dtype=torch.float)

# Create DataLoader for test data
val_data = Data(x=x_val, edge_index=edge_index_val, edge_attr=edge_attr_val)
val_loader = DataLoader([val_data], batch_size=1)

In [41]:
gcn_model.eval()

# Perform predictions
predictions = []
for data in test_loader:
    with torch.no_grad():
        out = gcn_model(data)
        predicted_classes = out.argmax(dim=1)

        predictions.append(predicted_classes)

# Convert predictions to numpy array
predictions = torch.cat(predictions).numpy()

In [33]:
gcn_model.eval()

# Perform predictions
predictions = []
for data in loader:
    with torch.no_grad():
        out = gcn_model(data)
        predicted_classes = out.argmax(dim=1)

        predictions.append(predicted_classes)

# Convert predictions to numpy array
predictions_train = torch.cat(predictions).numpy()

In [34]:
y_train_values = y_train.values.flatten()  # Convert to a 1D array

# Assuming predictions and y_test_values are numpy arrays
num_correct = np.sum(predictions_train == y_train_values)

# Calculate the total number of samples
total_samples = len(predictions_train)

# Calculate accuracy
accuracy = num_correct / total_samples
print("Accuracy:", accuracy)

Accuracy: 0.8289800087552897


In [39]:
from sklearn.metrics import f1_score

f1 = f1_score(y_train_values, predictions_train, average='weighted')  # You can also use 'micro' or 'weighted'

print(f'F1-score: {f1}')

F1-score: 0.752270559574432


In [42]:
y_test_values = y_test.values.flatten()  # Convert to a 1D array

# Assuming predictions and y_test_values are numpy arrays
num_correct = np.sum(predictions == y_test_values)

# Calculate the total number of samples
total_samples = len(predictions)

# Calculate accuracy
accuracy = num_correct / total_samples
print("Accuracy:", accuracy)

Accuracy: 0.8208868144690782


In [43]:
from sklearn.metrics import f1_score

f1 = f1_score(y_test_values, predictions, average='weighted')  # You can also use 'micro' or 'weighted'

print(f'F1-score: {f1}')

F1-score: 0.7401395373008606


In [46]:
gcn_model.eval()

# Perform predictions
predictions = []
for data in val_loader:
    with torch.no_grad():
        out = gcn_model(data)
        predicted_classes = out.argmax(dim=1)
        # predicted_classes = torch.where(predicted_classes == 0, 5, predicted_classes)
        # predicted_classes = torch.where(predicted_classes == 4, 0, predicted_classes)
        # predicted_classes = torch.where(predicted_classes == 5, 4, predicted_classes)

        predictions.append(predicted_classes)

# Convert predictions to numpy array
predictions = torch.cat(predictions).numpy()

In [47]:
nodes['risk_lvl'] = predictions
nodes['osmid'] = nodes['osmid_original']

In [None]:
nodes.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 27946 entries, 0 to 27940
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   osmid_original  27946 non-null  object  
 1   y               27946 non-null  float64 
 2   x               27946 non-null  float64 
 3   street_count    27946 non-null  int64   
 4   lon             20903 non-null  float64 
 5   lat             20903 non-null  float64 
 6   highway         185 non-null    object  
 7   geometry        27946 non-null  geometry
 8   risk_lvl        27946 non-null  int64   
 9   osmid           27946 non-null  object  
dtypes: float64(4), geometry(1), int64(2), object(3)
memory usage: 2.3+ MB


In [None]:
nodes[['osmid', 'risk_lvl']].to_csv('predicted_nodes.csv', index=False)

In [48]:
from collections import Counter
count = Counter(predictions)

# Print the counts for each class from 0 to 4
for i in range(5):
    print(f"Class {i}: {count[i]}")

Class 0: 27946
Class 1: 0
Class 2: 0
Class 3: 0
Class 4: 0


In [None]:
from sklearn.metrics import r2_score

# Assuming y_true contains the true target values and y_pred contains the predicted target values

# Calculate R2 score
r2 = r2_score(y_test, predictions)

print("R2 score:", r2)

R2 score: -0.23363716522718314


In [None]:
y_test_values = y_test.values.flatten()  # Convert to a 1D array

# Assuming predictions and y_test_values are numpy arrays
num_correct = np.sum(predictions == y_test_values)

# Calculate the total number of samples
total_samples = len(predictions)

# Calculate accuracy
accuracy = num_correct / total_samples
print("Accuracy:", accuracy)

ValueError: operands could not be broadcast together with shapes (27946,) (1714,) 