# Create Pytorch Geometric Data Objects

In [1]:
import pandas as pd
import osmnx as ox
import networkx as nx
from tqdm import tqdm
import geopandas as gpd
from utils import createPytorchData

## Cargar datos de santiago y mapa de OSM

In [2]:
nodos_santiago = pd.read_csv('Data/dataset_santiago_ismt.csv')
nodos_santiago.head(2)

Unnamed: 0,latlong,beautiful,boring,depressing,lively,safe,wealthy,lat,lon,node_id,division,prom_ismt,hogares,hog_40pct,pct_hog40p,AVE_GSE
0,"-33.323944,-70.51263428391168",-0.306948,1.565049,0.572029,-1.137733,-0.120456,-0.561887,-33.323944,-70.512634,1396118148,Lo Barnechea,0.915742,1087.0,10,0.0092,ABC1
1,"-33.323944,-70.5127291",-0.421388,0.309495,0.368965,-0.098733,-0.103042,-0.162294,-33.323944,-70.512729,1396118148,Lo Barnechea,0.915742,1087.0,10,0.0092,ABC1


In [16]:
### commune filter
# nodos_santiago = nodos_santiago[nodos_santiago.division.isin(['Santiago', 'Providencia', 'Ñuñoa', 'San Joaquín', 'San Miguel', 'Pedro Aguirre Cerda', 'Estación Central',
#                                                               'Quinta Normal', 'Independencia', 'Recoleta'])]
# nodos_santiago.division.value_counts()

division
Santiago               4468
Ñuñoa                  3357
Providencia            2750
Recoleta               2740
Estación Central       2581
Quinta Normal          2447
San Miguel             1965
San Joaquín            1931
Pedro Aguirre Cerda    1808
Independencia          1417
Name: count, dtype: int64

### Cambiar nombre comuna a int

In [3]:
nodos_santiago['division_num'] = pd.Categorical(nodos_santiago['division']).codes
print(nodos_santiago[['division', 'division_num']].drop_duplicates()[:5])

          division  division_num
0     Lo Barnechea            17
3341      Vitacura            41
3856        Colina             4
3893     Quilicura            30
4817         Lampa            15


In [4]:
num2div = {}
div2num = {}
for _, row in nodos_santiago[['division', 'division_num']].drop_duplicates().iterrows():
    num2div[row.division_num] = row.division
    div2num[row.division] = row.division_num

In [5]:
import pickle
with open('Data/Comunas/division2numbers.pkl', 'wb') as fp:
    pickle.dump(div2num, fp)
with open('Data/Comunas/numbers2division.pkl', 'wb') as fp:
    pickle.dump(num2div, fp)

### Cambiar categoría GSE a int

In [19]:
nodos_santiago['ave_gse_num'] = pd.Categorical(nodos_santiago['AVE_GSE']).codes
print(nodos_santiago[['AVE_GSE', 'ave_gse_num']].drop_duplicates()[:5])

      AVE_GSE  ave_gse_num
10538       D            3
18666      C2            1
23497    ABC1            0
28019      C3            2


In [20]:
num2gse = {}
gse2num = {}
for _, row in nodos_santiago[['AVE_GSE', 'ave_gse_num']].drop_duplicates().iterrows():
    num2gse[row.ave_gse_num] = row.AVE_GSE
    gse2num[row.AVE_GSE] = row.ave_gse_num

In [8]:
import pickle
with open('Data/ISMT/gse2numbers.pkl', 'wb') as fp:
    pickle.dump(gse2num, fp)
with open('Data/ISMT/numbers2gse.pkl', 'wb') as fp:
    pickle.dump(num2gse, fp)

In [21]:
G = ox.load_graphml("Maps/santiago_drive.graphml")
print(f'Number of nodes: {len(G.nodes())}')
print(f'Number of edges: {len(G.edges())}')
print(f'Number of strongly conncected components: {nx.number_strongly_connected_components(G)}')

Number of nodes: 355936
Number of edges: 673565
Number of strongly conncected components: 2450


In [22]:
components_length = [len(list(component)) for component in list(nx.strongly_connected_components(G))]
components_length.sort(reverse=True)
print(f"{round(components_length[0]/len(G.nodes()),5) * 100}% of nodes in one component")

99.259% of nodes in one component


## Add values to the complementary nodes of the OSM Graph

In [21]:
def get_nodes_attrs(G):
    attributes = {}
    for node, attrs in G.nodes.data():
        #print(f"Atributos de {node}: {', '.join(attrs.keys())}")
        for att in attrs.keys():
            if att not in attributes:
                attributes[att] = 0
            attributes[att] += 1
    return attributes

def get_edges_attrs(G):
    edges_atts = {}
    for u, v, attr in G.edges.data():
        for att in attr.keys():
            if att not in edges_atts:
                edges_atts[att] = 0
            edges_atts[att] += 1
    return edges_atts

In [28]:
# Se agregan los atributos como "boring", "lively" etc. a los nodos existentes que se encuentran en el grafo G
for _, row in nodos_santiago.iterrows():
    G.add_node(row.node_id, beautiful=row.beautiful, boring=row.boring,
               depressing=row.depressing, lively=row.lively,
               safe=row.safe, wealthy=row.wealthy, division=row.division_num, ismt=row.prom_ismt,
               houses=row.hogares, hog_40pct=row.hog_40pct, pct_hog40p=row.pct_hog40p, ave_gse= row.ave_gse_num)

In [29]:
get_nodes_attrs(G)

{'beautiful': 17976,
 'boring': 17976,
 'depressing': 17976,
 'lively': 17976,
 'safe': 17976,
 'wealthy': 17976,
 'division': 17976,
 'ismt': 17976,
 'houses': 17976,
 'hog_40pct': 17976,
 'pct_hog40p': 17976,
 'ave_gse': 17976}

In [18]:
# Eliminar atributos extras de OSM
for node in G.nodes:
    if 'ref' in G.nodes[node]:
        G.nodes[node].pop('ref', None)
    if 'highway' in G.nodes[node]:
        G.nodes[node].pop('highway', None)
    if 'street_count' in G.nodes[node]:
        G.nodes[node].pop('street_count', None)

In [19]:
for u, v, attr in G.edges.data():
    for name in ['osmid', 'oneway', 'lanes', 'name', 'access', 'maxspeed', 'ref', 'bridge', 'junction', 'width', 'tunnel', 'osmid' 'oneway', 'highway', 'reversed', 'speed_kph']:
        if name in attr:
            del attr[name]

In [20]:
get_nodes_attrs(G)

{'y': 355936,
 'x': 355936,
 'beautiful': 84252,
 'boring': 84252,
 'depressing': 84252,
 'lively': 84252,
 'safe': 84252,
 'wealthy': 84252,
 'division': 84252,
 'ismt': 84252,
 'houses': 84252,
 'hog_40pct': 84252,
 'pct_hog40p': 84252,
 'ave_gse': 84252}

In [42]:
df_comunas = gpd.read_file('Data/Comunas/COMUNAS_2020.shp')
df_ismt = gpd.read_file('Data/ISMT/ISMT.shp')
santiago_division = df_comunas[df_comunas.CUT_REG == '13']

def getDivision(point):
    for div in santiago_division.geometry:
        if point.within(div):
            return div
        
def getIsmtData(point):
    for com in df_ismt.geometry:
        if point.within(com):
            return com

def setDivisionNum(division: str):
    return div2num[division]

def setGSENum(gse):
    if gse in ['ABC1', 'C2', 'C3', 'D', 'E']:
        return gse2num[gse]
    return

def geoDataProcess(df_extra):
    # Obtener latitud y longitug
    lats = []
    lons = []
    for _, row in df_extra.iterrows():
        node = row.node_id
        lats.append(G.nodes[node]['y'])
        lons.append(G.nodes[node]['x'])
    df_extra['lat'] = lats
    df_extra['lon'] = lons

    coords_point = gpd.GeoDataFrame(df_extra, geometry=gpd.points_from_xy(df_extra.lon,df_extra.lat))
    coords_point['division'] = coords_point.geometry.apply(getDivision)
    tqdm.pandas()
    coords_point['geo_ismt'] = coords_point.geometry.progress_apply(getIsmtData)
    
    df_merge = coords_point.merge(santiago_division[['geometry', 'COMUNA']], left_on='division', right_on='geometry')
    df_merge = df_merge.drop(['geometry_x', 'geometry_y', 'division'], axis=1)
    df_merge = df_merge.rename({'COMUNA':'division'}, axis=1)
    df_merge['division_num'] = df_merge.division.apply(setDivisionNum)
    
    df_merge = df_merge.merge(df_ismt[['geometry', 'prom_ismt', 'hogares', 'hog_40pct', 'pct_hog40p', 'AVE_GSE']], left_on='geo_ismt', right_on='geometry', how='left')
    df_extra = df_merge.drop(['geometry', 'geo_ismt'], axis=1)
    df_extra = df_extra.rename({'AVE_GSE':'ave_gse'}, axis=1)
    df_extra['ave_gse_num'] = df_extra.ave_gse.apply(setGSENum)

    return df_extra

In [43]:
ls_nodos_santiago = list(nodos_santiago.node_id.unique())
extra_nodes = list(set(G.nodes()) - set(ls_nodos_santiago))
len(extra_nodes) + 84252

355936

## Zero Map

In [44]:
# Setear valores de nodos extras en 0
ls_nodos_santiago = list(nodos_santiago.node_id.unique())
extra_nodes = list(set(G.nodes()) - set(ls_nodos_santiago))
print(f"Number of nodes: {len(extra_nodes)}")
beautiful_values = [0] * len(extra_nodes)
boring_values = [0] * len(extra_nodes)
depressing_values = [0] * len(extra_nodes)
lively_values = [0] * len(extra_nodes)
safe_values = [0] * len(extra_nodes)
wealthy_values = [0] * len(extra_nodes)

columns_name = ['node_id', 'beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy']
df_extra = pd.DataFrame(list(zip(extra_nodes, beautiful_values, boring_values, depressing_values,
                                 lively_values, safe_values, wealthy_values)), columns=columns_name)


df_extra = geoDataProcess(df_extra)

Number of nodes: 271684


100%|██████████| 271684/271684 [47:13<00:00, 95.90it/s] 


In [45]:
df_extra

Unnamed: 0,node_id,beautiful,boring,depressing,lively,safe,wealthy,lat,lon,division,division_num,prom_ismt,hogares,hog_40pct,pct_hog40p,ave_gse,ave_gse_num
0,9565110273,0,0,0,0,0,0,-33.600032,-70.591930,Puente Alto,29,0.851613,1305.0,206.0,0.157854,C2,1.0
1,9565110274,0,0,0,0,0,0,-33.599970,-70.591940,Puente Alto,29,0.851613,1305.0,206.0,0.157854,C2,1.0
2,2393618954,0,0,0,0,0,0,-33.580565,-70.605937,Puente Alto,29,0.778635,1336.0,683.0,0.511228,D,3.0
3,9565110275,0,0,0,0,0,0,-33.600001,-70.591939,Puente Alto,29,0.851613,1305.0,206.0,0.157854,C2,1.0
4,9538837257,0,0,0,0,0,0,-33.621177,-70.586392,Puente Alto,29,0.749758,1133.0,741.0,0.654016,D,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271679,602912427,0,0,0,0,0,0,-33.327393,-70.624826,Colina,4,,,,,,
271680,602912428,0,0,0,0,0,0,-33.327083,-70.624610,Colina,4,,,,,,
271681,602914409,0,0,0,0,0,0,-33.330418,-70.627947,Colina,4,,,,,,
271682,602914410,0,0,0,0,0,0,-33.329770,-70.627731,Colina,4,,,,,,


In [46]:
df_extra['ave_gse_num'] = df_extra.ave_gse_num.fillna(-1)
df_extra = df_extra.fillna(0)


In [47]:
df_extra

Unnamed: 0,node_id,beautiful,boring,depressing,lively,safe,wealthy,lat,lon,division,division_num,prom_ismt,hogares,hog_40pct,pct_hog40p,ave_gse,ave_gse_num
0,9565110273,0,0,0,0,0,0,-33.600032,-70.591930,Puente Alto,29,0.851613,1305.0,206.0,0.157854,C2,1.0
1,9565110274,0,0,0,0,0,0,-33.599970,-70.591940,Puente Alto,29,0.851613,1305.0,206.0,0.157854,C2,1.0
2,2393618954,0,0,0,0,0,0,-33.580565,-70.605937,Puente Alto,29,0.778635,1336.0,683.0,0.511228,D,3.0
3,9565110275,0,0,0,0,0,0,-33.600001,-70.591939,Puente Alto,29,0.851613,1305.0,206.0,0.157854,C2,1.0
4,9538837257,0,0,0,0,0,0,-33.621177,-70.586392,Puente Alto,29,0.749758,1133.0,741.0,0.654016,D,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271679,602912427,0,0,0,0,0,0,-33.327393,-70.624826,Colina,4,0.000000,0.0,0.0,0.000000,0,-1.0
271680,602912428,0,0,0,0,0,0,-33.327083,-70.624610,Colina,4,0.000000,0.0,0.0,0.000000,0,-1.0
271681,602914409,0,0,0,0,0,0,-33.330418,-70.627947,Colina,4,0.000000,0.0,0.0,0.000000,0,-1.0
271682,602914410,0,0,0,0,0,0,-33.329770,-70.627731,Colina,4,0.000000,0.0,0.0,0.000000,0,-1.0


In [48]:
for _, row in tqdm(df_extra.iterrows()):
    G.add_node(row.node_id, beautiful=row.beautiful, boring=row.boring,
               depressing=row.depressing, lively=row.lively,
               safe=row.safe, wealthy=row.wealthy, division=row.division_num, ismt=row.prom_ismt,
               houses=row.hogares, hog_40pct=row.hog_40pct, pct_hog40p=row.pct_hog40p, ave_gse= row.ave_gse_num)

271684it [00:21, 12819.64it/s]


In [31]:
nodes_gse = {}
nodes_without_label = []
for node, att in G.nodes(data=True):
    gse = att['ave_gse']
    if gse not in nodes_gse:
        nodes_gse[gse] = 0
    nodes_gse[gse] += 1
    if gse == '-1.0':
        nodes_without_label.append(node)
nodes_gse

{'1.0': 63825,
 '1': 17667,
 '2.0': 56542,
 '3': 38340,
 '0.0': 40026,
 '0': 9517,
 '3.0': 91886,
 '2': 18574,
 '-1.0': 18562,
 '4.0': 843,
 '4': 154}

In [32]:
print(f'Nodes to remove: {len(nodes_without_label)}')
G.remove_nodes_from(nodes_without_label)

Nodes to remove: 18562


In [33]:
nodes_ls = list(G.nodes())
c=0
dic_nodes = {}
for node in nodes_ls:
    dic_nodes[node] = c
    c+=1

G = nx.relabel_nodes(G, dic_nodes)

In [7]:
get_nodes_attrs(G)

{'y': 337374,
 'x': 337374,
 'beautiful': 337374,
 'boring': 337374,
 'depressing': 337374,
 'lively': 337374,
 'safe': 337374,
 'wealthy': 337374,
 'division': 337374,
 'ismt': 337374,
 'houses': 337374,
 'hog_40pct': 337374,
 'pct_hog40p': 337374,
 'ave_gse': 337374}

In [35]:
#ox.save_graphml(G, "Maps/santiago_drive_zero.graphml")

In [5]:
G = ox.load_graphml("Maps/santiago_drive_zero.graphml")

In [37]:
createPytorchData(G, file_name='santiago_zero_ismt')

Number of nodes: 337374
Number of edges: 641693
Number of edge attributes: 641693
Size of x tensor: torch.Size([337374, 14])
Size of edge_index tensor: torch.Size([2, 641693])
Size of edge_attributes tensor: torch.Size([641693, 2])
Graph saved as PyTorch in Data/santiago_zero_ismt.pt


## Filter Divisions

In [7]:
numbers_commune_filter = []
divisions = ['Santiago', 'Providencia', 'Ñuñoa', 'San Joaquín', 'San Miguel', 'Pedro Aguirre Cerda', 'Estación Central', 'Quinta Normal', 'Independencia', 'Recoleta']
for elem in divisions:
    numbers_commune_filter.append(div2num[elem])

numbers_commune_filter

[39, 27, 42, 35, 37, 23, 7, 31, 9, 32]

In [19]:
nodes_to_remove = []
for node, attrs in G.nodes(data=True):
    if int(attrs['division']) not in numbers_commune_filter:
        nodes_to_remove.append(node)

print(f"Nodes reduced from {len(list(G.nodes()))} to {len(list(G.nodes())) - len(nodes_to_remove)}")
G.remove_nodes_from(nodes_to_remove)
len(list(G.nodes()))

Nodes reduced from 337374 to 56766


56766

In [22]:
get_nodes_attrs(G)

{'y': 56766,
 'x': 56766,
 'beautiful': 56766,
 'boring': 56766,
 'depressing': 56766,
 'lively': 56766,
 'safe': 56766,
 'wealthy': 56766,
 'division': 56766,
 'ismt': 56766,
 'houses': 56766,
 'hog_40pct': 56766,
 'pct_hog40p': 56766,
 'ave_gse': 56766}

In [23]:
get_edges_attrs(G)

{'length': 96737, 'travel_time': 96737}

In [26]:
nodes_ls = list(G.nodes())
c=0
dic_nodes = {}
for node in nodes_ls:
    dic_nodes[node] = c
    c+=1

G = nx.relabel_nodes(G, dic_nodes)

In [27]:
#ox.save_graphml(G, "Maps/santiago_downtown.graphml")

In [28]:
createPytorchData(G, file_name='santiago_downtown_ismt')

Number of nodes: 56766
Number of edges: 96737
Number of edge attributes: 96737
Size of x tensor: torch.Size([56766, 14])
Size of edge_index tensor: torch.Size([2, 96737])
Size of edge_attributes tensor: torch.Size([96737, 2])
Graph saved as PyTorch in Data/santiago_downtown_ismt.pt


## Mean Global

In [35]:
# Setear valores de nodos extras en media global
ls_nodos_santiago = list(nodos_santiago.node_id.unique())
extra_nodes = list(set(G.nodes()) - set(ls_nodos_santiago))

beautiful_values = [nodos_santiago['beautiful'].median()] * len(extra_nodes)
boring_values = [nodos_santiago['boring'].median()] * len(extra_nodes)
depressing_values = [nodos_santiago['depressing'].median()] * len(extra_nodes)
lively_values = [nodos_santiago['lively'].median()] * len(extra_nodes)
safe_values = [nodos_santiago['safe'].median()] * len(extra_nodes)
wealthy_values = [nodos_santiago['wealthy'].median()] * len(extra_nodes)

columns_name = ['node_id', 'beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy']
df_extra = pd.DataFrame(list(zip(extra_nodes, beautiful_values, boring_values, depressing_values,
                                 lively_values, safe_values, wealthy_values)), columns=columns_name)


df_extra = geoDataProcess(df_extra)

In [37]:
for _, row in tqdm(df_extra.iterrows()):
    G.add_node(row.node_id, beautiful=row.beautiful, boring=row.boring,
               depressing=row.depressing, lively=row.lively,
               safe=row.safe, wealthy=row.wealthy, comuna=row.comuna_num)

268038it [00:18, 14178.12it/s]


In [38]:
ox.save_graphml(G, "Maps/santiago_drive_global_mean.graphml")

In [39]:
createPytorchData(G, file_name='santiago_global_mean')

Number of nodes: 355936
Number of edges: 673565
Number of edge attributes: 673565
Size of x tensor: torch.Size([355936, 9])
Size of edge_index tensor: torch.Size([2, 673565])
Size of edge_attributes tensor: torch.Size([673565, 2])
Graph saved as PyTorch in Data/santiago_global_mean.pt
