# Dependencias

Instalación de Spark (versión 3.4.1)

In [None]:
!wget -q https://dlcdn.apache.org/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz

In [None]:
# Unpack the downloaded Spark archive into the current working directory.
!tar xf spark-3.4.1-bin-hadoop3.tgz

In [None]:
# Delete the archive now that its contents have been extracted.
!rm spark-3.4.1-bin-hadoop3.tgz

In [None]:
# Rename the extracted folder to a version-independent name
# (SPARK_HOME is later set to /content/spark).
!mv spark-3.4.1-bin-hadoop3 spark

Instalación de las librerías necesarias

In [None]:
!pip install pyspark osmnx h3 networkit igraph

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting osmnx
  Downloading osmnx-1.6.0-py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.4/101.4 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h3
  Downloading h3-3.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting networkit
  Downloading networkit-10.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m87.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting igraph
  Downloading igraph-0.10.6-cp39-abi3-manylinux_2_17_x86_64.manylinu

# Librerías

Importamos todas las librerías necesarias y configuramos las variables de entorno para que funcione Spark.

In [1]:
import os
import h3
import networkit as nk
import osmnx as ox
import networkx as nx
import numpy as np
import pandas as pd
import geopandas as gpd
import igraph as ig
from shapely import wkt
from geopy.distance import geodesic
from scipy.stats import gmean
from functools import reduce
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, LongType, ArrayType
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Locations of the Java runtime and of the Spark distribution.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
        "SPARK_HOME": "/content/spark",
    }
)

# Put the bin/ directory of both installations at the front of PATH.
os.environ["PATH"] = ":".join(
    [
        os.environ["JAVA_HOME"] + "/bin",
        os.environ["SPARK_HOME"] + "/bin",
        os.environ["PATH"],
    ]
)

# Construcción del grafo

Se descarga el grafo de la red vial de Lima y Callao directamente de OpenStreetMap, filtrando con el shape de dichas provincias. Adicionalmente, se especifica la velocidad de cada tipo de vía para calcular el tiempo de traslado.

In [21]:
# Load the graph nodes from CSV (first column is the index) and parse the WKT
# text of the geometry column into shapely objects.
nodes = pd.read_csv("data/external/GrafoLima_nodes.csv", index_col=[0]).assign(
    geometry=lambda frame: frame["geometry"].apply(wkt.loads)
)
# Wrap as a GeoDataFrame in EPSG:4326, keeping only the coordinate columns.
node_columns = ["y", "x", "geometry"]
nodes = gpd.GeoDataFrame(nodes, crs="epsg:4326")[node_columns]
nodes

Unnamed: 0_level_0,y,x,geometry
osmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
258066779,-12.371360,-76.784678,POINT (-76.78468 -12.37136)
258066780,-12.376318,-76.779559,POINT (-76.77956 -12.37632)
258066781,-12.384553,-76.771046,POINT (-76.77105 -12.38455)
258066783,-12.385783,-76.769768,POINT (-76.76977 -12.38578)
258067237,-12.304254,-76.840727,POINT (-76.84073 -12.30425)
...,...,...,...
11074106301,-11.822343,-77.161338,POINT (-77.16134 -11.82234)
11074106303,-11.821708,-77.161237,POINT (-77.16124 -11.82171)
11074106305,-11.821329,-77.161223,POINT (-77.16122 -11.82133)
11074106311,-11.823040,-77.150128,POINT (-77.15013 -11.82304)


In [22]:
# Load the graph edges from CSV; the first three columns (u, v, key) form the index.
edges = pd.read_csv("data/external/GrafoLima_edgeswaze.csv", index_col=[0, 1, 2]).assign(
    geometry=lambda frame: frame["geometry"].apply(wkt.loads)
)
# Keep the relevant attributes and expose the Waze minutes under the name
# "travel_time", which is the weight used by the routing step.
edge_columns = ["highway", "length", "wazeinfo_minutes", "geometry"]
edges = gpd.GeoDataFrame(edges, crs="epsg:4326")[edge_columns].rename(
    columns={"wazeinfo_minutes": "travel_time"}
)
# Negative travel times are replaced by zero.
negative_time = edges["travel_time"] < 0
edges.loc[negative_time, "travel_time"] = 0
edges

  edges = pd.read_csv("data/external/GrafoLima_edgeswaze.csv", index_col=[0,1,2])


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,highway,length,travel_time,geometry
u,v,key,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
258066779,10985400003,0,motorway_link,109.492,0.050000,"LINESTRING (-76.78468 -12.37136, -76.78466 -12..."
258066779,258066780,0,motorway,782.922,0.450000,"LINESTRING (-76.78468 -12.37136, -76.78433 -12..."
258066780,258066781,0,trunk,1301.315,0.033333,"LINESTRING (-76.77956 -12.37632, -76.77670 -12..."
258066781,258066783,0,trunk,194.871,1.750000,"LINESTRING (-76.77105 -12.38455, -76.76977 -12..."
258066781,495302399,0,trunk_link,85.154,0.100000,"LINESTRING (-76.77105 -12.38455, -76.77099 -12..."
...,...,...,...,...,...,...
11074106305,11074106303,0,residential,42.177,0.550000,"LINESTRING (-77.16122 -11.82133, -77.16124 -11..."
11074106311,5497647609,0,residential,126.496,0.233333,"LINESTRING (-77.15013 -11.82304, -77.15002 -11..."
11074106403,1273703683,0,residential,51.814,0.233333,"LINESTRING (-77.16524 -11.77607, -77.16571 -11..."
11074106403,11001190815,0,residential,61.585,0.083333,"LINESTRING (-77.16524 -11.77607, -77.16521 -11..."


In [23]:
# Build the OSMnx graph from the node and edge GeoDataFrames loaded above.
graph_osm = ox.graph_from_gdfs(gdf_nodes=nodes, gdf_edges=edges)

# Cálculo de probabilidades

In [24]:
# Start a Spark session named "HuFFz", or reuse the active one if it exists.
spark = SparkSession.builder.appName("HuFFz").getOrCreate()

Leemos los orígenes y destinos. Los orígenes se obtuvieron de los nodos más cercanos a los centroides de los hexágonos H3 de resolución 8; de igual forma, los destinos se obtuvieron de los nodos más cercanos a los establecimientos de salud.

In [25]:
# Spark schema of the origin nodes (all fields nullable).
origines_schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in [
            ("osmid_src", LongType()),
            ("lon_src", DoubleType()),
            ("lat_src", DoubleType()),
            ("population_orig", DoubleType()),
        ]
    ]
)
# Read the origins with pandas and give the id/coordinate columns the "_src" suffix.
origins_pdf = pd.read_csv("./data/outputs/origins.csv").rename(
    columns={"osmid": "osmid_src", "lat": "lat_src", "lon": "lon_src"}
)
# Hand the table over to Spark and preview the first rows.
origins = spark.createDataFrame(origins_pdf, schema=origines_schema)
origins.show(10)

+----------+-----------+-----------+---------------+
| osmid_src|    lon_src|    lat_src|population_orig|
+----------+-----------+-----------+---------------+
| 258067461|-76.9191087|-12.2576067|         1330.0|
| 495229959|-76.8713124|-12.2142518|         1353.0|
| 495280445|-76.7525382|-12.3932171|          213.0|
| 513858708|-76.8613233|-12.2823508|          939.0|
| 799255264| -76.774448|-12.4007079|          235.0|
| 914288043|-76.8525222|-12.2124228|         2605.0|
| 914296528|-76.8809431|-12.2552582|         1722.0|
| 914296867|-76.8631655|-12.2357146|         1631.0|
|1273471160|-76.9081893|-12.2179438|        20835.0|
|1273700338|-76.7756755|-12.3826749|         1649.0|
+----------+-----------+-----------+---------------+
only showing top 10 rows



In [26]:
# Spark schema of the destination facilities (all fields nullable).
destinos_schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in [
            ("CO_IPRESS", StringType()),
            ("NOMBRE", StringType()),
            ("DEPARTAMENTO", StringType()),
            ("PROVINCIA", StringType()),
            ("DISTRITO", StringType()),
            ("CATEGORIA", StringType()),
            ("CA_CONSULTORIOS", IntegerType()),
            ("CA_CONSULTORIOS_FN", IntegerType()),
            ("CA_CAMAS", IntegerType()),
            ("CA_MEDICOS_TOTAL", IntegerType()),
            ("attended_people", IntegerType()),
            ("age", IntegerType()),
            ("hex_id", StringType()),
            ("population_dest", IntegerType()),
            ("osmid_dst", LongType()),
            ("lon_dst", DoubleType()),
            ("lat_dst", DoubleType()),
        ]
    ]
)
# CO_IPRESS is read as text so its leading zeros survive; the count columns are
# forced to integers.
destination_dtypes = {"CO_IPRESS": str}
for count_column in (
    "CA_CONSULTORIOS",
    "CA_CONSULTORIOS_FN",
    "CA_CAMAS",
    "CA_MEDICOS_TOTAL",
    "attended_people",
    "population_dest",
):
    destination_dtypes[count_column] = int
destinations_pdf = pd.read_csv(
    "./data/outputs/destinations_hospitals.csv", dtype=destination_dtypes
).rename(columns={"osmid": "osmid_dst", "lat": "lat_dst", "lon": "lon_dst"})
# Hand the table over to Spark and preview the first rows.
destinations = spark.createDataFrame(destinations_pdf, schema=destinos_schema)
destinations.show(10)

+---------+--------------------+------------+---------+--------------------+---------+---------------+------------------+--------+----------------+---------------+---+---------------+---------------+----------+-----------+-----------+
|CO_IPRESS|              NOMBRE|DEPARTAMENTO|PROVINCIA|            DISTRITO|CATEGORIA|CA_CONSULTORIOS|CA_CONSULTORIOS_FN|CA_CAMAS|CA_MEDICOS_TOTAL|attended_people|age|         hex_id|population_dest| osmid_dst|    lon_dst|    lat_dst|
+---------+--------------------+------------+---------+--------------------+---------+---------------+------------------+--------+----------------+---------------+---+---------------+---------------+----------+-----------+-----------+
| 00005946|HOSPITAL NACIONAL...|        LIMA|     LIMA|         EL AGUSTINO|    III-1|             91|               110|     592|             513|          88114| 74|888e62c291fffff|          19757|1273969287| -76.992431| -12.039887|
| 00005947|HOSPITAL DE MEDIA...|        LIMA|     LIMA|     

Definimos la función que calcula la distancia geográfica entre dos puntos para poder filtrar según un radio establecido



In [27]:
def get_distance(lat_src, lon_src, lat_dst, lon_dst):
    """Return the geodesic distance, in kilometres, between two points.

    Each point is given as a latitude/longitude pair: ``(lat_src, lon_src)``
    is the start and ``(lat_dst, lon_dst)`` the end.
    """
    source_point = (lat_src, lon_src)
    target_point = (lat_dst, lon_dst)
    return geodesic(source_point, target_point).kilometers


# Spark UDF wrapper around get_distance that yields a double column.
get_distance_udf = F.udf(get_distance, returnType=DoubleType())

Calculamos la distancia geográfica desde cada origen a cada destino y conservamos solo los pares separados por 5 km o menos

In [28]:
# Columns carried forward for every origin-destination pair.
pair_columns = [
    "osmid_src", "osmid_dst", "lat_src", "lon_src", "lat_dst", "lon_dst",
    "attended_people", "distance_geodesic", "CA_CONSULTORIOS_FN", "CA_CAMAS",
    "CA_MEDICOS_TOTAL", "age", "population_orig", "population_dest",
]
# Combine every origin with every (broadcast) destination, compute the geodesic
# distance of each pair and keep the pairs that are at most 5 km apart.
pairs = (
    origins.crossJoin(F.broadcast(destinations))
    .withColumn(
        "distance_geodesic",
        get_distance_udf(F.col("lat_src"), F.col("lon_src"), F.col("lat_dst"), F.col("lon_dst")),
    )
    .select(*pair_columns)
    .where(F.col("distance_geodesic") <= 5)
)

In [29]:
# Collect the filtered origin-destination pairs into a pandas DataFrame and display it.
pairs = pairs.toPandas()
pairs

Unnamed: 0,osmid_src,osmid_dst,lat_src,lon_src,lat_dst,lon_dst,attended_people,distance_geodesic,CA_CONSULTORIOS_FN,CA_CAMAS,CA_MEDICOS_TOTAL,age,population_orig,population_dest
0,258067461,7096674239,-12.257607,-76.919109,-12.232415,-76.933526,148085,3.197985,56,356,449,7,1330.0,9595
1,1273471160,3792763316,-12.217944,-76.908189,-12.210672,-76.931792,126707,2.691349,54,64,102,23,20835.0,13764
2,1273471160,7096674239,-12.217944,-76.908189,-12.232415,-76.933526,148085,3.187995,56,356,449,7,20835.0,9595
3,1273731718,3792763316,-12.218726,-76.917551,-12.210672,-76.931792,126707,1.787505,54,64,102,23,10199.0,13764
4,1273731718,7096674239,-12.218726,-76.917551,-12.232415,-76.933526,148085,2.305338,56,356,449,7,10199.0,9595
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6115,11001048613,1767758594,-11.873915,-77.081176,-11.872936,-77.125156,19063,4.793018,25,90,87,15,9032.0,7751
6116,11001048613,1845880575,-11.873915,-77.081176,-11.863228,-77.079377,54265,1.198287,12,108,136,51,9032.0,17729
6117,11001477409,1845880575,-11.840183,-77.064551,-11.863228,-77.079377,54265,3.018022,12,108,136,51,3907.0,17729
6118,11028285175,1767758594,-11.853002,-77.113596,-11.872936,-77.125156,19063,2.539487,25,90,87,15,4925.0,7751


In [30]:
# Persist the pairs to disk; the next cell reloads them from this file.
pairs.to_csv("./data/steps/pairs.csv", index=False)

In [31]:
# Spark schema of the origin-destination pairs (all fields nullable).
pairs_schema = StructType([
    StructField("osmid_src", LongType(), True),
    StructField("osmid_dst", LongType(), True),
    StructField("lat_src", DoubleType(), True),
    StructField("lon_src", DoubleType(), True),
    StructField("lat_dst", DoubleType(), True),
    StructField("lon_dst", DoubleType(), True),
    StructField("attended_people", IntegerType(), True),
    StructField("distance_geodesic", DoubleType(), True),
    StructField("CA_CONSULTORIOS_FN", IntegerType(), True),
    StructField("CA_CAMAS", IntegerType(), True),
    StructField("CA_MEDICOS_TOTAL", IntegerType(), True),
    StructField("age", IntegerType(), True),
    StructField("population_orig", IntegerType(), True),
    StructField("population_dest", IntegerType(), True)
])
# Reload the pairs saved in the previous step with one explicit dtype per schema
# field. "age" is now listed (the dict previously repeated "attended_people"
# instead), so every IntegerType field is read as an integer.
pairs = pd.read_csv("./data/steps/pairs.csv", dtype={
    "osmid_src": int,
    "osmid_dst": int,
    "lat_src": float,
    "lon_src": float,
    "lat_dst": float,
    "lon_dst": float,
    "attended_people": int,
    "distance_geodesic": float,
    "CA_CONSULTORIOS_FN": int,
    "CA_CAMAS": int,
    "CA_MEDICOS_TOTAL": int,
    "age": int,
    "population_orig": int,
    "population_dest": int
})
# Hand the table back to Spark and preview the first rows.
pairs = spark.createDataFrame(pairs, schema=pairs_schema)
pairs.show(10)

+----------+----------+-----------+-----------+-----------+-----------+---------------+------------------+------------------+--------+----------------+---+---------------+---------------+
| osmid_src| osmid_dst|    lat_src|    lon_src|    lat_dst|    lon_dst|attended_people| distance_geodesic|CA_CONSULTORIOS_FN|CA_CAMAS|CA_MEDICOS_TOTAL|age|population_orig|population_dest|
+----------+----------+-----------+-----------+-----------+-----------+---------------+------------------+------------------+--------+----------------+---+---------------+---------------+
| 258067461|7096674239|-12.2576067|-76.9191087|-12.2324148| -76.933526|         148085|3.1979854521143207|                56|     356|             449|  7|           1330|           9595|
|1273471160|3792763316|-12.2179438|-76.9081893| -12.210672|-76.9317917|         126707| 2.691349399968469|                54|      64|             102| 23|          20835|          13764|
|1273471160|7096674239|-12.2179438|-76.9081893|-12.2324148| 

Definimos la función que calcula el camino más corto entre un origen y un destino respecto a una propiedad que en este caso es el tiempo de traslado

In [32]:
# Struct returned by the shortest-path UDF: the accumulated travel time and the
# sequence of node ids along the route.
return_schema = (
    StructType()
    .add("total_travel_time", DoubleType(), True)
    .add("path", ArrayType(LongType()), True)
)

In [33]:
def nx_dijkstra(origin, destination, graph):
    """Find the route between two nodes that minimises total travel time.

    Parameters
    ----------
    origin : node identifier
        Node of ``graph`` where the route starts.
    destination : node identifier
        Node of ``graph`` where the route ends.
    graph : networkx graph
        Graph whose edges carry a ``travel_time`` attribute, used as weight.

    Returns
    -------
    tuple
        ``(total_travel_time, path)`` where ``total_travel_time`` is a Python
        ``float`` and ``path`` a list of Python ``int`` node ids, or
        ``(None, None)`` when a node is missing from the graph or no route
        connects the two nodes.
    """
    try:
        distance, path = nx.single_source_dijkstra(
            G=graph,
            source=origin,
            target=destination,
            weight="travel_time"
        )
    except (nx.NetworkXNoPath, nx.NodeNotFound) as e:
        # Only the expected routing failures are tolerated; anything else is a
        # genuine error and is allowed to surface.
        print(e)
        return (None, None)
    # Return plain Python types that match the declared double / array<long>
    # result: an integer distance (e.g. 0 when origin == destination) or numpy
    # scalars would not match a DoubleType field.
    return (float(distance), [int(node) for node in path])

In [34]:
def _shortest_path_for_pair(orig, dest):
    """Run nx_dijkstra for one origin/destination pair on the notebook's graph_osm."""
    return nx_dijkstra(origin=orig, destination=dest, graph=graph_osm)


# Spark UDF returning the (total_travel_time, path) struct for each pair.
nx_dijkstra_udf = F.udf(_shortest_path_for_pair, returnType=return_schema)

Calculamos el camino más corto respecto al tiempo de traslado

In [35]:
# Attach the shortest-path struct to every pair ...
shortest_path = nx_dijkstra_udf(F.col("osmid_src"), F.col("osmid_dst"))
pairs = pairs.withColumn("dijkstra", shortest_path)
# ... and unpack its two fields into top-level columns.
for struct_field in ("total_travel_time", "path"):
    pairs = pairs.withColumn(struct_field, F.col("dijkstra").getItem(struct_field))

Construimos las probabilidades de traslado utilizando el modelo de Huff con una dimensión de atractividad que en este caso es la demanda de pacientes y el tiempo de traslado

In [36]:
@F.pandas_udf(DoubleType())
def gmean_udf(values: pd.Series) -> float:
    """Aggregate a group of values into their geometric mean.

    The ``pd.Series -> float`` type hints declare this as a series-to-scalar
    (grouped aggregate) pandas UDF, replacing the deprecated explicit
    ``PandasUDFType.GROUPED_AGG`` argument.
    """
    # scipy returns a numpy scalar; hand Spark a plain Python float.
    return float(gmean(values))

# All destinations that share an origin form one window.
window_spec = Window.partitionBy("osmid_src")

# Attractiveness of a destination is its attended people divided by the travel
# time; "p" is that value as a percentage of the origin's total.
probabilities = (
    pairs
    .withColumn("people_travel_time", F.col("attended_people") / F.col("total_travel_time"))
    .withColumn("sum_people_travel_time", F.sum("people_travel_time").over(window_spec))
    .withColumn("p", 100 * (F.col("people_travel_time") / F.col("sum_people_travel_time")))
)



Transformamos cada una de las variables según el modelo de Huff para su entrenamiento. Los parámetros resultantes se utilizaron para estimar las probabilidades anteriormente construidas

# Modelo de Huff

In [37]:
# Output column -> input column for the Huff transformation. Every variable is
# replaced by the log of its ratio to the geometric mean of that variable over
# the destinations of the same origin (window_spec).
huff_columns = {
    "A-p": "p",
    "A-distance": "distance_geodesic",
    "A-CA_CONSULTORIOS_FN": "CA_CONSULTORIOS_FN",
    "A-CA_CAMAS": "CA_CAMAS",
    "A-CA_MEDICOS_TOTAL": "CA_MEDICOS_TOTAL",
    "A-age": "age",
    "A-population": "population_dest",
}
for transformed_name, source_name in huff_columns.items():
    probabilities = probabilities.withColumn(
        transformed_name,
        F.log(F.col(source_name) / gmean_udf(F.col(source_name)).over(window_spec)),
    )

Retornamos el resultado del proceso de Spark a un dataframe de Pandas para utilizarlo en el entrenamiento del modelo de Huff

In [38]:
# Collect the Spark result into pandas and drop the rows in which the
# transformed distance or the transformed probability is missing.
train = probabilities.toPandas().dropna(subset=["A-distance", "A-p"])

In [39]:
# Explanatory variables (transformed attributes) and target (transformed probability).
feature_columns = [
    "A-distance",
    "A-CA_CONSULTORIOS_FN",
    "A-CA_CAMAS",
    "A-CA_MEDICOS_TOTAL",
    "A-age",
    "A-population",
]
X = train.loc[:, feature_columns].copy()
Y = train.loc[:, "A-p"].copy()

In [40]:
# Fit a Ridge regression of the transformed probability on the transformed
# attributes and report its fit on the same data it was trained on.
ridge = Ridge(alpha=0.05)
ridge.fit(X, Y)
pred_ridge = ridge.predict(X)
print("---RMSE---")
print(np.sqrt(mean_squared_error(Y, pred_ridge)))
print("---R2---")
print(r2_score(Y, pred_ridge))
# coef_ holds one weight per column of X; intercept_ is the scalar bias term,
# so each value is printed under the label that matches it.
print("---COEFFICIENTS---")
print(ridge.coef_)
print("---INTERCEPT---")
print(ridge.intercept_)

---RMSE---
0.8704170889861899
---R2---
0.5193809386367707
---INTERCEPT---
[-0.53630481  0.34883061 -0.01368363  0.64900845  0.2397081   0.00570233]
---COEFFICIENTS---
-2.0996871464320993e-17


Una vez estimados los parámetros del modelo de Huff, los utilizamos para calcular el nivel de accesibilidad para cada uno de los orígenes

# Accesibilidad

In [41]:
# Work on a copy so the training table itself is left untouched, then display it.
accesibility = train.copy()
accesibility

Unnamed: 0,osmid_src,osmid_dst,lat_src,lon_src,lat_dst,lon_dst,attended_people,distance_geodesic,CA_CONSULTORIOS_FN,CA_CAMAS,...,people_travel_time,sum_people_travel_time,p,A-p,A-distance,A-CA_CONSULTORIOS_FN,A-CA_CAMAS,A-CA_MEDICOS_TOTAL,A-age,A-population
0,108013939,392151331,-12.104828,-77.038988,-12.111820,-76.999323,147617,4.386609,71,443,...,201295.909091,6.671009e+06,3.017473,0.816900,0.545636,0.250658,1.557100,0.305445,0.680830,-0.044557
1,108013939,263635944,-12.104828,-77.038988,-12.128576,-77.018056,17553,3.477579,9,23,...,40506.923077,6.671009e+06,0.607208,-0.786403,0.313416,-1.814797,-1.400975,0.219224,0.023189,-0.102674
2,108013939,2807340550,-12.104828,-77.038988,-12.090280,-77.017664,155267,2.824694,315,186,...,93160.200000,6.671009e+06,1.396493,0.046444,0.105479,1.740551,0.689277,0.985463,0.133190,0.228400
3,108013939,10092527737,-12.104828,-77.038988,-12.130459,-77.019539,295,3.538541,20,19,...,680.769231,6.671009e+06,0.010205,-4.872408,0.330794,-1.016289,-1.592030,-1.765338,-1.029961,0.021054
4,108013939,874949943,-12.104828,-77.038988,-12.073401,-77.059096,183171,4.108309,123,121,...,646485.882353,6.671009e+06,9.690976,1.983675,0.480091,0.800163,0.259321,-0.678287,-0.904798,0.302863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6115,11066552216,4356245534,-11.943458,-77.016601,-11.966359,-77.002921,188019,2.939031,82,145,...,9300.197857,4.136534e+04,22.483066,0.517053,-0.294519,0.359305,0.493713,0.385921,0.315053,-0.033307
6116,11066552216,5714269603,-11.943458,-77.016601,-11.913937,-77.039563,58675,4.113478,35,311,...,4186.087990,4.136534e+04,10.119795,-0.281216,0.041670,-0.492066,1.256773,0.750564,0.883746,-0.214872
6117,11066552216,824306498,-11.943458,-77.016601,-11.968814,-76.994887,212705,3.668996,93,43,...,10478.078818,4.136534e+04,25.330573,0.636302,-0.072681,0.485186,-0.721820,0.269388,-0.399600,-0.033307
6118,11066552216,1273122697,-11.943458,-77.016601,-11.951614,-77.059758,10876,4.786563,32,25,...,773.175355,4.136534e+04,1.869138,-1.970232,0.193214,-0.581678,-1.264144,-1.821354,-0.399600,0.330654


In [42]:
# Exponent applied to each attribute in the Huff utility.
# NOTE(review): these hard-coded values do not match the coefficients in the
# saved output of the `ridge` model fitted earlier in this notebook - confirm
# which calibration run they come from before relying on the results.
HUFF_EXPONENTS = {
    "distance_geodesic": -0.68297695,
    "CA_CONSULTORIOS_FN": 0.48725242,
    "CA_CAMAS": 0.00607136,
    "CA_MEDICOS_TOTAL": 0.52974778,
    "age": 0.16048599,
    "population_dest": 0.41485212,
}

# Utility of a pair: product of every attribute raised to its exponent,
# multiplied in the order listed above.
utility = None
for attribute, exponent in HUFF_EXPONENTS.items():
    factor = accesibility[attribute] ** exponent
    utility = factor if utility is None else utility * factor
accesibility["u"] = utility

# Probability of choosing a destination: its share of the origin's total
# utility. The aggregation is named by string because passing the builtin
# `sum` to transform is deprecated in recent pandas releases.
accesibility["p"] = accesibility["u"] / accesibility.groupby("osmid_src")["u"].transform("sum")

# Accessibility of an origin: travel time to each destination weighted by the
# probability of going there, summed over its destinations.
accesibility["t*p"] = accesibility["total_travel_time"] * accesibility["p"]
final_accesibility = accesibility.groupby("osmid_src", as_index=False).agg(
    lat = ("lat_src", "first"),
    lon = ("lon_src", "first"),
    accesibility = ("t*p", "sum")
)

# Map every origin to its H3 cell at resolution 8 and average per cell.
final_accesibility["hex_id"] = final_accesibility.apply(lambda row: h3.geo_to_h3(row["lat"], row["lon"], 8), axis=1)
final_accesibility = final_accesibility.groupby("hex_id", as_index=False).agg(
    accesibility = ("accesibility", "mean")
)

# Save both the pair-level table and the per-hexagon result.
accesibility.to_csv("./data/outputs/raw_accesibility.csv", index=False)
final_accesibility.to_csv("./data/outputs/accesibility.csv", index=False)

In [44]:
# Display the accessibility value of each H3 hexagon.
final_accesibility

Unnamed: 0,hex_id,accesibility
0,888e621125fffff,4.333333
1,888e62112dfffff,5.616667
2,888e621165fffff,6.166667
3,888e621a01fffff,0.512325
4,888e621a03fffff,0.780770
...,...,...
965,888e75a493fffff,6.016667
966,888e75a497fffff,6.350000
967,888e75a49bfffff,5.966667
968,888e75a4b3fffff,6.400000


# Criticidad

In [45]:
# Criticity of a pair: the origin's population multiplied by the probability
# of travelling to that destination.
trip_probability = accesibility["p"]
accesibility["criticity"] = trip_probability.mul(accesibility["population_orig"])
accesibility

Unnamed: 0,osmid_src,osmid_dst,lat_src,lon_src,lat_dst,lon_dst,attended_people,distance_geodesic,CA_CONSULTORIOS_FN,CA_CAMAS,...,A-p,A-distance,A-CA_CONSULTORIOS_FN,A-CA_CAMAS,A-CA_MEDICOS_TOTAL,A-age,A-population,u,t*p,criticity
0,108013939,392151331,-12.104828,-77.038988,-12.111820,-76.999323,147617,4.386609,71,443,...,0.816900,0.545636,0.250658,1.557100,0.305445,0.680830,-0.044557,4979.905873,0.018110,124.116942
1,108013939,263635944,-12.104828,-77.038988,-12.128576,-77.018056,17553,3.477579,9,23,...,-0.786403,0.313416,-1.814797,-1.400975,0.219224,0.023189,-0.102674,1758.264322,0.003778,43.822192
2,108013939,2807340550,-12.104828,-77.038988,-12.090280,-77.017664,155267,2.824694,315,186,...,0.046444,0.105479,1.740551,0.689277,0.985463,0.133190,0.228400,20334.094623,0.168059,506.797859
3,108013939,10092527737,-12.104828,-77.038988,-12.130459,-77.019539,295,3.538541,20,19,...,-4.872408,0.330794,-1.016289,-1.592030,-1.765338,-1.029961,0.021054,795.621712,0.001710,19.829719
4,108013939,874949943,-12.104828,-77.038988,-12.073401,-77.059096,183171,4.108309,123,121,...,1.983675,0.480091,0.800163,0.259321,-0.678287,-0.904798,0.302863,3591.506986,0.005046,89.513110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6115,11066552216,4356245534,-11.943458,-77.016601,-11.966359,-77.002921,188019,2.939031,82,145,...,0.517053,-0.294519,0.359305,0.493713,0.385921,0.315053,-0.033307,7069.527942,6.347924,200.014556
6116,11066552216,5714269603,-11.943458,-77.016601,-11.913937,-77.039563,58675,4.113478,35,311,...,-0.281216,0.041670,-0.492066,1.256773,0.750564,0.883746,-0.214872,4595.595617,2.861003,130.020848
6117,11066552216,824306498,-11.943458,-77.016601,-11.968814,-76.994887,212705,3.668996,93,43,...,0.636302,-0.072681,0.485186,-0.721820,0.269388,-0.399600,-0.033307,5375.296266,4.846522,152.080521
6118,11066552216,1273122697,-11.943458,-77.016601,-11.951614,-77.059758,10876,4.786563,32,25,...,-1.970232,0.193214,-0.581678,-1.264144,-1.821354,-0.399600,0.330654,1020.725590,0.637723,28.878870


In [46]:
def criticity_sub_df(path, graph, criticity):
    """Expand a route into its edges, tagging every edge with ``criticity``.

    Parameters
    ----------
    path : sequence of node ids describing the route.
    graph : graph the route belongs to.
    criticity : value written to the ``criticity`` column of every edge.

    Returns
    -------
    pandas.DataFrame
        Columns ``osmid_src``, ``osmid_dst`` and ``criticity``, one row per
        edge of the route. If the route cannot be converted, the error is
        printed and an empty frame with those columns is returned.
    """
    try:
        route_edges = ox.utils_graph.route_to_gdf(graph, path, weight="travel_time")
        return (
            route_edges.reset_index()[["u", "v"]]
            .rename(columns={"u": "osmid_src", "v": "osmid_dst"})
            .assign(criticity=criticity)
        )
    except Exception as e:
        print(e)
        return pd.DataFrame(columns=["osmid_src", "osmid_dst", "criticity"])

In [47]:
# One edge table per origin-destination pair, built from its route and criticity.
dfs = [
    criticity_sub_df(route, graph_osm, pair_criticity)
    for route, pair_criticity in zip(accesibility["path"], accesibility["criticity"])
]

In [48]:
# Stack the per-route edge tables and add up the criticity that every edge
# receives from all the routes passing through it.
route_edges_all = pd.concat(dfs)
criticity_info = (
    route_edges_all
    .groupby(["osmid_src", "osmid_dst"], as_index=False)
    .agg(criticity=("criticity", "sum"))
)
criticity_info

Unnamed: 0,osmid_src,osmid_dst,criticity
0,31035109,31035110,31828.928866
1,31035110,31035112,31828.928866
2,31035112,7566583637,130740.621878
3,31035113,31035116,61731.487287
4,31035116,7566583638,93560.416154
...,...,...,...
27448,11070306493,1709362627,1199.000000
27449,11070724969,8164286912,636.000000
27450,11072576593,5560075044,1833.000000
27451,11072599905,5560075044,1241.000000


In [49]:
# Take the edges of the graph, attach the accumulated criticity of each
# (osmid_src, osmid_dst) pair and keep only the edges that received a value.
edge_attributes = ["u", "v", "highway", "length", "travel_time", "geometry"]
edges = (
    ox.graph_to_gdfs(graph_osm, nodes=False, edges=True)
    .reset_index()
    .loc[:, edge_attributes]
    .rename(columns={"u": "osmid_src", "v": "osmid_dst"})
    .merge(criticity_info, on=["osmid_src", "osmid_dst"], how="left")
    .dropna(subset=["criticity"])
    .copy()
)
# Min-max rescale the criticity to a 0-100 score.
lowest_criticity = edges["criticity"].min()
highest_criticity = edges["criticity"].max()
edges["score"] = 100 * (edges["criticity"] - lowest_criticity) / (highest_criticity - lowest_criticity)
edges

Unnamed: 0,osmid_src,osmid_dst,highway,length,travel_time,geometry,criticity,score
16,258067461,1245524078,motorway_link,371.796,0.016667,"LINESTRING (-76.91911 -12.25761, -76.91954 -12...",1330.0,0.223979
18,1245524078,1245524125,motorway_link,79.942,0.166667,"LINESTRING (-76.92081 -12.25505, -76.92074 -12...",8253.0,1.399574
32,266954089,1273847208,primary,12.635,0.000000,"LINESTRING (-76.91949 -12.25429, -76.91938 -12...",8253.0,1.399574
36,1273847208,1273893332,primary,623.466,1.066667,"LINESTRING (-76.91938 -12.25431, -76.91887 -12...",8253.0,1.399574
45,4284575917,4060113093,primary,1154.584,1.550000,"LINESTRING (-76.96710 -12.22979, -76.96732 -12...",2978.0,0.503826
...,...,...,...,...,...,...,...,...
395198,11001048613,11001048614,residential,14.522,0.000000,"LINESTRING (-77.08118 -11.87391, -77.08128 -11...",9032.0,1.531856
395251,11001453757,11001453799,residential,47.840,0.183333,"LINESTRING (-77.06527 -11.84027, -77.06540 -11...",3907.0,0.661580
395254,11001477410,11001453757,residential,48.023,0.183333,"LINESTRING (-77.06517 -11.83985, -77.06516 -11...",3907.0,0.661580
395285,11028285176,11028285177,residential,21.583,2.616667,"LINESTRING (-77.11358 -11.85339, -77.11338 -11...",4925.0,0.834446


In [50]:
# Save the edges together with their criticity and score.
edges.to_csv("./data/outputs/criticity.csv", index=False)

In [51]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()