# Measuring built environment

| Dimension | Aspect                 | Metric                                                       | Data source    | Progress |
|-----------|------------------------|--------------------------------------------------------------|----------------|----------|
| Node      | Access to destinations | Jobs                                                         | Jobs count     |          |
|           | Access to transit      | Transit stations density (/ hexagon area)                    | Open GTFS data | ✔        |
| Place     | Land use mix           | POI entropy                                                  | ?              |          |
|           | Building form          | Ground space index = Gross building footprint / hexagon area | Building data  | ✔        |
|           | Specific land use      | Public health care services                                  | Building data  | ✔        |
| Tie       | Pedestrian network     | Space syntax analysis: Street network detour                 | OpenStreetMap  |          |

Inspired by: Xiao, Longzhu, et al. "Predicting vibrancy of metro station areas considering spatial relationships through graph convolutional neural networks: The case of Shenzhen, China." Environment and Planning B: Urban Analytics and City Science 48.8 (2021): 2363-2384. [Link](https://www.doi.org/10.1177/2399808320977866)

In [1]:
# Enable IPython's autoreload extension so edited project modules are re-imported
# before each cell executes.
%load_ext autoreload
%autoreload 2
# Switch to the project root so the relative paths used below resolve.
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
%cd D:\mobi-social-segregation-se

D:\mobi-social-segregation-se


In [107]:
# Load libs
import pandas as pd
import geopandas as gpd
import fiona
from tqdm import tqdm
import routing_helpers as rhelpers
import preprocess
import sqlalchemy
from shapely.geometry import box

In [38]:
# Data location
# Read the PostgreSQL connection settings from the project's key store and
# open a SQLAlchemy engine against the database server on localhost.
user, password, port, db_name = (
    preprocess.keys_manager['database'][key]
    for key in ('user', 'password', 'port', 'name')
)
engine = sqlalchemy.create_engine(f'postgresql://{user}:{password}@localhost:{port}/{db_name}')

In [64]:
# Study area
# Area of one hexagon in km^2; used below as the denominator of the
# per-hexagon stop density and ground space index.
hex_area = 36.129062164
# Keep one (hex_id, geometry) record per hexagon.
gdf_hex = (
    gpd.read_file('results/mobi_seg_spatio_static.geojson')
    .loc[:, ['hex_id', 'geometry']]
    .drop_duplicates(subset=['hex_id'])
)

## 1. Download GTFS data

In [29]:
# Operator codes passed one at a time to the GTFS downloader.
regional_operators = [
    "blekinge", "dt", "dintur", "gotland", "halland", "jlt",
    "klt", "krono", "jamtland", "norrbotten", "vasterbotten", "orebro",
    "skane", "sl", "sormland", "ul", "vastmanland", "varm",
    "vt", "xt", "otraf", "sj",
]
# Fetch each operator's feed, with a progress bar over the operator list.
for operator_code in tqdm(regional_operators, desc='Downloading GTFS data'):
    rhelpers.gtfs_downloader(
        region='sweden',
        user='yuan',
        region_operator=operator_code,
        skip_country=False,
    )

Downloading GTFS data:   0%|          | 0/22 [00:00<?, ?it/s]

Downloading latest GTFS data for sweden...


Downloading GTFS data: 100%|██████████| 22/22 [05:53<00:00, 16.09s/it]  


### 1.1 Clean up shapes.txt and dump to the database

In [31]:
# Load the shape points of the downloaded GTFS feed (comma-separated shapes.txt)
# and preview the first rows.
df = pd.read_csv('dbs/gtfs_sweden_2023-02-06/sweden/shapes.txt', delimiter=',')
df.head()

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
0,1014010000482329256,59.341873,18.118316,1,0.0
1,1014010000482329256,59.341854,18.118403,2,0.0
2,1014010000482329256,59.341754,18.118334,3,11.74
3,1014010000482329256,59.34126,18.118048,4,69.17
4,1014010000482329256,59.34093,18.116765,5,150.88


In [39]:
# Number of points recorded for every shape, as a (shape_id, count) table.
# `size()` counts rows per group in vectorised code, so no Python callback or
# per-group Series has to be built for each shape_id.
df_l = df.groupby('shape_id').size().reset_index(name='count')

ProgrammingError: (psycopg2.errors.UndefinedColumn) column "index" of relation "shapes" does not exist
LINE 1: INSERT INTO gtfs.shapes (index, shape_id, shape_pt_lat, shap...
                                 ^

[SQL: INSERT INTO gtfs.shapes (index, shape_id, shape_pt_lat, shape_pt_lon, shape_pt_sequence, shape_dist_traveled) VALUES (%(index)s, %(shape_id)s, %(shape_pt_lat)s, %(shape_pt_lon)s, %(shape_pt_sequence)s, %(shape_dist_traveled)s)]
[parameters: ({'index': 0, 'shape_id': 1014010000482329256, 'shape_pt_lat': 59.341873, 'shape_pt_lon': 18.118316, 'shape_pt_sequence': 1, 'shape_dist_traveled': 0.0}, {'index': 1, 'shape_id': 1014010000482329256, 'shape_pt_lat': 59.341854, 'shape_pt_lon': 18.118403, 'shape_pt_sequence': 2, 'shape_dist_traveled': 0.0}, {'index': 2, 'shape_id': 1014010000482329256, 'shape_pt_lat': 59.341754, 'shape_pt_lon': 18.118334, 'shape_pt_sequence': 3, 'shape_dist_traveled': 11.74}, {'index': 3, 'shape_id': 1014010000482329256, 'shape_pt_lat': 59.34126, 'shape_pt_lon': 18.118048, 'shape_pt_sequence': 4, 'shape_dist_traveled': 69.17}, {'index': 4, 'shape_id': 1014010000482329256, 'shape_pt_lat': 59.34093, 'shape_pt_lon': 18.116765, 'shape_pt_sequence': 5, 'shape_dist_traveled': 150.88}, {'index': 5, 'shape_id': 1014010000482329256, 'shape_pt_lat': 59.340944, 'shape_pt_lon': 18.116479, 'shape_pt_sequence': 6, 'shape_dist_traveled': 167.29}, {'index': 6, 'shape_id': 1014010000482329256, 'shape_pt_lat': 59.340572, 'shape_pt_lon': 18.115033, 'shape_pt_sequence': 7, 'shape_dist_traveled': 259.42}, {'index': 7, 'shape_id': 1014010000482329256, 'shape_pt_lat': 59.340453, 'shape_pt_lon': 18.11487, 'shape_pt_sequence': 8, 'shape_dist_traveled': 275.54}  ... displaying 10 of 56498931 total bound parameter sets ...  {'index': 56498940, 'shape_id': 1747400000000001242, 'shape_pt_lat': 57.536394, 'shape_pt_lon': 13.353887, 'shape_pt_sequence': 1, 'shape_dist_traveled': None}, {'index': 56498941, 'shape_id': 1747400000000001242, 'shape_pt_lat': 57.709053, 'shape_pt_lon': 11.973528, 'shape_pt_sequence': 2, 'shape_dist_traveled': None})]
(Background on this error at: http://sqlalche.me/e/f405)

In [41]:
# Remove every shape that consists of a single point, using the per-shape
# point counts in df_l.
single_point_ids = df_l.loc[df_l['count'] == 1, 'shape_id']
keep_rows = ~df['shape_id'].isin(single_point_ids)
df = df.loc[keep_rows, :]
df.head()

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
0,1014010000482329256,59.341873,18.118316,1,0.0
1,1014010000482329256,59.341854,18.118403,2,0.0
2,1014010000482329256,59.341754,18.118334,3,11.74
3,1014010000482329256,59.34126,18.118048,4,69.17
4,1014010000482329256,59.34093,18.116765,5,150.88


In [43]:
# Append the cleaned shape points to the table gtfs.shapes.
# The DataFrame index is not written, and rows are sent as multi-row INSERTs
# in batches of 10,000.
df.to_sql(
    name="shapes",
    schema="gtfs",
    con=engine,
    if_exists='append',
    index=False,
    chunksize=10000,
    method='multi',
)

### 1.2 Run SQL script to process the rest of GTFS data
`src\gtfs2database.sql`

## 2. Node features

In [45]:
# Read the stop id, name and point geometry (aliased to `geom`) of every stop in gtfs.stops.
stops = gpd.GeoDataFrame.from_postgis(sql="""SELECT stop_id, stop_name, stop_geom as geom FROM gtfs.stops;""", con=engine)

In [46]:
# Preview the first ten stops.
stops.head(10)

Unnamed: 0,stop_id,stop_name,geom
0,360,Lycksele Resecentrum,POINT (18.66999 64.59474)
1,361,Södra station,POINT (15.20300 59.26957)
2,362,Åseda terminal,POINT (15.34841 57.16818)
3,363,Veddige station,POINT (12.33673 57.26764)
4,364,Årjäng Busstationen,POINT (12.13137 59.39441)
5,365,Rimforsa station,POINT (15.68136 58.13593)
6,366,Mörrum station,POINT (14.74409 56.18690)
7,367,Malung centrum,POINT (13.71080 60.68370)
8,368,Bräkne Hoby,POINT (15.11563 56.23080)
9,370,Bovallstrand,POINT (11.32722 58.47404)


### 2.1 Access to transit
Transit stations density (# of transit stations / hexagon area).

In [61]:
# Spatially join each stop to the hexagon it intersects. The inner join keeps
# only stops that match a hexagon.
stops_hex = gpd.sjoin(left_df=stops, right_df=gdf_hex, how='inner')
stops_hex.head()

Unnamed: 0,stop_id,stop_name,geometry,index_right,hex_id
9,370,Bovallstrand,POINT (11.32722 58.47404),1601,86099a29fffffff
33523,35659,Svenseröd,POINT (11.31546 58.44559),1601,86099a29fffffff
33526,35662,Uleberg,POINT (11.31705 58.45078),1601,86099a29fffffff
33528,35665,Hunnebo sjukhem,POINT (11.31323 58.43859),1601,86099a29fffffff
33529,35666,Ulebergs hamn,POINT (11.30141 58.45119),1601,86099a29fffffff


In [66]:
# Transit stop density per hexagon: number of stops divided by the hexagon
# area (hex_area, km^2).
# The division happens on the count Series before the frame is built, so no
# float values are written in place into an integer column (a deprecated
# upcasting assignment in recent pandas).
# NOTE: the column keeps the name `num_stops` because later cells refer to it,
# but it holds a density (stops per km^2), not a raw count.
stops_count = (
    stops_hex.groupby('hex_id')['stop_id']
    .count()
    .div(hex_area)
    .reset_index(name='num_stops')
)
stops_count.head()

Unnamed: 0,hex_id,num_stops
0,86088044fffffff,0.941071
1,860880497ffffff,1.660713
2,86088049fffffff,5.009817
3,8608804b7ffffff,1.107142
4,8608804d7ffffff,1.13482


## 3. Place features
### 3.1 Load and process building data

In [68]:
# Building footprints are stored in a file geodatabase.
gdb_file = "dbs/buildings/buildings.gdb"
layers = fiona.listlayers(gdb_file)
# Only the last listed layer is used downstream, so read just that one rather
# than loading every layer and discarding all but the last.
# NOTE(review): if the geodatabase holds several layers that should all be
# analysed, they would need to be concatenated instead — confirm.
layer = layers[-1]
gdf_bd = gpd.read_file(gdb_file, layer=layer)
gdf_bd.head()

Unnamed: 0,OBJEKT_ID,OBJEKT_VER,DETALJTYP,ADAT,INSAM_LAGE,XYFEL,NAMN1,NAMN2,NAMN3,HUVUDBYGGN,...,ANDAMAL_5,ANDAMAL_6,ANDAMAL_7,ANDAMAL_8,ANDAMAL_9,ANDAMAL_10,HUSNR,Shape_Length,Shape_Area,geometry
0,a9d891a3-c8b1-4c06-b4b1-98bb87a74227,1,HUS,2013-04-05 15:46,4,0,,,,,...,0,0,0,0,0,0,0,40.000489,96.00291,"MULTIPOLYGON (((484523.880 6443262.612, 484517..."
1,c7480e52-7a02-4b6c-a4b4-4cf8c23b1cee,2,HUS,2014-08-05 09:29,4,0,,,,,...,0,0,0,0,0,0,0,44.269637,113.079799,"MULTIPOLYGON (((512324.935 6410856.541, 512332..."
2,dddf8f86-11c6-4523-9364-819a86490a33,2,HUS,2015-06-16 13:45,4,0,,,,,...,0,0,0,0,0,0,0,39.999404,95.997242,"MULTIPOLYGON (((479067.426 6460115.242, 479059..."
3,7ece0648-b37b-4415-8490-ecbaaefd2b5f,1,HUS,2015-08-10 15:52,4,0,,,,,...,0,0,0,0,0,0,0,40.0,96.0,"MULTIPOLYGON (((506869.301 6463163.995, 506869..."
4,ec75acda-ce48-4abb-8dc1-9a315deec9af,1,HUS,2015-09-01 08:19,4,0,,,,,...,0,0,0,0,0,0,0,40.0,96.0,"MULTIPOLYGON (((506837.407 6463046.775, 506837..."


In [70]:
# Source column -> analysis column name for the attributes that are kept.
column_names = {
    'OBJEKT_ID': 'building_id',
    'DETALJTYP': 'detail_code',
    'ANDAMAL_1': 'purpose',
    'ANDAMAL_1T': 'description',
    'Shape_Area': 'shape_area',
}
# Keep the selected attributes plus the geometry, then apply the new names.
gdf_bd = gdf_bd[list(column_names) + ['geometry']].rename(columns=column_names)
# Footprint area computed from the geometry; the same as shape_area in m^2.
gdf_bd['area'] = gdf_bd.geometry.area
gdf_bd.head()

Unnamed: 0,building_id,detail_code,purpose,description,shape_area,geometry,area
0,a9d891a3-c8b1-4c06-b4b1-98bb87a74227,HUS,130,Bostad; Sm hus friliggande,96.00291,"MULTIPOLYGON (((484523.880 6443262.612, 484517...",96.00291
1,c7480e52-7a02-4b6c-a4b4-4cf8c23b1cee,HUS,130,Bostad; Sm hus friliggande,113.079799,"MULTIPOLYGON (((512324.935 6410856.541, 512332...",113.079799
2,dddf8f86-11c6-4523-9364-819a86490a33,HUS,130,Bostad; Sm hus friliggande,95.997242,"MULTIPOLYGON (((479067.426 6460115.242, 479059...",95.997242
3,7ece0648-b37b-4415-8490-ecbaaefd2b5f,HUS,130,Bostad; Sm hus friliggande,96.0,"MULTIPOLYGON (((506869.301 6463163.995, 506869...",96.0
4,ec75acda-ce48-4abb-8dc1-9a315deec9af,HUS,130,Bostad; Sm hus friliggande,96.0,"MULTIPOLYGON (((506837.407 6463046.775, 506837...",96.0


In [72]:
# Store the cleaned building footprints in the table built_env.buildings.
# NOTE(review): no `if_exists` is passed, so re-running this cell presumably
# fails once the table exists — confirm against the geopandas default.
gdf_bd.to_postgis("buildings", schema='built_env', con=engine)

### 3.2 Building features - ground space index

In [88]:
# Intersect the hexagons (reprojected to EPSG:3006) with the building
# footprints, so that each resulting piece is the part of one building lying
# inside one hexagon. Rows with any missing value are discarded.
# NOTE(review): assumes gdf_bd is already in EPSG:3006 — confirm.
gdf_hex_bd = gpd.overlay(
    gdf_hex.to_crs(3006),
    gdf_bd[['building_id', 'geometry']],
    how='intersection',
).dropna(how='any')
gdf_hex_bd.head()

Unnamed: 0,hex_id,building_id,geometry
0,86088049fffffff,d25f6d4d-6643-4cd6-bfa0-9a02092ad3c3,"POLYGON ((704464.361 6631753.550, 704468.433 6..."
1,86088049fffffff,8add0ae5-17c1-4b7b-bafa-139fe94bea5d,"POLYGON ((704382.048 6631702.644, 704374.427 6..."
2,86088049fffffff,75f83635-6531-43bf-b3c2-6173fd75bfd0,"POLYGON ((704695.750 6630529.140, 704694.741 6..."
3,86088049fffffff,53337aeb-3bc4-4f64-9319-b60f3a25755f,"POLYGON ((706360.225 6632403.326, 706356.920 6..."
4,86088049fffffff,f0f53b80-6aea-46d0-abac-f74361fca47d,"POLYGON ((704704.705 6630602.938, 704710.002 6..."


In [89]:
# Area of every building piece inside its hexagon.
gdf_hex_bd['area'] = gdf_hex_bd.geometry.area
# Ground space index = summed footprint area / hexagon area. hex_area is in
# km^2, hence the factor 10**6 to bring it to the same unit as the geometry areas.
building_area = (
    gdf_hex_bd.groupby('hex_id')['area']
    .sum()
    .reset_index()
    .assign(gsi=lambda totals: totals['area'] / (hex_area * 10**6))
)
building_area.head()

Unnamed: 0,hex_id,area,gsi
0,86088044fffffff,139347.357502,0.003857
1,860880497ffffff,105163.539826,0.002911
2,86088049fffffff,597225.347238,0.01653
3,8608804b7ffffff,116566.159595,0.003226
4,8608804d7ffffff,160004.151506,0.004429


### 3.3 Building features - health care service count

In [90]:
# Purpose codes selected as health care buildings in this analysis.
health_care_purposes = [307, 318]
is_health_care = gdf_bd['purpose'].isin(health_care_purposes)
df_bd_hc = gdf_bd.loc[is_health_care, ['building_id', 'purpose']]
# Count the distinct health care buildings that overlap each hexagon.
df_bd_hc = (
    pd.merge(gdf_hex_bd, df_bd_hc, on='building_id', how='inner')
    .groupby('hex_id')['building_id']
    .nunique()
    .reset_index()
)
df_bd_hc.head()

Unnamed: 0,hex_id,building_id
0,86088049fffffff,3
1,860880737ffffff,1
2,860880787ffffff,1
3,860882877ffffff,1
4,860882b37ffffff,2


## 4. Road network

In [102]:
# Read the study area
# One record per hexagon, this time keeping the `deso_3` code that the
# bounding-box step uses to select each region's hexagons.
gdf_hex_rgs = (
    gpd.read_file('results/mobi_seg_spatio_static.geojson')
    .loc[:, ['hex_id', 'deso_3', 'geometry']]
    .drop_duplicates(subset=['hex_id'])
)

In [122]:
# Create bounding box for clipping OSM file
# deso_3 code -> region name used in the output file name.
region_by_deso = {'01': 'Stockholm', '12': 'Malmo', '14': 'Gothenburg'}
for deso_code, region_name in region_by_deso.items():
    # Rectangle spanning all hexagons of the region.
    in_region = gdf_hex_rgs['deso_3'] == deso_code
    geom = box(*gdf_hex_rgs.loc[in_region, :].total_bounds)
    # Single-row frame (dummy `box` attribute) holding the rectangle in EPSG:4326.
    gdf_r = gpd.GeoDataFrame({'box': [1]}, geometry=[geom], crs=4326)
    # NOTE(review): buffer is presumably in CRS units (degrees) — confirm in rhelpers.gdf2poly.
    rhelpers.gdf2poly(
        geodata=gdf_r,
        targetfile=f'dbs/geo/{region_name}_bounding.poly',
        buffer=0.03,
    )

## 5. Combine segregation measuring with built-environment features

In [97]:
# Load the full segregation results (all columns, including geometry) that the
# built-environment features are attached to.
df_seg = gpd.read_file('results/mobi_seg_spatio_static.geojson')
df_seg.head()

Unnamed: 0,weekday,holiday,hex_id,S_income,S_birth_region,S_background,Foreign background,Not Sweden,Lowest income group,S_income_h,S_birth_region_h,S_background_h,Foreign background_h,Not Sweden_h,Lowest income group_h,deso,deso_3,geometry
0,0,0,86088049fffffff,0.16,0.640468,0.38981,0.305095,0.239688,0.31,0.053333,0.839965,0.743769,0.128115,0.10669,0.23,0188A0040,1,"POLYGON ((18.60606 59.78091, 18.66705 59.78702..."
1,0,0,86088059fffffff,0.086667,0.697943,0.51995,0.240025,0.201372,0.26,0.033333,0.766802,0.624291,0.187854,0.155466,0.25,0188A0080,1,"POLYGON ((18.32160 59.77558, 18.38253 59.78182..."
2,0,0,86088071fffffff,0.084969,0.856739,0.736124,0.129286,0.095226,0.25,0.12,0.88206,0.803987,0.098007,0.078627,0.3,0188A0180,1,"POLYGON ((18.74973 60.10015, 18.81113 60.10617..."
3,0,0,860880737ffffff,0.253333,0.664582,0.465207,0.267396,0.223612,0.36,0.06,0.851434,0.746308,0.126846,0.099044,0.27,0188A0170,1,"POLYGON ((18.56566 60.08194, 18.62700 60.08804..."
4,0,0,860880787ffffff,0.16,0.795806,0.672258,0.163871,0.136129,0.3,0.12,0.88206,0.803987,0.098007,0.078627,0.3,0188A0180,1,"POLYGON ((18.79018 60.01053, 18.85148 60.01653..."


In [98]:
# Built-environment features attached to the segregation results.
feature_cols = ['num_stops', 'gsi', 'hc_count']
# Gather the three per-hexagon features in one table. The outer joins keep a
# hexagon if it appears in at least one of the feature tables.
df_b = (
    stops_count
    .merge(building_area.drop(columns='area'), on='hex_id', how='outer')
    .merge(df_bd_hc.rename(columns={'building_id': 'hc_count'}), on='hex_id', how='outer')
)
# Attach the features to every segregation record (left join keeps all records).
df_b = pd.merge(df_seg, df_b, on='hex_id', how='left')
# A missing feature means "none observed". Fill AFTER the left join so that
# hexagons absent from all three feature tables also get 0 instead of NaN.
df_b[feature_cols] = df_b[feature_cols].fillna(0)
df_b.head()

Unnamed: 0,weekday,holiday,hex_id,S_income,S_birth_region,S_background,Foreign background,Not Sweden,Lowest income group,S_income_h,...,S_background_h,Foreign background_h,Not Sweden_h,Lowest income group_h,deso,deso_3,geometry,num_stops,gsi,hc_count
0,0,0,86088049fffffff,0.16,0.640468,0.38981,0.305095,0.239688,0.31,0.053333,...,0.743769,0.128115,0.10669,0.23,0188A0040,1,"POLYGON ((18.60606 59.78091, 18.66705 59.78702...",5.009817,0.01653,3.0
1,0,0,86088059fffffff,0.086667,0.697943,0.51995,0.240025,0.201372,0.26,0.033333,...,0.624291,0.187854,0.155466,0.25,0188A0080,1,"POLYGON ((18.32160 59.77558, 18.38253 59.78182...",2.158927,0.011955,0.0
2,0,0,86088071fffffff,0.084969,0.856739,0.736124,0.129286,0.095226,0.25,0.12,...,0.803987,0.098007,0.078627,0.3,0188A0180,1,"POLYGON ((18.74973 60.10015, 18.81113 60.10617...",1.024106,0.006754,0.0
3,0,0,860880737ffffff,0.253333,0.664582,0.465207,0.267396,0.223612,0.36,0.06,...,0.746308,0.126846,0.099044,0.27,0188A0170,1,"POLYGON ((18.56566 60.08194, 18.62700 60.08804...",2.629462,0.011788,1.0
4,0,0,860880787ffffff,0.16,0.795806,0.672258,0.163871,0.136129,0.3,0.12,...,0.803987,0.098007,0.078627,0.3,0188A0180,1,"POLYGON ((18.79018 60.01053, 18.85148 60.01653...",2.131248,0.006873,1.0


### 5.1 Add DeSO population count

In [100]:
# Population per DeSO zone: keep the rows whose background is 'Total' and
# expose the region code and count as `deso` and `pop`.
df_deso = (
    pd.read_csv("dbs/DeSO/background_2019.csv")
    .loc[lambda table: table['background'] == 'Total', ['region', 'count']]
    .rename(columns={'region': 'deso', 'count': 'pop'})
)
# Attach the population to every record via its DeSO code.
df_b = pd.merge(df_b, df_deso, on='deso', how='left')

In [101]:
# Write the segregation results enriched with built-environment features and
# population to a GeoJSON file.
df_b.to_file('results/mobi_seg_spatio_static_built_env.geojson', driver='GeoJSON')