## Overview

### Structure

The pipeline is as follows:

Download the city GML files -> Convert to feather files -> Divide into grids -> Calculate UMP for each grid -> Save as y

Loop through each grid, download the Sentinel imagery, then store it and write it out as a tensor (X)

### Datasets

- X:
    - Sentinel
- Y:
    - Tokyo (Japan, 2021) https://www.geospatial.jp/ckan/dataset/plateau-tokyo23ku/resource/0bab2b7f-6962-41c8-872f-66ad9b40dcb1?inner_span=True
    - Osaka (Japan, 2021) — same source as Tokyo above
    - New York (USA, 2019) https://github.com/opencitymodel/opencitymodel 

## Import Libraries

In [1]:
import geopandas as gpd
import pandas as pd
from glob import glob
import fiona
import os.path
from multiprocessing import Pool
from itertools import repeat

## Convert GML to feather

### Function Definitions

Convert all of the GML files in a folder into a single feather file

In [5]:
def _log_failed_gml(in_path, error, log_name):
    """
    Prints the error for a gml file that could not be converted and records its path\n
    The path is appended to logs/{log_name}.txt; nothing is written when log_name is None\n
    """
    print(f"{error}: {os.path.basename(in_path)}")
    if log_name is not None:
        # exist_ok avoids a crash when several pool workers create the folder at the same time
        os.makedirs("logs", exist_ok= True)
        with open(f"logs/{log_name}.txt", "a") as f:
            f.write(in_path + "\n")


def gml_to_feather(in_path, out_path, mode= None, log_name= "gml_convert", src_crs= "EPSG:6668", tgt_src= "EPSG:3857"):
    """
    Takes in a gml file and outputs it as a feather file\n
    W/R with feather files is much faster and takes up much less space than using shp files\n
    # Parameters:\n
    - in_path: The path for the gml file\n
    - out_path: The output path for the feather file\n
    - mode: What to do when a file already exists at out_path
        - 'o' = overwrites any file at output path, \n
        - 'a' = appends the new rows to the rows already stored at output path, \n
        - None = raises error if file already exists\n
    - log_name: Failed input paths are appended to logs/{log_name}.txt, None disables the log file\n
    - src_crs: Source projection\n
    - tgt_src: Target projection\n
    # Returns:\n
    - 0 when the feather file was written\n
    - Otherwise the number of rows held when the conversion failed (nothing is written in that case)\n
    # Raises:\n
    - FileExistsError: out_path already exists and mode is neither 'o' nor 'a'\n
    """
    # Extracts features
    with fiona.open(in_path, 'r') as src:
        features = list(src)

    # Converts and places it in geopandas format
    # Some gml files have no measuredHeight column, those are logged and skipped
    gdf = gpd.GeoDataFrame.from_features(features)
    try:
        gdf = gdf[['measuredHeight', 'geometry']]
    except KeyError as e:
        _log_failed_gml(in_path, e, log_name)
        return len(gdf)
    gdf = gdf.rename(columns={'measuredHeight': 'height'})

    # Remove the NaN values
    gdf = gdf.dropna().reset_index(drop= True)

    # Convert to the target projection and reduce each multi polygon to a single polygon
    # slice(0) is slice(None, 0) and .loc label slices are inclusive, so only part 0 of
    # every exploded geometry is kept (the remaining parts are treated as negligible)
    # This step has been seen to raise for some tiles, so any failure is logged and the tile skipped
    try:
        gdf = (
            gdf.explode(index_parts= True)
            .set_crs(src_crs)
            .to_crs(tgt_src)
            .loc[(slice(None), slice(0)), :]
            .reset_index(drop= True)
        )
    except Exception as e:
        _log_failed_gml(in_path, e, log_name)
        return len(gdf)

    # Convert coordinates from 3D to 2D through a WKB round trip
    # The projection is passed along explicitly, otherwise the round trip would drop it
    flat_geometry = gpd.GeoSeries.from_wkb(gdf.to_wkb(output_dimension= 2)["geometry"], crs= gdf.crs)
    attributes = pd.DataFrame(gdf.drop(columns= ["geometry"]))
    gdf = gpd.GeoDataFrame(attributes, geometry= flat_geometry)

    # Make sure the parent directory exists (out_path may also be a bare file name)
    parent_dir = os.path.dirname(out_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok= True)

    # Outputs to the desired path
    if os.path.exists(out_path):
        if mode == "a":
            # to_feather cannot append, so merge with the stored rows and rewrite the file
            existing = gpd.read_feather(out_path)
            gdf = gpd.GeoDataFrame(pd.concat([existing, gdf], ignore_index= True))
        elif mode != "o":
            raise FileExistsError("Output path already exists")
    gdf.to_feather(out_path)

    return 0

def batch_gml_to_feather(in_dir, out_path, n_processes= 12, log_name= None, mode= None, src_crs= "EPSG:6668", tgt_src= "EPSG:3857"):
    """
    Converts every gml file of a folder in parallel and merges the results into one feather file\n
    Each gml file is first written to {in_dir}/temp as its own feather file, the temp files are
    then concatenated, written to out_path and deleted\n
    # Parameters:\n
    - in_dir: Folder holding the *.gml files\n
    - out_path: The output path for the merged feather file\n
    - n_processes: Number of worker processes used for the conversion\n
    - log_name: Forwarded to gml_to_feather, failed input paths go to logs/{log_name}.txt\n
    - mode: Forwarded to gml_to_feather, applies to the temp files\n
    - src_crs: Source projection\n
    - tgt_src: Target projection\n
    # Returns:\n
    - The merged GeoDataFrame\n
    # Raises:\n
    - ValueError: no gml file could be converted, so there is nothing to merge\n
    """
    # Get all the paths of the gml files
    in_paths = glob(f"{in_dir}/*.gml")
    print("Total input files:", len(in_paths))

    # One temp feather file per gml file, only the extension is swapped
    temp_paths = [f"{in_dir}/temp/{os.path.splitext(os.path.basename(path))[0]}.feather" for path in in_paths]

    # Reads the gml file and extract features
    with Pool(processes= n_processes) as pool:
        r = pool.starmap(
            gml_to_feather,
            zip(in_paths,
                temp_paths,
                repeat(mode),
                repeat(log_name),
                repeat(src_crs),
                repeat(tgt_src)))

    # Check for invalid buildings
    n_failed_files = sum(1 for n_invalid in r if n_invalid > 0)
    print(f"There are {sum(r)} invalid buildings from {n_failed_files} files")

    # Only merge what this run produced, leftovers of an earlier run must not leak into the result
    # The existence check covers failed files that reported 0 rows and therefore wrote nothing
    merge_paths = [path for path, n_invalid in zip(temp_paths, r) if n_invalid == 0 and os.path.exists(path)]
    print("Total files to merge:", len(merge_paths))
    if not merge_paths:
        raise ValueError(f"No converted files to merge from {in_dir}")

    gdfs = [gpd.read_feather(path) for path in merge_paths]
    gdf = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index= True))

    # Make sure the folder of the merged file exists before writing
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok= True)
    gdf.to_feather(out_path)

    # Clean up every temp feather file, including leftovers from earlier runs
    for temp_file in glob(f"{in_dir}/temp/*.feather"):
        os.remove(temp_file)

    return gdf

### Tokyo

In [6]:
# Tokyo: convert every building gml tile and merge them into one feather file
# Existing per-tile outputs are overwritten, failed tiles are listed in logs/tokyo.txt
in_dir = "data/13100_tokyo23-ku_2020_citygml_3_2_op/udx/bldg"
out_path = "data/full_Tokyo_plateau/tokyo_full.feather"

batch_gml_to_feather(in_dir= in_dir, out_path= out_path, log_name= "tokyo", mode= "o")

Total input files: 671
cannot do slice indexing on Index with these indexers [0] of type int: 53392642_bldg_6697_2_op.gml
cannot do slice indexing on Index with these indexers [0] of type int: 53392641_bldg_6697_2_op.gml
cannot do slice indexing on Index with these indexers [0] of type int: 53392633_bldg_6697_2_op.gml
cannot do slice indexing on Index with these indexers [0] of type int: 53393631_bldg_6697_2_op.gml
cannot do slice indexing on Index with these indexers [0] of type int: 53392663_bldg_6697_2_op.gml
cannot do slice indexing on Index with these indexers [0] of type int: 53392653_bldg_6697_2_op.gml
cannot do slice indexing on Index with these indexers [0] of type int: 53393671_bldg_6697_2_op.gml
cannot do slice indexing on Index with these indexers [0] of type int: 53392651_bldg_6697_2_op.gml
cannot do slice indexing on Index with these indexers [0] of type int: 53392662_bldg_6697_2_op.gml
cannot do slice indexing on Index with these indexers [0] of type int: 53393683_bldg_6

Unnamed: 0,height,geometry
0,6.1,"POLYGON ((15565422.798 4245286.909, 15565416.2..."
1,3.0,"POLYGON ((15565401.386 4245273.087, 15565398.9..."
2,3.5,"POLYGON ((15563817.297 4264833.695, 15563811.6..."
3,11.8,"POLYGON ((15563065.418 4265104.495, 15563064.8..."
4,2.5,"POLYGON ((15563458.540 4264729.005, 15563431.3..."
...,...,...
1735748,8.1,"POLYGON ((15547325.623 4251960.079, 15547330.5..."
1735749,6.3,"POLYGON ((15547878.824 4251983.939, 15547872.9..."
1735750,8.2,"POLYGON ((15548373.420 4252264.298, 15548372.5..."
1735751,12.6,"POLYGON ((15548263.540 4251615.712, 15548262.3..."


### Osaka

In [7]:
# Osaka: convert every building gml tile and merge them into one feather file
# Existing per-tile outputs are overwritten, failed tiles are listed in logs/osaka.txt
in_dir = "data/osaka/udx/bldg"
out_path = "data/osaka/osaka_full.feather"

batch_gml_to_feather(in_dir= in_dir, out_path= out_path, log_name= "osaka", mode= "o")

Total input files: 269
"['measuredHeight'] not in index": 51357370_bldg_6697_op.gml
"['measuredHeight'] not in index": 52350389_bldg_6697_op.gml
"['measuredHeight'] not in index": 52350378_bldg_6697_op.gml
"['measuredHeight'] not in index": 52350379_bldg_6697_op.gml
"['measuredHeight'] not in index": 52350368_bldg_6697_op.gml
"['measuredHeight'] not in index": 52350422_bldg_6697_op.gml
"['measuredHeight'] not in index": 52350480_bldg_6697_op.gml
"['measuredHeight'] not in index": 52350470_bldg_6697_op.gml
There are 12 invalid buildings from 8 files
Total files to merge: 261


Unnamed: 0,height,geometry
0,4.4,"POLYGON ((15072751.097 4117217.254, 15072744.0..."
1,5.2,"POLYGON ((15072760.795 4117204.681, 15072751.4..."
2,11.5,"POLYGON ((15072609.643 4114039.178, 15072607.8..."
3,6.2,"POLYGON ((15072558.583 4113986.948, 15072558.0..."
4,6.3,"POLYGON ((15072645.993 4114000.366, 15072628.9..."
...,...,...
544275,9.0,"POLYGON ((15089601.610 4130609.416, 15089594.1..."
544276,9.1,"POLYGON ((15089727.284 4130200.798, 15089723.3..."
544277,5.6,"POLYGON ((15089513.115 4130506.862, 15089518.6..."
544278,8.9,"POLYGON ((15089905.222 4130629.136, 15089897.1..."


### New York

In [8]:
# New York: the source coordinates are declared as EPSG:4326 instead of the function default
src_crs = "EPSG:4326"
in_dir = "data/NewYork_2"
out_path = "data/NewYork_2/new_york.feather"

batch_gml_to_feather(in_dir= in_dir, out_path= out_path, log_name= "new_york", mode= "o", src_crs= src_crs)

Total input files: 170
There are 0 invalid buildings from 0 files
Total files to merge: 170


Unnamed: 0,height,geometry
0,5.73,"POLYGON ((-8209085.418 5566790.279, -8209067.0..."
1,5.73,"POLYGON ((-8209070.390 5566764.459, -8209066.2..."
2,4.38,"POLYGON ((-8209005.825 5566671.511, -8208999.0..."
3,5.73,"POLYGON ((-8209225.236 5566628.949, -8209213.3..."
4,5.73,"POLYGON ((-8209101.448 5566623.003, -8209085.8..."
...,...,...
5716434,4.38,"POLYGON ((-8215328.048 5027787.294, -8215324.3..."
5716435,4.74,"POLYGON ((-8215264.930 5027796.217, -8215262.7..."
5716436,5.17,"POLYGON ((-8215139.829 5027478.327, -8215138.1..."
5716437,5.45,"POLYGON ((-8215386.792 5027729.412, -8215384.6..."


## Examining problem files

In [3]:
# Load every gml file that failed during the Tokyo conversion to inspect its content
with open("logs/tokyo.txt") as log_file:
    logged_paths = log_file.read().splitlines()

# The log is written in append mode and ends with a newline, so drop blank
# entries and repeated paths while keeping the original order
files = list(dict.fromkeys(path for path in logged_paths if path.strip()))
problem_gdfs = []

# The loop variable is kept as `f`: a later cell re-parses the last problem file through it
for f in files:
    # Extracts features
    with fiona.open(f, 'r') as src:
        features = list(src)

    problem_gdfs.append(gpd.GeoDataFrame.from_features(features))

problem_gdfs = gpd.GeoDataFrame(pd.concat(problem_gdfs)).reset_index(drop= True)
problem_gdfs = problem_gdfs[["geometry", "measuredHeight"]]

problem_gdfs

Unnamed: 0,geometry,measuredHeight
0,MULTIPOLYGON EMPTY,2.8
1,MULTIPOLYGON EMPTY,3.6
2,MULTIPOLYGON EMPTY,3.6
3,MULTIPOLYGON EMPTY,16.1
4,MULTIPOLYGON EMPTY,9.3
...,...,...
3649,MULTIPOLYGON EMPTY,6.2
3650,MULTIPOLYGON EMPTY,42.2
3651,MULTIPOLYGON EMPTY,10.3
3652,MULTIPOLYGON EMPTY,19.3


In [27]:
import xml.dom.minidom

# Read the raw XML of the file path currently bound to `f` and walk down to the
# coordinate list of the first polygon of the first LOD1 solid
doc = xml.dom.minidom.parse(f)
first_solid = doc.getElementsByTagName("bldg:lod1Solid")[0]
first_polygon = first_solid.getElementsByTagName("gml:Polygon")[0]
first_pos_list = first_polygon.getElementsByTagName("gml:posList")[0]

# The position list is one space separated string, split it into its values
coords = first_pos_list.childNodes[0].nodeValue.split(" ")

coords

'35.538511979563104 139.77765001365444 3.189 35.53849370611842 139.77761010768472 3.189 35.538521478453596 139.77758954853826 3.189 35.538530785801086 139.77758275206293 3.189 35.53857665737096 139.77754891348948 3.189 35.53857689467651 139.777549430546 3.189 35.5385884908272 139.7775410499349 3.189 35.538587371071735 139.7775386145889 3.189 35.53859090520818 139.7775361782256 3.189 35.53861053745228 139.77757884694498 3.189 35.538607003314475 139.7775812822042 3.189 35.53860674525613 139.77758071994626 3.189 35.538564738145176 139.77761107976545 3.189 35.538511979563104 139.77765001365444 3.189'

In [4]:
# Load every gml file that failed during the Osaka conversion to inspect its content
with open("logs/osaka.txt") as log_file:
    logged_paths = log_file.read().splitlines()

# The log is written in append mode and ends with a newline, so drop blank
# entries and repeated paths while keeping the original order
files = list(dict.fromkeys(path for path in logged_paths if path.strip()))
problem_gdfs = []

for f in files:
    # Extracts features
    with fiona.open(f, 'r') as src:
        features = list(src)

    problem_gdfs.append(gpd.GeoDataFrame.from_features(features))

problem_gdfs = gpd.GeoDataFrame(pd.concat(problem_gdfs)).reset_index(drop= True)
# All columns are kept here; the column selection used for Tokyo is left disabled
# problem_gdfs = problem_gdfs[["geometry", "measuredHeight"]]
problem_gdfs

Unnamed: 0,geometry,gml_id,建物ID,枝番,prefecture,city,key,codeValue,theme,imageURI,mimeType
0,"MULTIPOLYGON Z (((135.37611 34.64792 1.25830, ...",BLD_4e8cebca-8759-4283-9f2c-099eeac4a7ae,27100-bldg-1,1.0,27.0,27100.0,2.0,2.0,,,
1,"MULTIPOLYGON Z (((135.37730 34.64591 2.22380, ...",BLD_96cc393f-efde-464e-8e90-129c4ccad2ef,27100-bldg-3,1.0,27.0,27100.0,2.0,2.0,,,
2,"MULTIPOLYGON Z (((135.37911 34.64358 3.14470, ...",BLD_80a14177-1694-4eab-b843-7c527b887402,27100-bldg-4,1.0,27.0,27100.0,2.0,2.0,,,
3,"MULTIPOLYGON Z (((135.37907 34.64354 3.28320, ...",BLD_9f6e5948-6968-4192-87aa-458ebc269f55,27100-bldg-5,1.0,27.0,27100.0,2.0,2.0,,,
4,"MULTIPOLYGON Z (((135.37687 34.64791 3.55590, ...",BLD_a3b46c03-5a24-4494-bf90-f07d3738e228,27100-bldg-2,1.0,27.0,27100.0,2.0,2.0,,,
5,,fme-gen-ece9cd30-11f8-492b-9a2d-ad6153a7f88a,,,,,,,rgbTexture,,
6,,fme-gen-ebed22fd-125f-45b4-815d-9080d68b9498,,,,,,,rgbTexture,,
7,,fme-gen-d9f03ada-8ffd-418f-afb2-067ab5012918,,,,,,,rgbTexture,,
8,,fme-gen-6e1841d6-676a-4d90-b39d-a70ee4691c0f,,,,,,,rgbTexture,,
9,,fme-gen-0dc34428-8b95-46c5-8dd9-a976f8c3f919,,,,,,,rgbTexture,52350422_bldg_6697_appearance/27100-bldg-32887...,image/jpg


Unnamed: 0,geometry,measuredHeight


## Divide into grids

### Load files

In [None]:
# Tokyo


# Osaka

# New York

### Extract Points from datasets

### Get concave hulls that are bounding the dataset

### Divide into grids based on min and max point, then filter out valid grids

## Calculate UMP and export as y

## Download Sentinel and export as X