# Calculate tree visibility statistics per district

[![colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ac-willeke/urban-climate/blob/main/notebooks/01_FROST_extract_climate_data.ipynb) [![github](https://img.shields.io/badge/GitHub-View%20on%20GitHub-blue?logo=github)](https://github.com/ac-willeke/)

**Author**: Willeke A'Campo

**Description:** This notebook shows how to calculate the Ecosystem Service statistics for tree visibility and impact per district using DuckDB. The results are stored in a new table in the database and exported to GeoJSON.

**Documentation:** 

### Setup

In [1]:
import geopandas as gpd
import os
import glob # for finding files

# Set the temp dir to a network drive to avoid local disk space issues
os.environ['TMPDIR'] = r"/home/NINA.NO/willeke.acampo/Mounts/P-Prosjekter2/152022_itree_eco_ifront_synliggjore_trars_rolle_i_okosyst/TEMP"

# TODO move to kedro pipeline
municipality = "oslo"
TEMP_DIR = os.environ['TMPDIR']

# Data folders of the selected municipality. They are derived from
# `municipality` so that switching to another city only needs one change.
raw_dir = os.path.join(TEMP_DIR, municipality, "01_raw")
interim_dir = os.path.join(TEMP_DIR, municipality, "02_intermediate")
reporting_dir = os.path.join(TEMP_DIR, municipality, "08_reporting")

# Table names: used as keys for the loaded GeoDataFrames
table_names = [
    "study_area",
    "districts",
    "bldg",
    "res_bldg",
    "green_space",
    "open_space",
    "public_open_space",
    "private_open_space",
    "tree_crowns",
]

# File names (without extension): "<municipality>_<table_name>".
# Built from table_names so both lists always have the same length and
# order; later cells zip them together.
file_names = [f"{municipality}_{table}" for table in table_names]

# Raw GeoJSON holding the district polygons
district_geojson = os.path.join(raw_dir, f"{municipality}_districts.geojson")

**get district list**

In [2]:
# Read the district polygons from the raw GeoJSON file
districts = gpd.read_file(district_geojson)

# Collect the distinct district (delomrade) numbers in ascending order
district_list = sorted(districts['delomradenummer'].unique())

# Report how many districts were found and list their numbers
print(f"Number of districts: {len(district_list)} \n")
print(f"Districts: {district_list}")

Number of districts: 60 

Districts: [30101, 30102, 30103, 30104, 30105, 30106, 30107, 30108, 30109, 30110, 30111, 30112, 30113, 30114, 30115, 30116, 30117, 30118, 30119, 30120, 30121, 30122, 30123, 30124, 30125, 30126, 30127, 30128, 30129, 30130, 30131, 30132, 30133, 30134, 30135, 30136, 30137, 30138, 30139, 30140, 30141, 30142, 30143, 30144, 30145, 30146, 30147, 30148, 30149, 30150, 30151, 30152, 30153, 30154, 30155, 30156, 30157, 30158, 30159, 30160]


### Data conversion | GeoJSON to GeoParquet   

Load the data into GeoPandas GeoDataFrames from Parquet files. If a Parquet file does not exist, it is created from the raw GeoJSON or Shapefile.

Checks:
- crs = "EPSG:25832"
- areas < 1 m2 are deleted (except for the study area and the tree crown polygons)

In [3]:
# {file_name: path of its GeoParquet file in the interim directory}
parquet_dict = dict(
    (name, os.path.join(interim_dir, f"{name}.parquet"))
    for name in file_names
)

# Loaded GeoDataFrames, keyed by table name: {table_name: gdf}
gdf_dict = dict()

def process_gdf(gdf, tbl):
    """Reproject a GeoDataFrame to EPSG:25832 and drop tiny polygons.

    The frame is reprojected *before* the area filter is applied, so the
    1 m2 threshold is evaluated in the target CRS (EPSG:25832) rather than
    in whatever CRS the raw file was delivered in.

    Args:
        gdf: GeoDataFrame to clean. It must have a CRS defined.
        tbl: Table name. Rows of "study_area" and "tree_crowns" are never
            removed; for every other table rows with an area of 1 or less
            are dropped.

    Returns:
        The cleaned GeoDataFrame in EPSG:25832.

    Raises:
        ValueError: If ``gdf`` has no CRS, since it can then neither be
            checked nor reprojected.
    """
    if gdf.crs is None:
        raise ValueError(f"{tbl} has no CRS defined, cannot reproject to epsg:25832")

    # Reproject first so that the area threshold below is applied in the
    # target CRS.
    if gdf.crs.to_epsg() != 25832:
        print(f"Reprojecting {tbl} to epsg:25832")
        gdf = gdf.to_crs(epsg=25832)

    # The study area and the tree crowns keep all their polygons
    if tbl in ("study_area", "tree_crowns"):
        print(f"Areas < 1m2 are not removed from {tbl}")
        return gdf

    len_before = len(gdf)
    gdf = gdf[gdf.area > 1]
    print(f"Removed {len_before - len(gdf)} rows from {tbl}")
    return gdf

# Load every table from its GeoParquet file; when the Parquet file is
# missing it is first created from the raw .geojson / .shp file.
for file_name, tbl in zip(parquet_dict.keys(), table_names):
    parquet_file = parquet_dict[file_name]

    # If Parquet file does not exist
    if not os.path.exists(parquet_file):
        # collect the matching raw files (.geojson first, then .shp)
        raw_files = [
            path
            for ext in ("geojson", "shp")
            for path in glob.glob(os.path.join(raw_dir, f"*{file_name}.{ext}"))
        ]

        # Fail early with a clear message instead of letting read_parquet
        # below fail on a file that was never created.
        if not raw_files:
            raise FileNotFoundError(
                f"No Parquet file {parquet_file} and no raw .geojson/.shp "
                f"file matching '*{file_name}' in {raw_dir}"
            )

        # NOTE: when both a .geojson and a .shp exist, each is converted in
        # turn and the last one written wins.
        for raw_file in raw_files:
            gdf = gpd.read_file(raw_file)
            gdf = process_gdf(gdf, tbl)

            # save to parquet
            print(f"Converting {raw_file} to parquet")

            # tree_crowns uses "nb_code" / "area_code"; rename them to the
            # column names used by the other tables
            if tbl == "tree_crowns":
                gdf = gdf.rename(
                    columns={"nb_code": "grunnkretsnummer", "area_code": "delomradenummer"}
                )
                display(gdf.head())

            gdf.to_parquet(parquet_file, index=None, compression="snappy")

    # Open Parquet file
    gdf = gpd.read_parquet(parquet_file)
    print(f"Loaded {parquet_file}")

    # add to gdf_dict
    gdf_dict[tbl] = gdf

Loaded /home/NINA.NO/willeke.acampo/Mounts/P-Prosjekter2/152022_itree_eco_ifront_synliggjore_trars_rolle_i_okosyst/TEMP/oslo/02_intermediate/oslo_study_area.parquet
Loaded /home/NINA.NO/willeke.acampo/Mounts/P-Prosjekter2/152022_itree_eco_ifront_synliggjore_trars_rolle_i_okosyst/TEMP/oslo/02_intermediate/oslo_districts.parquet
Loaded /home/NINA.NO/willeke.acampo/Mounts/P-Prosjekter2/152022_itree_eco_ifront_synliggjore_trars_rolle_i_okosyst/TEMP/oslo/02_intermediate/oslo_bldg.parquet
Loaded /home/NINA.NO/willeke.acampo/Mounts/P-Prosjekter2/152022_itree_eco_ifront_synliggjore_trars_rolle_i_okosyst/TEMP/oslo/02_intermediate/oslo_res_bldg.parquet
Loaded /home/NINA.NO/willeke.acampo/Mounts/P-Prosjekter2/152022_itree_eco_ifront_synliggjore_trars_rolle_i_okosyst/TEMP/oslo/02_intermediate/oslo_green_space.parquet
Loaded /home/NINA.NO/willeke.acampo/Mounts/P-Prosjekter2/152022_itree_eco_ifront_synliggjore_trars_rolle_i_okosyst/TEMP/oslo/02_intermediate/oslo_open_space.parquet
Loaded /home/NINA.

Unnamed: 0,OBJECTID,tree_id,crown_id_2014,crown_id_2021,itree_spec,teig_undervisning,crown_origin,norwegian_name,taxon_genus,common_name,...,totben_cap_2014,totben_cap_2017,totben_cap,Shape_Length,Shape_Area,TB_CAP,TB_CAP_CA,totben_cap2014,totben_cap_crown_area,geometry
0,1,1938.0,105123.0,194,1,0,laserdata 2021,Løvtre,Sorbus,Whitebeam,...,29.3248,30.79803,1886.487974,34.999994,44.749985,6812.57031,75.828293,6813.0,76.0,"POLYGON ((600238.600 6643856.750, 600238.100 6..."
1,2,1940.0,101388.0,212,1,0,laserdata 2021,Løvtre,Sorbus,European mountain ash,...,33.757,35.52469,2168.098459,48.999992,64.999979,6140.44043,110.882858,6140.0,111.0,"POLYGON ((600217.600 6643845.750, 600215.600 6..."
2,3,1936.0,101354.0,276,1,0,laserdata 2021,Løvtre,Populus,cottonwood spp,...,2.0083,2.03625,106.038059,45.999992,80.249974,14302.9336,83.836853,14303.0,84.0,"POLYGON ((600158.100 6643817.750, 600157.600 6..."
3,4,1951.0,101189.0,601,1,0,laserdata 2021,Løvtre,Betula,European white birch,...,5.7964,5.93836,329.36783,26.999996,29.24999,22161.4414,82.225548,22161.0,82.0,"POLYGON ((599866.100 6643679.250, 599864.600 6..."
4,5,1954.0,101169.0,654,1,0,laserdata 2021,Løvtre,Acer,Norway maple,...,23.6911,24.98433,1569.19147,24.999996,21.999993,6302.82861,262.773773,6303.0,263.0,"POLYGON ((599823.600 6643651.750, 599822.600 6..."


Loaded /home/NINA.NO/willeke.acampo/Mounts/P-Prosjekter2/152022_itree_eco_ifront_synliggjore_trars_rolle_i_okosyst/TEMP/oslo/02_intermediate/oslo_tree_crowns.parquet


### Load all other tables and fan out based on DISTRICT NUMBER value (not geometry)

In [4]:
def export_by_district(gdf_dict, district_list, col_district, out_dir=None):
    """Write one GeoParquet and one GeoJSON file per table and district.

    For every GeoDataFrame in ``gdf_dict`` that has the column
    ``col_district``, the rows of each district are written to
    ``<out_dir>/per_district/parquet/<name>_<number>.parquet`` and
    ``<out_dir>/per_district/geojson/<name>_<number>.geojson``.
    The two output folders are created when they do not exist. Files that
    already exist are left untouched; the Parquet and GeoJSON files are
    checked independently, so a missing GeoJSON is still written when the
    Parquet file is already present.

    Args:
        gdf_dict: Mapping of table name to GeoDataFrame.
        district_list: District numbers to export.
        col_district: Name of the column holding the district number.
            Tables without this column are skipped.
        out_dir: Base output directory. Defaults to the notebook-level
            ``interim_dir``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    base_dir = interim_dir if out_dir is None else out_dir
    parquet_dir = os.path.join(base_dir, "per_district", "parquet")
    geojson_dir = os.path.join(base_dir, "per_district", "geojson")
    os.makedirs(parquet_dir, exist_ok=True)
    os.makedirs(geojson_dir, exist_ok=True)

    # Only tables that carry the district column can be fanned out; select
    # them once instead of re-checking for every district.
    tables = {
        name: gdf for name, gdf in gdf_dict.items() if col_district in gdf.columns
    }

    for number in district_list:
        for name, gdf in tables.items():
            print(f"Exporting {name} for district {number}")

            gdf_fan = gdf[gdf[col_district] == number]

            # nothing to write for this table/district combination
            if gdf_fan.empty:
                print(f"{name} for district {number} is an empty gdf")
                continue

            parquet_file = os.path.join(parquet_dir, f"{name}_{number}.parquet")
            geojson_file = os.path.join(geojson_dir, f"{name}_{number}.geojson")

            if os.path.exists(parquet_file):
                print(f"Parquet file for {name} and district {number} already exists")
            else:
                gdf_fan.to_parquet(
                    path=parquet_file,
                    index=None,
                    compression="snappy",
                )

            if os.path.exists(geojson_file):
                print(f"Geojson file for {name} and district {number} already exists")
            else:
                gdf_fan.to_file(
                    geojson_file,
                    driver='GeoJSON',
                )
        

In [5]:
# Add alias columns to the tree crowns table: "area_code" mirrors
# "delomradenummer" and "nb_code" mirrors "grunnkretsnummer".
tree_crowns = gdf_dict["tree_crowns"]
for alias, source in (("area_code", "delomradenummer"), ("nb_code", "grunnkretsnummer")):
    tree_crowns[alias] = tree_crowns[source]

# Preview the first rows including the new columns
display(tree_crowns.head())

Unnamed: 0,OBJECTID,tree_id,crown_id_2014,crown_id_2021,itree_spec,teig_undervisning,crown_origin,norwegian_name,taxon_genus,common_name,...,totben_cap,Shape_Length,Shape_Area,TB_CAP,TB_CAP_CA,totben_cap2014,totben_cap_crown_area,geometry,area_code,nb_code
0,1,1938.0,105123.0,194,1,0,laserdata 2021,Løvtre,Sorbus,Whitebeam,...,1886.487974,34.999994,44.749985,6812.57031,75.828293,6813.0,76.0,"POLYGON ((600238.600 6643856.750, 600238.100 6...",30142.0,
1,2,1940.0,101388.0,212,1,0,laserdata 2021,Løvtre,Sorbus,European mountain ash,...,2168.098459,48.999992,64.999979,6140.44043,110.882858,6140.0,111.0,"POLYGON ((600217.600 6643845.750, 600215.600 6...",30142.0,
2,3,1936.0,101354.0,276,1,0,laserdata 2021,Løvtre,Populus,cottonwood spp,...,106.038059,45.999992,80.249974,14302.9336,83.836853,14303.0,84.0,"POLYGON ((600158.100 6643817.750, 600157.600 6...",30142.0,
3,4,1951.0,101189.0,601,1,0,laserdata 2021,Løvtre,Betula,European white birch,...,329.36783,26.999996,29.24999,22161.4414,82.225548,22161.0,82.0,"POLYGON ((599866.100 6643679.250, 599864.600 6...",30142.0,
4,5,1954.0,101169.0,654,1,0,laserdata 2021,Løvtre,Acer,Norway maple,...,1569.19147,24.999996,21.999993,6302.82861,262.773773,6303.0,263.0,"POLYGON ((599823.600 6643651.750, 599822.600 6...",30142.0,


In [6]:
# Make sure the alias columns exist on the tree crowns table
tree_crowns = gdf_dict["tree_crowns"]
tree_crowns["area_code"] = tree_crowns["delomradenummer"]
tree_crowns["nb_code"] = tree_crowns["grunnkretsnummer"]

# Fan out twice: first on the regular district column, then on the
# "area_code" column used by the tree crowns
for district_column in ("delomradenummer", "area_code"):
    export_by_district(gdf_dict, district_list, district_column)


Exporting districts for district 30101
Parquet file for districts and district 30101 already exists
Exporting bldg for district 30101
Parquet file for bldg and district 30101 already exists
Exporting res_bldg for district 30101
Parquet file for res_bldg and district 30101 already exists
Exporting green_space for district 30101
Parquet file for green_space and district 30101 already exists
Exporting open_space for district 30101
Parquet file for open_space and district 30101 already exists
Exporting public_open_space for district 30101
Parquet file for public_open_space and district 30101 already exists
Exporting private_open_space for district 30101
Parquet file for private_open_space and district 30101 already exists
Exporting tree_crowns for district 30101
Exporting districts for district 30102
Parquet file for districts and district 30102 already exists
Exporting bldg for district 30102
Parquet file for bldg and district 30102 already exists
Exporting res_bldg for district 30102
Par

### Load all other tables and SPATIALLY split by district

In [7]:
# DO NOT RUN THIS CELL AUTOMATICALLY
# The SystemExit below stops the cell immediately, so the loop that follows
# is currently unreachable.
raise SystemExit("Stop right there!")

# NOTE(review): `duckdb` is used below but is not imported in the visible
# cells - confirm it is imported before enabling this cell.

for district_number in district_list:
    # Open a fresh in-memory DuckDB database for this district and enable
    # the spatial extension.
    con = duckdb.connect(database=":memory:", read_only=False)
    con.install_extension("spatial")
    con.load_extension("spatial")

    # create a duckdb table for the district
    # NOTE(review): this expects `district_<number>.parquet` directly in
    # interim_dir, while export_by_district writes
    # `per_district/parquet/<name>_<number>.parquet` - confirm the path.
    district_path = os.path.join(interim_dir, f"district_{district_number}.parquet")
    print(f"District number: {district_number}")

    # NOTE(review): the converted geometry column has no alias here, so the
    # table keeps the original `geometry` column plus an extra unnamed
    # expression column - presumably intentional, verify.
    con.execute(
        f"""
        CREATE TABLE district_{district_number}
        AS SELECT *, ST_GeomFromWKB(geometry) 
        FROM parquet_scan('{district_path}')
        """
    )

    # Load all other tables
    # (every table except 'districts' is re-read from its Parquet file for
    # each district, because the connection is recreated per iteration)
    for key,table in zip(parquet_dict.keys(), table_names):
        if table != 'districts':
            con.execute(
                f"""
                CREATE TABLE {table} 
                AS SELECT *, ST_GeomFromWKB(geometry) 
                FROM parquet_scan('{parquet_dict[key]}')
                """
            )

    # Spatially clip all other tables to geometry of 'district_{district_number}' and create a new table {table}_{district_number} and export it to {table}_{district_number}.parquet
    # NOTE(review): the filter is ST_Intersects, i.e. rows that intersect the
    # district are selected as a whole; geometries are not cut at the
    # district border.
    # NOTE(review): `SELECT *, ... as geometry` yields two columns named
    # `geometry` - confirm DuckDB handles this as intended.
    for table in table_names:
        if table != 'districts':
            con.execute(
                f"""
                CREATE TABLE {table}_{district_number} 
                AS SELECT *, ST_GeomFromWKB(geometry) as geometry
                FROM {table} 
                WHERE ST_Intersects(ST_GeomFromWKB(geometry), (SELECT ST_GeomFromWKB(geometry) FROM district_{district_number}))
                """
            )
            # write the per-district table to <interim_dir>/<table>_<number>.parquet
            con.execute(
                f"""
                COPY (SELECT * FROM {table}_{district_number}) TO '{interim_dir}/{table}_{district_number}.parquet' (FORMAT 'parquet')
                """
            )
            
    # release the in-memory database before moving to the next district
    con.close()

SystemExit: Stop right there!

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:

# DO NOT RUN THIS CELL AUTOMATICALLY
# Earlier draft of the GeoJSON -> GeoParquet conversion. The SystemExit
# below stops the cell immediately, so nothing after it runs unless the
# guard is removed.
raise SystemExit("Stop right there!")

# {file_name: path of its GeoParquet file in the interim directory}
parquet_dict = {
    name: os.path.join(interim_dir, f"{name}.parquet")
    for name in file_names}

# {table_name: gdf}
gdf_dict = {}

# Tree crowns are the only table that keeps polygons of 1 m2 or less.
# The keys of parquet_dict carry the municipality prefix.
tree_crowns_key = f"{municipality}_tree_crowns"

# Check if the parquet files exist, if not convert to parquet
for key, tbl in zip(parquet_dict.keys(), table_names):
    parquet_path = parquet_dict[key]

    # ------------------------------
    # IF PARQUET FILE DOES NOT EXIST
    # ------------------------------
    if not os.path.exists(parquet_path):
        print(f"PARQUET file for {key} does NOT exist, converting raw file")

        # convert the raw .geojson / .shp file whose name matches this key
        for raw_file in os.listdir(raw_dir):
            stem, ext = os.path.splitext(raw_file)
            if stem != key or ext not in (".geojson", ".shp"):
                continue

            gdf = gpd.read_file(os.path.join(raw_dir, raw_file))

            print(f"CRS of {key} is {gdf.crs}")
            # reproject first so the area threshold is applied in epsg:25832
            if gdf.crs.to_epsg() != 25832:
                print(f"Reprojecting {key} to epsg:25832")
                gdf = gdf.to_crs(epsg=25832)

            # remove areas smaller than 1m2 for all files except tree crowns
            if key != tree_crowns_key:
                print(f"Removing areas smaller than 1 m2 from {key}")
                len_before = len(gdf)
                gdf = gdf[gdf.area > 1]
                len_after = len(gdf)
                print(f"Removed {len_before - len_after} rows")

            # export to parquet
            print(f"Export {raw_file} to parquet")
            gdf.to_parquet(
                path=parquet_path,
                index=None,
                compression="snappy"
            )

    print(f"PARQUET file for {key} EXISTS, add to GDF_DICT")

    # -----------------
    # OPEN PARQUET FILE
    # -----------------
    gdf = gpd.read_parquet(parquet_path)

    print(f"CRS of {key} is {gdf.crs}")

    # if epsg is not 25832, reproject (before filtering on area)
    if gdf.crs.to_epsg() != 25832:
        print(f"Reprojecting {key} to epsg:25832")
        gdf = gdf.to_crs(epsg=25832)

    # remove areas smaller than 1m2 for all files except tree crowns
    if key != tree_crowns_key:
        print(f"Removing areas smaller than 1 m2 from {key}")
        len_before = len(gdf)
        gdf = gdf[gdf.area > 1]
        len_after = len(gdf)
        print(f"Removed {len_before - len_after} rows")

    # add to gdf_dict
    print(f"Adding {tbl} to gdf_dict")
    gdf_dict[tbl] = gdf