# Reduce size of a geojson file
- [geopandas](https://geopandas.org/en/stable/index.html)
- [shapely](https://shapely.readthedocs.io/en/stable/index.html)

In [1]:
import os
import re
import zipfile
from pathlib import Path

import geopandas
import pandas as pd
import shapely

In [2]:
DATA_PATH = Path().resolve().parent.parent.parent/"data"
print(DATA_PATH)

/home/pyuser/workspace/data


In [3]:
target = DATA_PATH/"datasets"/"geojson"/"acores_central_concelhos.geojson"

raw_size = os.path.getsize(target)
raw_geojson = geopandas.read_file(target)
print(raw_size)
raw_geojson.head()

3809372


Unnamed: 0,Dico,ILHA,N_FREGUESI,NUT1_DSG,NUT2_DSG,NUT3_DSG,AREA_HA,MUNICIPIO,Ilha_1,geometry
0,4301,ILHA TERCEIRA (AÇORES),19,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,23899.66,ANGRA DO HEROÍSMO,,"MULTIPOLYGON (((476244.715 4294864.237, 476268..."
1,4302,ILHA TERCEIRA (AÇORES),11,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,16127.06,PRAIA DA VITÓRIA,,"MULTIPOLYGON (((477611.616 4294952.730, 477610..."
2,4401,ILHA DA GRACIOSA (AÇORES),4,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,6065.78,SANTA CRUZ DA GRACIOSA,,"MULTIPOLYGON (((411000.327 4327966.149, 411001..."
3,4501,ILHA DE SÃO JORGE (AÇORES),5,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,12626.4,CALHETA DE S. JORGE,,"MULTIPOLYGON (((412150.957 4279935.107, 412172..."
4,4502,ILHA DE SÃO JORGE (AÇORES),6,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,11738.38,VELAS,,"MULTIPOLYGON (((385605.028 4290516.349, 385603..."


## Reduce/optimize types of features

In [4]:
feature_optimized_geojson = raw_geojson.drop(columns=["Ilha_1", "NUT2_DSG", "NUT3_DSG"])
feature_optimized_geojson.head()

Unnamed: 0,Dico,ILHA,N_FREGUESI,NUT1_DSG,AREA_HA,MUNICIPIO,geometry
0,4301,ILHA TERCEIRA (AÇORES),19,REGIÃO AUTÓNOMA DOS AÇORES,23899.66,ANGRA DO HEROÍSMO,"MULTIPOLYGON (((476244.715 4294864.237, 476268..."
1,4302,ILHA TERCEIRA (AÇORES),11,REGIÃO AUTÓNOMA DOS AÇORES,16127.06,PRAIA DA VITÓRIA,"MULTIPOLYGON (((477611.616 4294952.730, 477610..."
2,4401,ILHA DA GRACIOSA (AÇORES),4,REGIÃO AUTÓNOMA DOS AÇORES,6065.78,SANTA CRUZ DA GRACIOSA,"MULTIPOLYGON (((411000.327 4327966.149, 411001..."
3,4501,ILHA DE SÃO JORGE (AÇORES),5,REGIÃO AUTÓNOMA DOS AÇORES,12626.4,CALHETA DE S. JORGE,"MULTIPOLYGON (((412150.957 4279935.107, 412172..."
4,4502,ILHA DE SÃO JORGE (AÇORES),6,REGIÃO AUTÓNOMA DOS AÇORES,11738.38,VELAS,"MULTIPOLYGON (((385605.028 4290516.349, 385603..."


In [5]:
feature_optimized_geojson.to_file(DATA_PATH/"datasets"/"geojson"/"acores_central_concelhos_feature_optimized.geojson", driver="GeoJSON")

In [6]:
feature_optimized_size = os.path.getsize(DATA_PATH/"datasets"/"geojson"/"acores_central_concelhos_feature_optimized.geojson")
print("Optimized size:", feature_optimized_size)
print("Decrease %:", round(((feature_optimized_size/raw_size)-1)*100, 5))

Optimized size: 3808418
Decrease %: -0.02504


## Remove white space in file
- columns of type string need attention to not remove space between words

In [7]:
whitespace_optimized_geojson = raw_geojson.replace(" ", "_", regex=True).to_json(ensure_ascii=False)
print(len(whitespace_optimized_geojson))
whitespace_optimized_geojson[:1000]

2911002


'{"type": "FeatureCollection", "features": [{"id": "0", "type": "Feature", "properties": {"Dico": "4301", "ILHA": "ILHA_TERCEIRA_(AÇORES)", "N_FREGUESI": "19", "NUT1_DSG": "REGIÃO_AUTÓNOMA_DOS_AÇORES", "NUT2_DSG": "REGIÃO_AUTÓNOMA_DOS_AÇORES", "NUT3_DSG": "REGIÃO_AUTÓNOMA_DOS_AÇORES", "AREA_HA": "23899.66", "MUNICIPIO": "ANGRA_DO_HEROÍSMO", "Ilha_1": null}, "geometry": {"type": "MultiPolygon", "coordinates": [[[[476244.71494166786, 4294864.2366526285], [476268.4333811403, 4294860.226543266], [476286.96213518886, 4294860.726458786], [476313.25041803933, 4294854.236437442], [476338.7786414076, 4294847.236319872], [476348.8779605238, 4294845.236273366], [476362.3071012544, 4294845.236212132], [476365.09694130113, 4294844.726198705], [476367.6866903216, 4294843.736286588], [476370.52652228496, 4294842.226273765], [476372.936374672, 4294839.726262071], [476375.21623868047, 4294838.2362512555], [476378.4059562306, 4294837.726236863], [476383.9356430812, 4294837.726211553], [476406.6541632897

In [8]:
optimized_file_str = re.sub(r"\s", "", whitespace_optimized_geojson)

print(len(optimized_file_str))
print(optimized_file_str[:1000])

2766529
{"type":"FeatureCollection","features":[{"id":"0","type":"Feature","properties":{"Dico":"4301","ILHA":"ILHA_TERCEIRA_(AÇORES)","N_FREGUESI":"19","NUT1_DSG":"REGIÃO_AUTÓNOMA_DOS_AÇORES","NUT2_DSG":"REGIÃO_AUTÓNOMA_DOS_AÇORES","NUT3_DSG":"REGIÃO_AUTÓNOMA_DOS_AÇORES","AREA_HA":"23899.66","MUNICIPIO":"ANGRA_DO_HEROÍSMO","Ilha_1":null},"geometry":{"type":"MultiPolygon","coordinates":[[[[476244.71494166786,4294864.2366526285],[476268.4333811403,4294860.226543266],[476286.96213518886,4294860.726458786],[476313.25041803933,4294854.236437442],[476338.7786414076,4294847.236319872],[476348.8779605238,4294845.236273366],[476362.3071012544,4294845.236212132],[476365.09694130113,4294844.726198705],[476367.6866903216,4294843.736286588],[476370.52652228496,4294842.226273765],[476372.936374672,4294839.726262071],[476375.21623868047,4294838.2362512555],[476378.4059562306,4294837.726236863],[476383.9356430812,4294837.726211553],[476406.6541632897,4294838.726107615],[476415.05359588226,4294840.236

In [9]:
with open(DATA_PATH/"datasets"/"geojson"/"acores_central_concelhos_whitespace_optimized.geojson", "w") as text_file:
    text_file.write(optimized_file_str)

In [10]:
whitespace_optimized_size = os.path.getsize(DATA_PATH/"datasets"/"geojson"/"acores_central_concelhos_whitespace_optimized.geojson")
print("Optimized size:", whitespace_optimized_size)
print("Decrease %:", round(((whitespace_optimized_size/raw_size)-1)*100, 5))

Optimized size: 2766624
Decrease %: -27.37323


## Reduce decimal precision of points
- based on answer https://gis.stackexchange.com/a/476234
- using [shapely.set_precision()](https://shapely.readthedocs.io/en/stable/reference/shapely.set_precision.html#shapely.set_precision)
- more about precison on https://xkcd.com/2170/

In [11]:
geometry_optimized_geojson = raw_geojson.assign(geometry=lambda x: shapely.set_precision(x["geometry"], grid_size=0.000001))
geometry_optimized_geojson.head()

Unnamed: 0,Dico,ILHA,N_FREGUESI,NUT1_DSG,NUT2_DSG,NUT3_DSG,AREA_HA,MUNICIPIO,Ilha_1,geometry
0,4301,ILHA TERCEIRA (AÇORES),19,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,23899.66,ANGRA DO HEROÍSMO,,"POLYGON ((476268.433 4294860.227, 476286.962 4..."
1,4302,ILHA TERCEIRA (AÇORES),11,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,16127.06,PRAIA DA VITÓRIA,,"POLYGON ((477610.056 4294950.230, 477609.116 4..."
2,4401,ILHA DA GRACIOSA (AÇORES),4,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,6065.78,SANTA CRUZ DA GRACIOSA,,"POLYGON ((411001.887 4327964.159, 411002.886 4..."
3,4501,ILHA DE SÃO JORGE (AÇORES),5,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,12626.4,CALHETA DE S. JORGE,,"POLYGON ((412172.735 4279930.607, 412193.384 4..."
4,4502,ILHA DE SÃO JORGE (AÇORES),6,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,REGIÃO AUTÓNOMA DOS AÇORES,11738.38,VELAS,,"POLYGON ((385603.838 4290513.349, 385603.808 4..."


In [12]:
geometry_optimized_geojson.to_file(DATA_PATH/"datasets"/"geojson"/"acores_central_concelhos_precision_optimized.geojson", driver="GeoJSON")

In [13]:
geometry_optimized_size = os.path.getsize(DATA_PATH/"datasets"/"geojson"/"acores_central_concelhos_precision_optimized.geojson")
print("Optimized size:", geometry_optimized_size)
print("Decrease %:", round(((geometry_optimized_size/whitespace_optimized_size)-1)*100, 5))

Optimized size: 3455216
Decrease %: 24.88925


## Compress geojson with zip
- geopandas can decompress zip but not gzip

In [14]:
with open(target, 'r') as f_in:
    with zipfile.ZipFile(DATA_PATH/"datasets"/"geojson"/"acores_central_concelhos_compressed_optimized.geojson.zip", mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("acores_central_concelhos_compressed_optimized.geojson", f_in.read())

In [15]:
compressed_optimized_size = os.path.getsize(DATA_PATH/"datasets"/"geojson"/"acores_central_concelhos_compressed_optimized.geojson.zip")
print("Optimized size:", compressed_optimized_size)
print("Decrease %:", round(((compressed_optimized_size/raw_size)-1)*100, 5))

Optimized size: 1535482
Decrease %: -59.69199


## Combining methods

In [16]:
def write_optimized_geojson(
    raw_geojson,
    output_file,
    precision=None,
    features=None,
    remove_whitespace=True,
    zip=True,
):

    if not isinstance(output_file, Path):
        raise TypeError("output_file should be an instance of Path")
    geojson = raw_geojson.copy()

    if precision:
        geojson["geometry"] = shapely.set_precision(
            geojson["geometry"], grid_size=precision
        )

    if features:
        geojson = geojson.drop(columns=features)

    if remove_whitespace:
        geojson_string = re.sub(
            r"\s", "", geojson.replace(" ", "_", regex=True).to_json(ensure_ascii=False)
        )
    else:
        geojson_string = geojson.to_json(ensure_ascii=False)

    if zip:
        _zip_name = output_file.with_suffix(output_file.suffix + ".zip")
        with zipfile.ZipFile(
            _zip_name, mode="w", compression=zipfile.ZIP_DEFLATED
        ) as zf:
            zf.writestr(output_file.name, geojson_string)
    else:
        with open(output_file, "w") as text_file:
            text_file.write(geojson_string)

write_optimized_geojson(
    raw_geojson,
    DATA_PATH/"datasets"/"geojson"/"acores_central_concelhos_full_optimized.geojson",
    precision=0.000001,
    features=["Ilha_1", "NUT2_DSG", "NUT3_DSG"]
)

In [17]:
full_optimized_size = os.path.getsize(DATA_PATH/"datasets"/"geojson"/"acores_central_concelhos_full_optimized.geojson.zip")
print("Optimized size:", full_optimized_size)
print("Decrease %:", round(((full_optimized_size/raw_size)-1)*100, 5))

Optimized size: 881574
Decrease %: -76.85776
