In [26]:
import geopandas as gpd
import pandas as pd
import requests
import gzip
import shutil
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

# Remote gzip-compressed CSV of building polygons (the "009" S2 level-4 tile
# of the open-buildings v3 dataset, per the URL path).
DATA_URL = "https://storage.googleapis.com/open-buildings-data/v3/polygons_s2_level_4_gzip/009_buildings.csv.gz"
# Local cache directory and the paths of the downloaded archive and the
# decompressed CSV inside it.
DATA_DIR = Path("data")
GZIP_PATH = DATA_DIR / "009_buildings.csv.gz"
CSV_PATH = DATA_DIR / "009_buildings.csv"
# Target point as (longitude, latitude): index 0 is used as x and index 1 as y
# in EPSG:4326 by calculate_distances.
CRISTO_COORDS = (-43.2105, -22.9519)  # Cristo Redentor coordinates

def download_file(url, output_path, chunk_size=1024, timeout=30):
    """Stream the resource at *url* into the file *output_path*.

    The body is written to a sibling ``<name>.part`` file first and only
    renamed to *output_path* once the whole transfer succeeded, so an
    interrupted download never leaves a truncated file that a later
    ``exists()`` check would mistake for a complete one.

    Args:
        url: HTTP(S) URL to fetch.
        output_path: Destination file path (``str`` or ``Path``).
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without it a stalled connection would block forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the destination cannot be written.
    """
    output_path = Path(output_path)
    tmp_path = output_path.with_name(output_path.name + ".part")
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(tmp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        # Publish the file only after every byte has been written.
        tmp_path.replace(output_path)
    except BaseException:
        # Drop the partial download, then let the original error propagate.
        if tmp_path.exists():
            tmp_path.unlink()
        raise

def extract_gzip(input_path, output_path):
    """Decompress the gzip file *input_path* into *output_path*.

    The data is decompressed into a sibling ``<name>.part`` file and renamed
    to *output_path* only after decompression finished, so a corrupt or
    truncated archive never leaves a partial output file behind.

    Args:
        input_path: Path of the ``.gz`` archive to read.
        output_path: Path of the decompressed file to create (``str`` or
            ``Path``); an existing file is replaced.

    Raises:
        OSError: If the input is missing, is not valid gzip data, or the
            output cannot be written.
        EOFError: If the compressed stream ends prematurely.
    """
    output_path = Path(output_path)
    tmp_path = output_path.with_name(output_path.name + ".part")
    try:
        with gzip.open(input_path, 'rb') as f_in, open(tmp_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        # Both handles are closed here, so the rename is safe on all platforms.
        tmp_path.replace(output_path)
    except BaseException:
        # Remove the incomplete output, then let the original error propagate.
        if tmp_path.exists():
            tmp_path.unlink()
        raise

def download_and_extract():
    """Ensure the decompressed buildings CSV exists in ``DATA_DIR``.

    Creates ``DATA_DIR`` if needed, downloads the archive to ``GZIP_PATH``
    when it is missing, then decompresses it to ``CSV_PATH`` when that is
    missing.

    The two steps run strictly one after the other: extraction reads the file
    the download produces, so running them concurrently would let extraction
    start on a missing or half-written archive. Calling the helpers directly
    also means any failure is raised to the caller instead of being lost in an
    unchecked future.
    """
    DATA_DIR.mkdir(exist_ok=True)

    if not GZIP_PATH.exists():
        download_file(DATA_URL, GZIP_PATH)

    if not CSV_PATH.exists():
        extract_gzip(GZIP_PATH, CSV_PATH)

def load_data_to_geodataframe(csv_path):
    """Read the buildings CSV and return it as a point GeoDataFrame.

    Only the columns needed downstream are loaded. Each row gets a point
    geometry built from its ``longitude``/``latitude`` values, and the frame
    is tagged with the EPSG:4326 coordinate reference system.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A GeoDataFrame with the selected columns plus a ``geometry`` column.
    """
    wanted_columns = [
        'longitude',
        'latitude',
        'full_plus_code',
        'area_in_meters',
        'confidence',
    ]
    table = pd.read_csv(csv_path, usecols=wanted_columns)
    points = gpd.points_from_xy(table.longitude, table.latitude)
    return gpd.GeoDataFrame(table, geometry=points, crs="EPSG:4326")

def calculate_distances(gdf, target_coords):
    """Return a reprojected copy of *gdf* with a ``distance`` column.

    Both the input frame and the target point are transformed from EPSG:4326
    to EPSG:3857, and the distance from every geometry to the target is stored
    in the new ``distance`` column. The input frame itself is not modified;
    the returned frame is in EPSG:3857.

    NOTE(review): EPSG:3857 is Web Mercator, so the values are projected
    units rather than exact ground metres. That is adequate for ranking
    nearby features, but verify before treating them as real-world distances.

    Args:
        gdf: GeoDataFrame whose geometries can be reprojected to EPSG:3857.
        target_coords: Sequence whose first two items are the x (longitude)
            and y (latitude) of the target in EPSG:4326.

    Returns:
        The reprojected GeoDataFrame including the ``distance`` column.
    """
    working_crs = "EPSG:3857"
    lon, lat = target_coords[0], target_coords[1]

    # Build the target as a one-element GeoSeries so it can be reprojected
    # in the same way as the data.
    target_wgs84 = gpd.GeoSeries(
        [gpd.points_from_xy([lon], [lat])[0]],
        crs="EPSG:4326",
    )

    projected = gdf.to_crs(working_crs)
    target_geom = target_wgs84.to_crs(working_crs).iloc[0]
    projected['distance'] = projected.geometry.distance(target_geom)
    return projected

def get_closest_building(gdf):
    """Return the ``full_plus_code`` of the row with the smallest ``distance``.

    Args:
        gdf: Frame containing ``distance`` and ``full_plus_code`` columns.

    Returns:
        The ``full_plus_code`` value of the row whose ``distance`` is minimal.
    """
    nearest_label = gdf['distance'].idxmin()
    return gdf.loc[nearest_label, 'full_plus_code']

def pipeline():
    """Fetch the data, find the building nearest the target, and print it.

    Steps: make sure the CSV is available locally, load it as a
    GeoDataFrame, compute each building's distance to ``CRISTO_COORDS``, and
    print the plus code of the closest one.
    """
    # Step 1: acquire the raw data.
    download_and_extract()

    # Step 2: load it and attach distances to the target point.
    buildings = calculate_distances(
        load_data_to_geodataframe(CSV_PATH),
        CRISTO_COORDS,
    )

    # Step 3: report the nearest building.
    building_code = get_closest_building(buildings)
    print(f"Closest building code: {building_code}")

# Run the full pipeline as soon as this cell/script is executed.
pipeline()


Closest building code: 589R2QXQ+6JRW
