# Conversion des bases en GeoParquet

## 1. La base dvf

In [None]:
import requests
import os

url = "https://files.data.gouv.fr/geo-dvf/latest/csv/2022/full.csv.gz"
file_name = "dvf.csv.gz"

# Download the archive only once; later runs reuse the local copy.
if not os.path.exists(file_name):
    # Write to a temporary name first so that an interrupted transfer never
    # leaves a truncated file that the existence check above would accept
    # on the next run.
    partial_name = f"{file_name}.part"

    # stream=True avoids holding the whole archive in memory, and the timeout
    # prevents the cell from hanging forever on a stalled connection.
    with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code == 200:
            with open(partial_name, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
            # Promote the finished download to its final name in one step.
            os.replace(partial_name, file_name)
            print("Téléchargement réussi.")
        else:
            print(f"Échec du téléchargement. Code d'état : {response.status_code}")
else:
    print(f"Le fichier '{file_name}' existe déjà. Aucun téléchargement nécessaire.")

In [None]:
import pandas as pd
import geopandas as gpd
# Read the commune and département codes as text rather than numbers
# (presumably to keep leading zeros and non-numeric codes intact).
dvf = pd.read_csv("dvf.csv.gz", dtype={'code_commune': "str", "code_departement": "str"})

# Build point geometries from the longitude/latitude columns and declare the
# CRS directly in the constructor. `GeoDataFrame.set_crs` returns a new object
# by default, so calling it without keeping the result leaves the CRS unset.
gdf = gpd.GeoDataFrame(
    dvf,
    geometry=gpd.points_from_xy(x=dvf.longitude, y=dvf.latitude),
    crs="EPSG:4326",
)

# Cast generic `object` columns to the dedicated pandas string dtype
# (presumably so they get a consistent type when written to Parquet).
object_cols = gdf.select_dtypes(['object']).columns
gdf[object_cols] = gdf[object_cols].astype('string')


In [None]:
# Write the GeoDataFrame (attributes and geometry column) to "dvf.parquet".
gdf.to_parquet("dvf.parquet")

In [None]:
import duckdb
# Install the DuckDB spatial extension, then load it into the current
# session; the two statements run in this order.
for _statement in ("INSTALL spatial;", "LOAD spatial;"):
    duckdb.execute(_statement)

In [None]:
# Query the Parquet file directly through DuckDB. This is the cell's last
# expression, so the resulting relation is presumably rendered by the notebook.
duckdb.sql("SELECT * FROM read_parquet('dvf.parquet')")

In [None]:
reference_lon = 2.3602  # Replace with your reference longitude
reference_lat = 48.9245 # Replace with your reference latitude

# Half-width of the search box, in degrees. 0.009° is roughly 1 km of
# latitude; the same angle of longitude covers less ground away from the
# equator (about 0.66 km at this latitude), so the box is only approximate.
buffer_distance_deg = 0.009

# Bounding-box filter on the raw coordinate columns.
# The previous query was invalid SQL ("WHERE AND ...") and filtered on a
# `bbox` struct that no relation in the query defines. It also computed an
# ST_Point column that was never used; the points are rebuilt below instead.
toto = duckdb.sql(
    f"""
    SELECT *
    FROM read_parquet('dvf.parquet')
    WHERE
        longitude BETWEEN {reference_lon - buffer_distance_deg} AND {reference_lon + buffer_distance_deg}
        AND latitude BETWEEN {reference_lat - buffer_distance_deg} AND {reference_lat + buffer_distance_deg}
    """
).to_df()

# Rebuild a GeoDataFrame from the filtered rows: drop the geometry column
# read back from Parquet and recreate points from the coordinate columns.
# The columns are lowercase (`longitude`, `latitude`); the uppercase
# attribute names used before raised AttributeError.
df = gpd.GeoDataFrame(
    toto.drop("geometry", axis="columns"),
    geometry=gpd.points_from_xy(toto.longitude, toto.latitude),
    crs="EPSG:4326",
)


AttributeError: 'DataFrame' object has no attribute 'LONGITUDE'

In [None]:

# Expose the Parquet file as a view named `dvf` so later cells can query it
# by name. The path is a single-quoted SQL string literal (double quotes
# denote identifiers in SQL), matching the other queries in this notebook;
# the string has no placeholders, so no f-prefix is needed.
duckdb.sql("CREATE OR REPLACE VIEW dvf AS SELECT * FROM read_parquet('dvf.parquet')")

In [None]:
# Sanity-check the `dvf` view: DuckDB returns at most 5 rows, which are
# converted to a pandas DataFrame; head(2) then keeps only the first two.
duckdb.sql("SELECT * FROM dvf LIMIT 5").df().head(2)