In [1]:
import polars as pl
import os

# Récupérations des données

## Etape de scrapping (Facultative)

In [2]:
!python script_scrapping_dvf.py

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

or by setting POLARS_ALLOW_FORKING_THREAD=1.

  pid, fd = os.forkpty()


Fetching data for year: 2019
File already exists: 01.csv.gz
File already exists: 02.csv.gz
File already exists: 03.csv.gz
File already exists: 04.csv.gz
File already exists: 05.csv.gz
File already exists: 06.csv.gz
File already exists: 07.csv.gz
File already exists: 08.csv.gz
File already exists: 09.csv.gz
File already exists: 10.csv.gz
File already exists: 11.csv.gz
File already exists: 12.csv.gz
File already exists: 13.csv.gz
File already exists: 14.csv.gz
File already exists: 15.csv.gz
File already exists: 16.csv.gz
File already exists: 17.csv.gz
File already exists: 18.csv.gz
File already exists: 19.csv.gz
File already exists: 21.csv.gz
File already exists: 22.csv.gz
File already exists: 23.csv.gz
File already exists: 24.csv.gz
File already exists: 25.csv.gz
File already exists: 26.csv.gz
File already exists: 27.csv.gz
File already exists: 28.csv.gz
File already exists: 29.csv.gz
File already exists: 2A.csv.gz
File already exists: 2B.csv.gz
File already exists: 30.csv.gz
File alrea

## Chargement dans un DataFrame

### Fonctions

In [31]:
def columns_name_and_type(dataframe):
    """
    Crée un dictionnaire avec pour clé le nom et pour valeur le type

    Parameters:
        dataframe (pl.DataFrame): Le DataFrame Polars à analyser.

    Returns:
        dict: Un dictionnaire où les clés sont les noms des colonnes,
              et les valeurs leur type.
    """
    dict_name_type = {}
    for column_name, column_type in dataframe.schema.items():
        # Convertir le type en chaîne de caractères pour l'utiliser comme clé
        dict_name_type[column_name] = column_type
    return dict_name_type

def change_column_types(df, columns_dtypes):
    """
    Change les types de plusieurs colonnes dans un DataFrame Polars.

    Parameters:
        df (pl.DataFrame): Le DataFrame Polars.
        columns_dtypes (dict): Un dictionnaire où les clés sont les noms de colonnes
                               et les valeurs sont les nouveaux types (ex: pl.Int64, pl.Float64).

    Returns:
        pl.DataFrame: Un nouveau DataFrame avec les colonnes modifiées.
    """
    return df.with_columns([
        pl.col(col).cast(dtype) for col, dtype in columns_dtypes.items()
    ]).with_columns([pl.col(col).cast(pl.Float64) for col in df.columns if df[col].dtype == pl.Int64])

def data_loader(path, departements = [], annees = []):

    df = pl.DataFrame()
    if not annees:
        annees_list = os.listdir(path)
    else:
        annees_list =[str(annee) for annee in annees]
    for annee in annees_list:
        cur_year = os.path.join(path,annee)
        if not departements:
            departements = os.listdir(cur_year)
        else:
            departements_list = [f"{departement}.csv.gz" if departement>9 else f"0{departement}.csv.gz" for departement in departements]
        for departement in departements_list:
            file = os.path.join(cur_year,departement)
            temp_df = pl.read_csv(file,ignore_errors=True)
            if df.is_empty():
                column_by_type = columns_name_and_type(temp_df)
            temmp_df = change_column_types(temp_df,column_by_type)
            df = pl.concat([df, temp_df])

    return df


In [36]:
path = 'data_dvf'
df_2023 = pl.read_csv('data_dvf/2023/75.csv.gz',ignore_errors=True)
dict_2023 = columns_name_and_type(df_2023)
df_2023 = change_column_types(df_2023,dict_2023)
dict_2023 = columns_name_and_type(df_2023)
dict_2023

{'id_mutation': String,
 'date_mutation': String,
 'numero_disposition': Float64,
 'nature_mutation': String,
 'valeur_fonciere': Float64,
 'adresse_numero': Float64,
 'adresse_suffixe': String,
 'adresse_nom_voie': String,
 'adresse_code_voie': String,
 'code_postal': Float64,
 'code_commune': Float64,
 'nom_commune': String,
 'code_departement': Float64,
 'ancien_code_commune': String,
 'ancien_nom_commune': String,
 'id_parcelle': String,
 'ancien_id_parcelle': String,
 'numero_volume': String,
 'lot1_numero': Float64,
 'lot1_surface_carrez': Float64,
 'lot2_numero': Float64,
 'lot2_surface_carrez': Float64,
 'lot3_numero': Float64,
 'lot3_surface_carrez': String,
 'lot4_numero': Float64,
 'lot4_surface_carrez': String,
 'lot5_numero': Float64,
 'lot5_surface_carrez': String,
 'nombre_lots': Float64,
 'code_type_local': Float64,
 'type_local': String,
 'surface_reelle_bati': Float64,
 'nombre_pieces_principales': Float64,
 'code_nature_culture': String,
 'nature_culture': String,
 '

In [37]:
path = 'data_dvf'
df_2022 = pl.read_csv('data_dvf/2022/75.csv.gz',ignore_errors=True)
dict_2022 = columns_name_and_type(df_2022)
df_copy_2022 =change_column_types(df_2022,dict_2023)
dict_copy_2022 = columns_name_and_type(df_copy_2022)

In [38]:
test = pl.from_dicts([dict_2023,dict_copy_2022])
test

id_mutation,date_mutation,numero_disposition,nature_mutation,valeur_fonciere,adresse_numero,adresse_suffixe,adresse_nom_voie,adresse_code_voie,code_postal,code_commune,nom_commune,code_departement,ancien_code_commune,ancien_nom_commune,id_parcelle,ancien_id_parcelle,numero_volume,lot1_numero,lot1_surface_carrez,lot2_numero,lot2_surface_carrez,lot3_numero,lot3_surface_carrez,lot4_numero,lot4_surface_carrez,lot5_numero,lot5_surface_carrez,nombre_lots,code_type_local,type_local,surface_reelle_bati,nombre_pieces_principales,code_nature_culture,nature_culture,code_nature_culture_speciale,nature_culture_speciale,surface_terrain,longitude,latitude
object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object
String,String,Float64,String,Float64,Float64,String,String,String,Float64,Float64,String,Float64,String,String,String,String,String,Float64,Float64,Float64,Float64,Float64,String,Float64,String,Float64,String,Float64,Float64,String,Float64,Float64,String,String,String,String,String,Float64,Float64
String,String,Float64,String,Float64,Float64,String,String,Float64,Float64,Float64,String,Float64,String,String,String,String,String,Float64,Float64,Float64,Float64,Float64,String,Float64,String,Float64,String,Float64,Float64,String,Float64,Float64,String,String,String,String,String,Float64,Float64


In [41]:
df_copy_2022 = df_copy_2022.with_columns(pl.col('adesse_code_voie').cast(pl.String))


ColumnNotFoundError: adesse_code_voie

Resolved plan until failure:

	---> FAILED HERE RESOLVING 'with_columns' <---
DF ["id_mutation", "date_mutation", "numero_disposition", "nature_mutation"]; PROJECT */40 COLUMNS; SELECTION: None