à partir de tous les datasets dispos (5 dernières années), on veut nettoyer et rassembler les données.

Voici la liste des datasets à récupérer :
* https://files.data.gouv.fr/geo-dvf/latest/csv/2017/full.csv.gz
* https://files.data.gouv.fr/geo-dvf/latest/csv/2018/full.csv.gz
* https://files.data.gouv.fr/geo-dvf/latest/csv/2019/full.csv.gz
* https://files.data.gouv.fr/geo-dvf/latest/csv/2020/full.csv.gz
* https://files.data.gouv.fr/geo-dvf/latest/csv/2021/full.csv.gz


## Téléchargement des datasets

In [1]:
from pathlib import Path
import requests
from tqdm import tqdm

BASE_URL = "https://files.data.gouv.fr/geo-dvf/latest/csv"
BASE_FILENAME = "full"
EXTENSION = ".csv.gz"

# url dataset example : 
# https://files.data.gouv.fr/geo-dvf/latest/csv/2017/full.csv.gz
def create_url(year: int) -> str:
    return f"{BASE_URL}/{year}/{BASE_FILENAME}{EXTENSION}"

YEARS = range(2017, 2022)
DATASET_FOLDER_PATH = Path("./dvf")
DATASET_FOLDER_PATH.mkdir(parents=True, exist_ok=True)

def download_file(url: str, destination: Path):
    response = requests.get(url)
    with destination.open("wb") as file:
        file.write(response.content)

In [2]:
for year in tqdm(YEARS, desc="Download csv files"):
    url = create_url(year)
    output_file_path = DATASET_FOLDER_PATH / f"{year}{EXTENSION}"
    download_file(url, output_file_path)
    
print()
print(f"CSV files : {[p.as_posix() for p in DATASET_FOLDER_PATH.rglob('*')]}")

Download csv files: 100%|██████████| 5/5 [00:21<00:00,  4.34s/it]


CSV files : ['dvf/2017.csv.gz', 'dvf/2021.csv.gz', 'dvf/2018.csv.gz', 'dvf/2019.csv.gz', 'dvf/2020.csv.gz']





## Chargement et fusion des datasets en un dataset unique

Le chargement des datasets en mémoire directement n'est pas possible (trop de RAM nécéssaire)

Il faut donc nettoyer fichier par fichier avant de fusionner les datasets

In [14]:
import pandas as pd
import numpy as np

def load_dataset(path: Path) -> pd.DataFrame:
    print(f"==== Load dataset from csv {path.as_posix()} ====")
    return pd.read_csv(path, low_memory=False)

def remove_type_locals(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop(df[(
        (df["type_local"] == "Dépendance")
        | (df["type_local"] == "Local industriel. commercial ou assimilé")
    )].index)

def remove_useless_columns(df: pd.DataFrame) -> pd.DataFrame:
    useless_columns = [
        "ancien_code_commune",
        "ancien_nom_commune",
        "ancien_id_parcelle",
        "numero_volume",
        "lot1_numero",
        "lot1_surface_carrez",
        "lot2_numero",
        "lot2_surface_carrez",
        "lot3_numero",
        "lot3_surface_carrez",
        "lot4_numero",
        "lot4_surface_carrez",
        "lot5_numero",
        "lot5_surface_carrez",
        "code_nature_culture",
        "nature_culture",
        "code_nature_culture_speciale",
        "nature_culture_speciale"
    ]
    print("Remove useless columns")
    return df.drop(useless_columns, axis="columns")

def remove_rows_without_required_data(df: pd.DataFrame) -> pd.DataFrame:
    necessary_columns = [
        "date_mutation",
        "nature_mutation",
        "valeur_fonciere",
        "type_local",
        "surface_reelle_bati",
        "nombre_pieces_principales"
    ]
    
    def are_necessary_columns_all_present(row: pd.Series) -> bool:
        return all(pd.notna(row[nc]) for nc in necessary_columns)
    
    print("Remove rows without required data")
    return df.drop(
        df[~df.apply(are_necessary_columns_all_present, axis=1)].index
    )

def remove_expensive_mutations(df: pd.DataFrame) -> pd.DataFrame:
    print("Remove expensive mutations")
    return df[df["valeur_fonciere"] <= 10_000_000]

def remove_duplicate_id_mutation(df: pd.DataFrame) -> pd.DataFrame:
    print("Remove duplicate mutation ids")
    columns = df.columns.tolist()
    gen = (group.iloc[-1] 
           for _, group in tqdm(df.groupby("id_mutation"), 
                                desc="Remove group duplicates")
    )
    return pd.DataFrame(gen, columns=columns)

def arrange_area_values(df: pd.DataFrame) -> pd.DataFrame:
    print("Arrange area values")
    df.loc[df["surface_terrain"].isna(), "surface_terrain"] = 0.0
    return df

def cast_columns(df: pd.DataFrame) -> pd.DataFrame:
    print("Cast columns")
    df["valeur_fonciere"] = df["valeur_fonciere"].astype(int)
    # Not all rows have a postal code, but they do have a city code (INSEE)
    df["code_postal"] = df["code_postal"].apply(lambda c: f"{int(c):05d}" if pd.notna(c) else "00000")
    df["code_type_local"] = df["code_type_local"].astype(int)
    df["nombre_pieces_principales"] = df["nombre_pieces_principales"].astype(int)
    return df

def save_dataframe(df: pd.DataFrame, source_path: Path, dest_folder: Path):
    basename = source_path.stem.split('.')[0]
    dest_file = dest_folder / f"{basename}_cleaned.csv"

    print(f"Save dataframe to {dest_file.as_posix()}")
    df.to_csv(dest_file)

In [15]:
# Composition time !
def clean_dataframe(df: pd.DataFrame, source_path: Path, dest_folder: Path) -> pd.DataFrame:
    # "pipe" method allows to chain functions on entire dataframe
    # (where "apply" expect functions used for each row / column)
    print(f"==== Clean dataframe from source {source_path.as_posix()} ====")
    df = (df.pipe(remove_type_locals)
        .pipe(remove_useless_columns)
        .pipe(remove_rows_without_required_data)
        .pipe(remove_expensive_mutations)
        .pipe(remove_duplicate_id_mutation)
        .pipe(arrange_area_values)
        .pipe(cast_columns)
    )
    if source_path is not None and dest_folder is not None:
        save_dataframe(df, source_path, dest_folder)
    return df

In [16]:
CLEANED_FOLDER_PATH = Path("./dvf_cleaned")
CLEANED_FOLDER_PATH.mkdir(parents=True, exist_ok=True)

datasets_with_path = ((load_dataset(p), p) for p in DATASET_FOLDER_PATH.rglob('*'))
cleaned_datasets = (clean_dataframe(df, p, CLEANED_FOLDER_PATH) 
                    for df, p in datasets_with_path
)

dataset = pd.concat(cleaned_datasets, axis=0, ignore_index=True)
dataset

==== Load dataset from csv dvf/2017.csv.gz ====
==== Clean dataframe from source dvf/2017.csv.gz ====
Remove useless columns
Remove rows without required data
Remove expensive mutations
Remove duplicate mutation ids


Remove group duplicates: 100%|██████████| 968324/968324 [03:02<00:00, 5316.58it/s]


Arrange area values
Cast columns
Save dataframe to dvf_cleaned/2017_cleaned.csv
==== Load dataset from csv dvf/2021.csv.gz ====
==== Clean dataframe from source dvf/2021.csv.gz ====
Remove useless columns
Remove rows without required data
Remove expensive mutations
Remove duplicate mutation ids


Remove group duplicates: 100%|██████████| 861179/861179 [02:36<00:00, 5516.25it/s]


Arrange area values
Cast columns
Save dataframe to dvf_cleaned/2021_cleaned.csv
==== Load dataset from csv dvf/2018.csv.gz ====
==== Clean dataframe from source dvf/2018.csv.gz ====
Remove useless columns
Remove rows without required data
Remove expensive mutations
Remove duplicate mutation ids


Remove group duplicates: 100%|██████████| 964587/964587 [02:59<00:00, 5361.14it/s]


Arrange area values
Cast columns
Save dataframe to dvf_cleaned/2018_cleaned.csv
==== Load dataset from csv dvf/2019.csv.gz ====
==== Clean dataframe from source dvf/2019.csv.gz ====
Remove useless columns
Remove rows without required data
Remove expensive mutations
Remove duplicate mutation ids


Remove group duplicates: 100%|██████████| 1044237/1044237 [03:12<00:00, 5430.38it/s]


Arrange area values
Cast columns
Save dataframe to dvf_cleaned/2019_cleaned.csv
==== Load dataset from csv dvf/2020.csv.gz ====
==== Clean dataframe from source dvf/2020.csv.gz ====
Remove useless columns
Remove rows without required data
Remove expensive mutations
Remove duplicate mutation ids


Remove group duplicates: 100%|██████████| 988507/988507 [03:08<00:00, 5250.53it/s]


Arrange area values
Cast columns
Save dataframe to dvf_cleaned/2020_cleaned.csv


Unnamed: 0,id_mutation,date_mutation,numero_disposition,nature_mutation,valeur_fonciere,adresse_numero,adresse_suffixe,adresse_nom_voie,adresse_code_voie,code_postal,...,code_departement,id_parcelle,nombre_lots,code_type_local,type_local,surface_reelle_bati,nombre_pieces_principales,surface_terrain,longitude,latitude
0,2017-1,2017-01-02,1,Vente,27000,83.0,,RUE CHARLES ROBIN,0820,01000,...,01,01053000BK0039,2,2,Appartement,37.0,2,0.0,5.234440,46.206155
1,2017-10,2017-01-04,1,Vente,177000,26.0,,RUE MAL FOCH,2520,01000,...,01,01053000AD0198,0,2,Appartement,38.0,1,70.0,5.228088,46.205262
2,2017-1000,2017-04-20,1,Vente,268000,5619.0,,BELLEGARDE,B006,01160,...,01,013140000D2020,0,1,Maison,97.0,4,800.0,5.294106,46.014361
3,2017-10000,2017-12-20,1,Vente,196000,5131.0,,NECUIDAZ,B067,01510,...,01,013720000C0396,0,1,Maison,174.0,6,700.0,5.689796,45.840059
4,2017-100000,2017-02-20,1,Vente,196000,15.0,,RES L'AUZINA,A004,11290,...,11,110050000C0940,0,1,Maison,138.0,4,736.0,2.249574,43.183085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4826829,2020-999995,2020-11-24,1,Vente,180000,8.0,,IMP DES PETITS SOULIERS,1090,71450,...,71,71040000AP0154,3,2,Appartement,75.0,3,0.0,4.392410,46.695394
4826830,2020-999996,2020-11-30,1,Vente,34500,5070.0,,LE BOURG,B030,71540,...,71,712660000B0168,0,1,Maison,80.0,3,113.0,4.246966,47.082180
4826831,2020-999997,2020-11-30,1,Vente,41000,18.0,,RUE PIERRE MENDES FRANCE,3717,71200,...,71,71153000AD0412,2,2,Appartement,77.0,4,0.0,4.419442,46.804846
4826832,2020-999998,2020-11-30,1,Vente,58500,2.0,,AV CHARLES DE GAULLE,0990,71200,...,71,71153000AI0100,2,2,Appartement,66.0,3,0.0,4.432231,46.808550


In [17]:
MERGED_DATASET_PATH = Path("./dvf_merged.csv.gz")
dataset.to_csv(MERGED_DATASET_PATH, compression="gzip")

In [20]:
!cd dvf_cleaned
!gzip dvf_cleaned/2017_cleaned.csv \
    dvf_cleaned/2018_cleaned.csv \
    dvf_cleaned/2019_cleaned.csv \
    dvf_cleaned/2020_cleaned.csv \
    dvf_cleaned/2021_cleaned.csv -v

gzip: dvf_cleaned/2017_cleaned.csv: No such file or directory
gzip: dvf_cleaned/2018_cleaned.csv: No such file or directory
gzip: dvf_cleaned/2019_cleaned.csv: No such file or directory
gzip: dvf_cleaned/2020_cleaned.csv: No such file or directory
dvf_cleaned/2021_cleaned.csv:	 71.7% -- replaced with dvf_cleaned/2021_cleaned.csv.gz


In [21]:
# Download all files
!zip -r dvf_cleaned.zip dvf_cleaned dvf_merged.csv.gz
from google.colab import files
files.download("dvf_cleaned.zip")

  adding: dvf_cleaned/ (stored 0%)
  adding: dvf_cleaned/2021_cleaned.csv.gz (deflated 0%)
  adding: dvf_cleaned/2018_cleaned.csv.gz (deflated 0%)
  adding: dvf_cleaned/2020_cleaned.csv.gz (deflated 0%)
  adding: dvf_cleaned/2019_cleaned.csv.gz (deflated 0%)
  adding: dvf_cleaned/2017_cleaned.csv.gz (deflated 0%)
  adding: dvf_merged.csv.gz (deflated 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>