In [1]:
import pandas as pd
import numpy as np
from multiprocessing.pool import ThreadPool
from threading import Thread

In [2]:
# Downloading the data
data = {}

URLS = [("2018", "https://www.data.gouv.fr/fr/datasets/r/1be77ca5-dc1b-4e50-af2b-0240147e0346"),
        ("2019", "https://www.data.gouv.fr/fr/datasets/r/3004168d-bec4-44d9-a781-ef16f41856a2"),
        ("2020", "https://www.data.gouv.fr/fr/datasets/r/90a98de0-f562-4328-aa16-fe0dd1dca60f"),
        ("2021", "https://www.data.gouv.fr/fr/datasets/r/817204ac-2202-4b4a-98e7-4184d154d98c"),
        ("Q12022", "https://www.data.gouv.fr/fr/datasets/r/87038926-fb31-4959-b2ae-7a24321c599a")]

In [3]:
# Async download data for a quicker result.
def download_data(year: str, url: str):
    return year, pd.read_csv(url, sep='|', decimal=',')
    
with ThreadPool(len(URLS)) as pool:
    results = pool.starmap_async(download_data, URLS)
    for result in results.get():
        year = result[0]
        dataframe = result[1]
        data[year] = dataframe

  return year, pd.read_csv(url, sep='|', decimal=',')
  return year, pd.read_csv(url, sep='|', decimal=',')
  return year, pd.read_csv(url, sep='|', decimal=',')
  return year, pd.read_csv(url, sep='|', decimal=',')
  return year, pd.read_csv(url, sep='|', decimal=',')


In [4]:
# Check that all dataframes have the same structure.
columns = [set(df.columns) for df in data.values()]

from itertools import groupby

def all_equal(iterable):
    g = groupby(iterable)
    return next(g, True) and not next(g, False)

all_equal(columns)

True

In [5]:
# Create one dataframe:
dataframes = list(data.values())
df = dataframes[0]
for next_dataframe in dataframes[1:]:
    df = pd.concat([df, next_dataframe])
display(df)

Unnamed: 0,Identifiant de document,Reference document,1 Articles CGI,2 Articles CGI,3 Articles CGI,4 Articles CGI,5 Articles CGI,No disposition,Date mutation,Nature mutation,...,Surface Carrez du 5eme lot,Nombre de lots,Code type local,Type local,Identifiant local,Surface reelle bati,Nombre pieces principales,Nature culture,Nature culture speciale,Surface terrain
0,,,,,,,,1,03/01/2018,Vente,...,,2,2.0,Appartement,,73.0,4.0,,,
1,,,,,,,,1,03/01/2018,Vente,...,,1,3.0,Dépendance,,0.0,0.0,,,
2,,,,,,,,1,04/01/2018,Vente,...,,0,1.0,Maison,,163.0,4.0,S,,949.0
3,,,,,,,,1,04/01/2018,Vente,...,,0,1.0,Maison,,163.0,4.0,AG,JARD,420.0
4,,,,,,,,1,04/01/2018,Vente,...,,0,1.0,Maison,,51.0,2.0,AG,JARD,420.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3803880,,,,,,,,1,30/12/2022,Vente,...,,1,4.0,Local industriel. commercial ou assimilé,,327.0,0.0,,,
3803881,,,,,,,,1,19/12/2022,Vente,...,,2,3.0,Dépendance,,0.0,0.0,,,
3803882,,,,,,,,1,19/12/2022,Vente,...,,2,2.0,Appartement,,40.0,3.0,,,
3803883,,,,,,,,1,30/12/2022,Vente,...,,1,3.0,Dépendance,,0.0,0.0,,,


In [6]:
# Check for empty columns or columns with lots of Nans. Don't drop Data above the lot and lot size because we will reformat them later.
def drop_empty_cols(df):
    len_df = len(df)
    for col in df:
        if df[col].isna().sum() / len_df > 0.5:
            print(f"{col} have too much missing data. Dropping it.")
            df = df.drop(col, axis=1)
    return df

df = drop_empty_cols(df)
display(df.isna().sum() / len(df))

Identifiant de document have too much missing data. Dropping it.
Reference document have too much missing data. Dropping it.
1 Articles CGI have too much missing data. Dropping it.
2 Articles CGI have too much missing data. Dropping it.
3 Articles CGI have too much missing data. Dropping it.
4 Articles CGI have too much missing data. Dropping it.
5 Articles CGI have too much missing data. Dropping it.
B/T/Q have too much missing data. Dropping it.
Prefixe de section have too much missing data. Dropping it.
No Volume have too much missing data. Dropping it.
1er lot have too much missing data. Dropping it.
Surface Carrez du 1er lot have too much missing data. Dropping it.
2eme lot have too much missing data. Dropping it.
Surface Carrez du 2eme lot have too much missing data. Dropping it.
3eme lot have too much missing data. Dropping it.
Surface Carrez du 3eme lot have too much missing data. Dropping it.
4eme lot have too much missing data. Dropping it.
Surface Carrez du 4eme lot have too

No disposition               0.000000
Date mutation                0.000000
Nature mutation              0.000000
Valeur fonciere              0.010080
No voie                      0.384567
Type de voie                 0.403898
Code voie                    0.008857
Voie                         0.008877
Code postal                  0.008897
Commune                      0.000000
Code departement             0.000000
Code commune                 0.000000
Section                      0.000034
No plan                      0.000000
Nombre de lots               0.000000
Code type local              0.425891
Type local                   0.425891
Surface reelle bati          0.426556
Nombre pieces principales    0.426556
Nature culture               0.320443
Surface terrain              0.320443
dtype: float64

In [7]:
df = df.drop("No voie", axis=1).dropna()

In [8]:
df["Code departement"] = df["Code departement"].astype(str)

# Map the column Code Voie using Code Rivoli
def parse_rivoli(value: str | float):
    value = str(value)
    first_char = value[0].lower()
    if first_char.isnumeric():
        return "Voie"
    elif first_char == "a":
        return "Ensemble immobilier"
    elif "b" <= first_char <= "w":
        return "Lieux dit"
    elif first_char == "x":
        return "Pseudo voie"
    elif "y" <= first_char <= "z":
        return "Voie provisoire"
    else:
        raise Exception(f"Unkwnown code: {first_char}")
    
df["Code voie"] = df["Code voie"].map(parse_rivoli)

df.to_csv("cleaned_data.csv", index=False)
del df