## P3 - Entraînez-vous avec SQL et créez votre BDD

### Importation des librairies

In [1033]:
import polars as pl
import re
from unidecode import unidecode

### Fonctions

In [1034]:
def clean_column_names(df: pl.DataFrame) -> pl.DataFrame:
    '''
    Define a function to apply regex and transformations on column names
    '''
    def clean_name(col: str) -> str:
        # Replace spaces with underscores, manage uppercase transitions, and handle non-ASCII chars
        col = re.sub(r'(?<!^)(?=[A-Z])|(?<=_)(?=[A-Z])| |-', lambda x: "_" if x.group(0) == " " else "", unidecode(col.lower()))
        # Replace any double underscores created by prior replacements
        col = col.replace("'", "_").replace("__", "_")
        return col
    
    # Apply renaming to each column
    return df.rename({col: clean_name(col) for col in df.columns})

### Variables

In [1035]:
FICHIER_COMMUNE = './data/donnees_communes.xlsx'

FICHIER_GEO = './data/fr-esr-referentiel-geographique.xlsx'

FICHIER_IMMO = './data/Valeurs-foncières.xlsx'

### Importations des fichiers

In [1036]:
df_com = pl.read_excel(FICHIER_COMMUNE)

df_geo = pl.read_excel(FICHIER_GEO)

df_immo = pl.read_excel(FICHIER_IMMO)

## Preparation des fichiers csv

In [1037]:
# Standardisation du nom des colonnes:
df_com = clean_column_names(df_com)

df_geo = clean_column_names(df_geo)

df_immo = clean_column_names(df_immo)

In [1038]:
df_com.head

<bound method DataFrame.head of shape: (34_991, 9)
┌────────┬────────┬────────┬────────┬───┬─────────────────────────┬───────┬──────┬───────┐
│ codreg ┆ coddep ┆ codarr ┆ codcan ┆ … ┆ com                     ┆ pmun  ┆ pcap ┆ ptot  │
│ ---    ┆ ---    ┆ ---    ┆ ---    ┆   ┆ ---                     ┆ ---   ┆ ---  ┆ ---   │
│ i64    ┆ str    ┆ str    ┆ str    ┆   ┆ str                     ┆ i64   ┆ i64  ┆ i64   │
╞════════╪════════╪════════╪════════╪═══╪═════════════════════════╪═══════╪══════╪═══════╡
│ 84     ┆ 01     ┆ 02     ┆ 08     ┆ … ┆ L'Abergement-Clémenciat ┆ 779   ┆ 19   ┆ 798   │
│ 84     ┆ 01     ┆ 01     ┆ 01     ┆ … ┆ L'Abergement-de-Varey   ┆ 256   ┆ 1    ┆ 257   │
│ 84     ┆ 01     ┆ 01     ┆ 01     ┆ … ┆ Ambérieu-en-Bugey       ┆ 14134 ┆ 380  ┆ 14514 │
│ 84     ┆ 01     ┆ 02     ┆ 22     ┆ … ┆ Ambérieux-en-Dombes     ┆ 1751  ┆ 25   ┆ 1776  │
│ 84     ┆ 01     ┆ 01     ┆ 04     ┆ … ┆ Ambléon                 ┆ 112   ┆ 6    ┆ 118   │
│ …      ┆ …      ┆ …      ┆ …      ┆ …

In [1039]:
df_geo.head

<bound method DataFrame.head of shape: (38_916, 37)
┌────────────┬────────────┬────────────┬─────────┬───┬────────┬────────────┬──────────┬────────────┐
│ regrgp_nom ┆ reg_nom    ┆ reg_nom_ol ┆ aca_nom ┆ … ┆ auc_id ┆ auc_nom    ┆ uu_id_10 ┆ geolocalis │
│ ---        ┆ ---        ┆ d          ┆ ---     ┆   ┆ ---    ┆ ---        ┆ ---      ┆ ation      │
│ str        ┆ str        ┆ ---        ┆ str     ┆   ┆ str    ┆ str        ┆ str      ┆ ---        │
│            ┆            ┆ str        ┆         ┆   ┆        ┆            ┆          ┆ str        │
╞════════════╪════════════╪════════════╪═════════╪═══╪════════╪════════════╪══════════╪════════════╡
│ Province   ┆ Auvergne-R ┆ Rhône-Alpe ┆ Lyon    ┆ … ┆ C01001 ┆ L'Abergeme ┆ SO       ┆ 46.1534255 │
│            ┆ hône-Alpes ┆ s          ┆         ┆   ┆        ┆ nt-Clémenc ┆          ┆ 214,4.9261 │
│            ┆            ┆            ┆         ┆   ┆        ┆ iat        ┆          ┆ 1354223    │
│ Province   ┆ Auvergne-R ┆ Rhône-Alpe 

In [1040]:
df_immo.head

<bound method DataFrame.head of shape: (34_169, 46)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ code_serv ┆ reference ┆ 1_article ┆ 2_article ┆ … ┆ nature_cu ┆ nature_cu ┆ surface_t ┆ nom_de_l │
│ ice_ch    ┆ _document ┆ s_cgi     ┆ s_cgi     ┆   ┆ lture     ┆ lture_spe ┆ errain    ┆ _acquere │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ciale     ┆ ---       ┆ ur       │
│ null      ┆ null      ┆ null      ┆ null      ┆   ┆ str       ┆ ---       ┆ i64       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆ null      ┆           ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ null      ┆ null      ┆ null      ┆ null      ┆ … ┆ null      ┆ null      ┆ null      ┆ GUIRAO   │
│ null      ┆ null      ┆ null      ┆ null      ┆ … ┆ null      ┆ null      ┆ null      ┆ HARNOIS  │
│ null      ┆ null      ┆ null      ┆ n

In [1041]:
df_immo.null_count()

code_service_ch,reference_document,1_articles_cgi,2_articles_cgi,3_articles_cgi,4_articles_cgi,5_articles_cgi,no_disposition,date_mutation,nature_mutation,valeur_fonciere,no_voie,b/t/q,code_type_de_voie,type_de_voie,code_voie,voie,code_id_commune,code_postal,commune,code_departement,code_commune,prefixe_de_section,section,no_plan,no_volume,1er_lot,surface_carrez_du_1er_lot,2eme_lot,surface_carrez_du_2eme_lot,3eme_lot,surface_carrez_du_3eme_lot,4eme_lot,surface_carrez_du_4eme_lot,5eme_lot,surface_carrez_du_5eme_lot,nombre_de_lots,code_type_local,type_local,identifiant_local,surface_reelle_bati,nombre_pieces_principales,nature_culture,nature_culture_speciale,surface_terrain,nom_de_l_acquereur
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
34169,34169,34169,34169,34169,34169,34169,0,0,0,18,133,31995,0,940,0,0,0,0,0,0,0,33026,1,0,34169,5,0,34169,34169,34169,34169,34169,34169,34169,34169,0,0,0,34169,0,0,33916,34169,33916,0


In [1042]:
df_immo = df_immo.rename({"b/t/q":"btq"})

In [1043]:
df_immo = df_immo.with_columns(
    (pl.concat_str([pl.col("code_departement"), pl.col("code_commune")])
     .alias('id_code_depcommune'))
)

In [1044]:
df_immo = df_immo.with_columns(
    pl.col('surface_terrain').fill_null(0).alias('surface_terrain'),
    pl.col('1er_lot').fill_null('').alias('1er_lot')
)

In [1045]:
df_immo = df_immo.with_columns(
    (pl.col('surface_reelle_bati') + pl.col('surface_terrain').fill_null(0))
    .alias('surface_totale')
)

In [1046]:
df_immo = df_immo.with_columns(
    (pl.concat_str([
        pl.col('no_voie').fill_null(''),  
        pl.col('btq').fill_null(''),
        pl.col('type_de_voie').fill_null(''),
        pl.col('voie').fill_null(''),
        pl.col('nombre_pieces_principales').fill_null('').cast(pl.Utf8),
        pl.col('type_local').fill_null(''),
        pl.col('1er_lot'),
        pl.col('surface_carrez_du_1er_lot').fill_null('').cast(pl.Utf8),
        pl.col('surface_totale').fill_null('').cast(pl.Utf8),  
        pl.col('id_code_depcommune').fill_null('')
    ])
    .hash()
    .alias('id_bien'))
)

In [1047]:
print(df_immo.filter(pl.col('id_bien').is_duplicated()).shape)
print(df_immo.filter(df_immo.select(    
    'no_voie',
    'btq',
    'type_de_voie',
    'voie',
    '1er_lot',
    'type_local',
    'nombre_pieces_principales',
    'surface_carrez_du_1er_lot',
    'surface_totale',
    'id_code_depcommune',
    'id_bien').is_duplicated()).shape)

(0, 49)
(0, 49)


In [1048]:
df_immo = df_immo.with_columns(
    (pl.concat_str([
        pl.col('id_bien').cast(pl.Utf8),
        pl.col('date_mutation').cast(pl.Utf8)
    ])
     .hash()
     .alias('id_vente'))
)

In [1049]:
df_immo.filter(pl.col('id_vente').is_duplicated())

code_service_ch,reference_document,1_articles_cgi,2_articles_cgi,3_articles_cgi,4_articles_cgi,5_articles_cgi,no_disposition,date_mutation,nature_mutation,valeur_fonciere,no_voie,btq,code_type_de_voie,type_de_voie,code_voie,voie,code_id_commune,code_postal,commune,code_departement,code_commune,prefixe_de_section,section,no_plan,no_volume,1er_lot,surface_carrez_du_1er_lot,2eme_lot,surface_carrez_du_2eme_lot,3eme_lot,surface_carrez_du_3eme_lot,4eme_lot,surface_carrez_du_4eme_lot,5eme_lot,surface_carrez_du_5eme_lot,nombre_de_lots,code_type_local,type_local,identifiant_local,surface_reelle_bati,nombre_pieces_principales,nature_culture,nature_culture_speciale,surface_terrain,nom_de_l_acquereur,id_code_depcommune,surface_totale,id_bien,id_vente
null,null,null,null,null,null,null,i64,date,str,f64,i64,str,i64,str,str,str,i64,f64,str,str,str,i64,str,i64,null,str,f64,null,null,null,null,null,null,null,null,i64,i64,str,null,i64,i64,str,null,i64,str,str,i64,u64,u64


In [1050]:
df_immo

code_service_ch,reference_document,1_articles_cgi,2_articles_cgi,3_articles_cgi,4_articles_cgi,5_articles_cgi,no_disposition,date_mutation,nature_mutation,valeur_fonciere,no_voie,btq,code_type_de_voie,type_de_voie,code_voie,voie,code_id_commune,code_postal,commune,code_departement,code_commune,prefixe_de_section,section,no_plan,no_volume,1er_lot,surface_carrez_du_1er_lot,2eme_lot,surface_carrez_du_2eme_lot,3eme_lot,surface_carrez_du_3eme_lot,4eme_lot,surface_carrez_du_4eme_lot,5eme_lot,surface_carrez_du_5eme_lot,nombre_de_lots,code_type_local,type_local,identifiant_local,surface_reelle_bati,nombre_pieces_principales,nature_culture,nature_culture_speciale,surface_terrain,nom_de_l_acquereur,id_code_depcommune,surface_totale,id_bien,id_vente
null,null,null,null,null,null,null,i64,date,str,f64,i64,str,i64,str,str,str,i64,f64,str,str,str,i64,str,i64,null,str,f64,null,null,null,null,null,null,null,null,i64,i64,str,null,i64,i64,str,null,i64,str,str,i64,u64,u64
,,,,,,,1,2020-01-02,"""Vente""",165000.0,347,,0,"""RUE""","""20""","""DU CHATEAU""",1,1170.0,"""CHEVRY""","""01""","""103""",,"""A""",302,,"""12""",48.22,,,,,,,,,2,2,"""Appartement""",,48,3,,,0,"""GUIRAO""","""01103""",48,15137228140333276830,9259935209207193945
,,,,,,,1,2020-01-02,"""Vente""",355680.0,4,,15,"""BD""","""1000""","""EDOUARD BAUDOIN""",205,6160.0,"""ANTIBES""","""06""","""004""",,"""CP""",186,,"""132""",39.11,,,,,,,,,2,2,"""Appartement""",,40,1,,,0,"""HARNOIS""","""06004""",40,17560310484389916846,13813125794198133400
,,,,,,,1,2020-01-02,"""Vente""",229500.0,20,"""B""",0,"""RUE""","""3975""","""MARCEAU""",142,6000.0,"""NICE""","""06""","""088""",,"""LS""",169,,"""99""",80.25,,,,,,,,,1,2,"""Appartement""",,82,3,,,0,"""ROGIER""","""06088""",82,9794828718374656783,15600841261221439612
,,,,,,,1,2020-01-02,"""Vente""",125000.0,550,,3,"""RTE""","""1011""","""DES VESPINS RN7""",228,6700.0,"""SAINT LAURENT DU VAR""","""06""","""123""",,"""AO""",348,,"""242""",27.51,,,,,,,,,1,2,"""Appartement""",,27,1,,,0,"""BOCQUIER""","""06123""",27,5313800244361690172,16145551626827266416
,,,,,,,1,2020-01-02,"""Vente""",90000.0,9300,,18,"""RES""","""A084""","""LES ARPEGES BD DES ABA""",326,13400.0,"""AUBAGNE""","""13""","""005""",,"""AW""",224,,"""218""",47.33,,,,,,,,,1,2,"""Appartement""",,47,2,,,0,"""GUILLOSSOU""","""13005""",47,16785843096445396717,5698024270704249500
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
,,,,,,,1,2020-06-30,"""Vente""",115000.0,5,,12,"""PL""","""1375""","""JEAN CHARCOT""",3111,95200.0,"""SARCELLES""","""95""","""585""",,"""BD""",227,,"""48""",59.5,,,,,,,,,1,2,"""Appartement""",,63,3,,,0,"""SAADOUN""","""95585""",63,1393333401999000253,6466178643350014766
,,,,,,,1,2020-06-30,"""Vente""",129000.0,59,,1,"""AV""","""1140""","""DE PARIS""",3149,95230.0,"""SOISY SS MONTMORENCY""","""95""","""598""",,"""AI""",132,,"""121""",55.29,,,,,,,,,2,2,"""Appartement""",,66,3,,,0,"""GUECHI""","""95598""",66,6364534923693533129,16513099725295730246
,,,,,,,1,2020-06-30,"""Vente""",212000.0,4,,0,"""RUE""","""861""","""RENE ECHAVIDRE""",3139,95150.0,"""TAVERNY""","""95""","""607""",,"""BX""",673,,"""4""",62.05,,,,,,,,,1,2,"""Appartement""",,63,3,,,0,"""PEDRON""","""95607""",63,10357380744557702374,9874549485008201950
,,,,,,,1,2020-06-30,"""Vente""",220000.0,5919,"""F""",18,"""RES""","""A133""","""L EMERAUDE MORNE PAVILLON""",3168,97232.0,"""LE LAMENTIN""","""972""","""213""",,"""K""",919,,"""138""",77.88,,,,,,,,,1,2,"""Appartement""",,77,4,,,0,"""HOUARD""","""972213""",77,17027493186363633437,167432590582771712


### Préparation du fichier 'bien.csv'

In [1051]:
df_bien = df_immo.select(
    'no_voie',
    'btq',
    'type_de_voie',
    'voie',
    'type_local',
    'nombre_pieces_principales',
    'surface_carrez_du_1er_lot',
    'surface_totale',
    'id_code_depcommune',
    'id_bien'
    )

In [1052]:
df_bien.filter(df_bien.is_duplicated())

no_voie,btq,type_de_voie,voie,type_local,nombre_pieces_principales,surface_carrez_du_1er_lot,surface_totale,id_code_depcommune,id_bien
i64,str,str,str,str,i64,f64,i64,str,u64


In [1053]:
df_bien

no_voie,btq,type_de_voie,voie,type_local,nombre_pieces_principales,surface_carrez_du_1er_lot,surface_totale,id_code_depcommune,id_bien
i64,str,str,str,str,i64,f64,i64,str,u64
347,,"""RUE""","""DU CHATEAU""","""Appartement""",3,48.22,48,"""01103""",15137228140333276830
4,,"""BD""","""EDOUARD BAUDOIN""","""Appartement""",1,39.11,40,"""06004""",17560310484389916846
20,"""B""","""RUE""","""MARCEAU""","""Appartement""",3,80.25,82,"""06088""",9794828718374656783
550,,"""RTE""","""DES VESPINS RN7""","""Appartement""",1,27.51,27,"""06123""",5313800244361690172
9300,,"""RES""","""LES ARPEGES BD DES ABA""","""Appartement""",2,47.33,47,"""13005""",16785843096445396717
…,…,…,…,…,…,…,…,…,…
5,,"""PL""","""JEAN CHARCOT""","""Appartement""",3,59.5,63,"""95585""",1393333401999000253
59,,"""AV""","""DE PARIS""","""Appartement""",3,55.29,66,"""95598""",6364534923693533129
4,,"""RUE""","""RENE ECHAVIDRE""","""Appartement""",3,62.05,63,"""95607""",10357380744557702374
5919,"""F""","""RES""","""L EMERAUDE MORNE PAVILLON""","""Appartement""",4,77.88,77,"""972213""",17027493186363633437


In [1054]:
df_bien = df_bien.rename({
    "type_de_voie":"type_voie",
    "voie":"nom_voie",
    "nombre_pieces_principales":"nb_pieces",
    "surface_carrez_du_1er_lot":"surface_carrez",
})

In [1055]:
df_bien = df_bien.select(
    "id_bien",
    "no_voie",
    "btq",
    "type_voie",
    "nom_voie",
    "type_local",
    "nb_pieces",
    "surface_carrez",
    "surface_totale",
    "id_code_depcommune"
)

In [1056]:
df_bien.filter(df_bien.is_duplicated())

id_bien,no_voie,btq,type_voie,nom_voie,type_local,nb_pieces,surface_carrez,surface_totale,id_code_depcommune
u64,i64,str,str,str,str,i64,f64,i64,str


In [1057]:
df_bien = df_bien.unique()

In [1058]:
df_bien.write_csv("./data_clean/bien.csv")

### Préparation du fichier 'vente.csv'

In [1059]:
df_immo.head()

code_service_ch,reference_document,1_articles_cgi,2_articles_cgi,3_articles_cgi,4_articles_cgi,5_articles_cgi,no_disposition,date_mutation,nature_mutation,valeur_fonciere,no_voie,btq,code_type_de_voie,type_de_voie,code_voie,voie,code_id_commune,code_postal,commune,code_departement,code_commune,prefixe_de_section,section,no_plan,no_volume,1er_lot,surface_carrez_du_1er_lot,2eme_lot,surface_carrez_du_2eme_lot,3eme_lot,surface_carrez_du_3eme_lot,4eme_lot,surface_carrez_du_4eme_lot,5eme_lot,surface_carrez_du_5eme_lot,nombre_de_lots,code_type_local,type_local,identifiant_local,surface_reelle_bati,nombre_pieces_principales,nature_culture,nature_culture_speciale,surface_terrain,nom_de_l_acquereur,id_code_depcommune,surface_totale,id_bien,id_vente
null,null,null,null,null,null,null,i64,date,str,f64,i64,str,i64,str,str,str,i64,f64,str,str,str,i64,str,i64,null,str,f64,null,null,null,null,null,null,null,null,i64,i64,str,null,i64,i64,str,null,i64,str,str,i64,u64,u64
,,,,,,,1,2020-01-02,"""Vente""",165000.0,347,,0,"""RUE""","""20""","""DU CHATEAU""",1,1170.0,"""CHEVRY""","""01""","""103""",,"""A""",302,,"""12""",48.22,,,,,,,,,2,2,"""Appartement""",,48,3,,,0,"""GUIRAO""","""01103""",48,15137228140333276830,9259935209207193945
,,,,,,,1,2020-01-02,"""Vente""",355680.0,4,,15,"""BD""","""1000""","""EDOUARD BAUDOIN""",205,6160.0,"""ANTIBES""","""06""","""004""",,"""CP""",186,,"""132""",39.11,,,,,,,,,2,2,"""Appartement""",,40,1,,,0,"""HARNOIS""","""06004""",40,17560310484389916846,13813125794198133400
,,,,,,,1,2020-01-02,"""Vente""",229500.0,20,"""B""",0,"""RUE""","""3975""","""MARCEAU""",142,6000.0,"""NICE""","""06""","""088""",,"""LS""",169,,"""99""",80.25,,,,,,,,,1,2,"""Appartement""",,82,3,,,0,"""ROGIER""","""06088""",82,9794828718374656783,15600841261221439612
,,,,,,,1,2020-01-02,"""Vente""",125000.0,550,,3,"""RTE""","""1011""","""DES VESPINS RN7""",228,6700.0,"""SAINT LAURENT DU VAR""","""06""","""123""",,"""AO""",348,,"""242""",27.51,,,,,,,,,1,2,"""Appartement""",,27,1,,,0,"""BOCQUIER""","""06123""",27,5313800244361690172,16145551626827266416
,,,,,,,1,2020-01-02,"""Vente""",90000.0,9300,,18,"""RES""","""A084""","""LES ARPEGES BD DES ABA""",326,13400.0,"""AUBAGNE""","""13""","""005""",,"""AW""",224,,"""218""",47.33,,,,,,,,,1,2,"""Appartement""",,47,2,,,0,"""GUILLOSSOU""","""13005""",47,16785843096445396717,5698024270704249500


In [1060]:
df_vente = df_immo.select(
    'id_vente',
    'nom_de_l_acquereur',
    'id_bien',
    'date_mutation',
    'valeur_fonciere'
)

In [1061]:
df_vente

id_vente,nom_de_l_acquereur,id_bien,date_mutation,valeur_fonciere
u64,str,u64,date,f64
9259935209207193945,"""GUIRAO""",15137228140333276830,2020-01-02,165000.0
13813125794198133400,"""HARNOIS""",17560310484389916846,2020-01-02,355680.0
15600841261221439612,"""ROGIER""",9794828718374656783,2020-01-02,229500.0
16145551626827266416,"""BOCQUIER""",5313800244361690172,2020-01-02,125000.0
5698024270704249500,"""GUILLOSSOU""",16785843096445396717,2020-01-02,90000.0
…,…,…,…,…
6466178643350014766,"""SAADOUN""",1393333401999000253,2020-06-30,115000.0
16513099725295730246,"""GUECHI""",6364534923693533129,2020-06-30,129000.0
9874549485008201950,"""PEDRON""",10357380744557702374,2020-06-30,212000.0
167432590582771712,"""HOUARD""",17027493186363633437,2020-06-30,220000.0


In [1062]:
df_vente = df_vente.rename({
    "nom_de_l_acquereur":"nom_acquereur",
    "date_mutation":"date_vente"
    })

In [1063]:
df_vente = df_vente.select(
    "id_vente",
    "id_bien",
    "date_vente",
    "valeur_fonciere",
    "nom_acquereur"
)

In [1064]:
df_vente.write_csv('./data_clean/vente.csv')

### Préparation du fichier 'commune.csv'

In [1065]:
df_com

codreg,coddep,codarr,codcan,codcom,com,pmun,pcap,ptot
i64,str,str,str,str,str,i64,i64,i64
84,"""01""","""02""","""08""","""001""","""L'Abergement-Clémenciat""",779,19,798
84,"""01""","""01""","""01""","""002""","""L'Abergement-de-Varey""",256,1,257
84,"""01""","""01""","""01""","""004""","""Ambérieu-en-Bugey""",14134,380,14514
84,"""01""","""02""","""22""","""005""","""Ambérieux-en-Dombes""",1751,25,1776
84,"""01""","""01""","""04""","""006""","""Ambléon""",112,6,118
…,…,…,…,…,…,…,…,…
4,"""974""","""01""","""04""","""420""","""Sainte-Suzanne""",24065,227,24292
4,"""974""","""03""","""06""","""421""","""Salazie""",7136,73,7209
4,"""974""","""02""","""99""","""422""","""Le Tampon""",79824,1009,80833
4,"""974""","""04""","""14""","""423""","""Les Trois-Bassins""",7015,91,7106


In [1066]:
df_com.null_count()

codreg,coddep,codarr,codcan,codcom,com,pmun,pcap,ptot
u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,1,1,0,0,0,0,0


In [1067]:
df_com = df_com.with_columns(
    pl.concat_str([
        pl.col("coddep"),
        pl.col("codcom")
    ])
    .alias('id_code_depcommune')
)

In [1068]:
df_com

codreg,coddep,codarr,codcan,codcom,com,pmun,pcap,ptot,id_code_depcommune
i64,str,str,str,str,str,i64,i64,i64,str
84,"""01""","""02""","""08""","""001""","""L'Abergement-Clémenciat""",779,19,798,"""01001"""
84,"""01""","""01""","""01""","""002""","""L'Abergement-de-Varey""",256,1,257,"""01002"""
84,"""01""","""01""","""01""","""004""","""Ambérieu-en-Bugey""",14134,380,14514,"""01004"""
84,"""01""","""02""","""22""","""005""","""Ambérieux-en-Dombes""",1751,25,1776,"""01005"""
84,"""01""","""01""","""04""","""006""","""Ambléon""",112,6,118,"""01006"""
…,…,…,…,…,…,…,…,…,…
4,"""974""","""01""","""04""","""420""","""Sainte-Suzanne""",24065,227,24292,"""974420"""
4,"""974""","""03""","""06""","""421""","""Salazie""",7136,73,7209,"""974421"""
4,"""974""","""02""","""99""","""422""","""Le Tampon""",79824,1009,80833,"""974422"""
4,"""974""","""04""","""14""","""423""","""Les Trois-Bassins""",7015,91,7106,"""974423"""


In [1069]:
df_commune = df_com.select(
    "codreg",
    "coddep",
    "codcom",
    "com",
    "ptot",
    "id_code_depcommune"
)

In [1070]:
df_commune

codreg,coddep,codcom,com,ptot,id_code_depcommune
i64,str,str,str,i64,str
84,"""01""","""001""","""L'Abergement-Clémenciat""",798,"""01001"""
84,"""01""","""002""","""L'Abergement-de-Varey""",257,"""01002"""
84,"""01""","""004""","""Ambérieu-en-Bugey""",14514,"""01004"""
84,"""01""","""005""","""Ambérieux-en-Dombes""",1776,"""01005"""
84,"""01""","""006""","""Ambléon""",118,"""01006"""
…,…,…,…,…,…
4,"""974""","""420""","""Sainte-Suzanne""",24292,"""974420"""
4,"""974""","""421""","""Salazie""",7209,"""974421"""
4,"""974""","""422""","""Le Tampon""",80833,"""974422"""
4,"""974""","""423""","""Les Trois-Bassins""",7106,"""974423"""


In [1071]:
df_commune = df_commune.rename({
    "codreg":"id_reg",
    "coddep":"code_dep",
    "codcom":"code_commune",
    "com":"nom_commune",
    "ptot":"pop_totale"
})

In [1072]:
df_commune = df_commune.select(
    "id_code_depcommune",
    "id_reg",
    "code_dep",
    "code_commune",
    "pop_totale",
    "nom_commune"
)

In [1073]:
df_commune

id_code_depcommune,id_reg,code_dep,code_commune,pop_totale,nom_commune
str,i64,str,str,i64,str
"""01001""",84,"""01""","""001""",798,"""L'Abergement-Clémenciat"""
"""01002""",84,"""01""","""002""",257,"""L'Abergement-de-Varey"""
"""01004""",84,"""01""","""004""",14514,"""Ambérieu-en-Bugey"""
"""01005""",84,"""01""","""005""",1776,"""Ambérieux-en-Dombes"""
"""01006""",84,"""01""","""006""",118,"""Ambléon"""
…,…,…,…,…,…
"""974420""",4,"""974""","""420""",24292,"""Sainte-Suzanne"""
"""974421""",4,"""974""","""421""",7209,"""Salazie"""
"""974422""",4,"""974""","""422""",80833,"""Le Tampon"""
"""974423""",4,"""974""","""423""",7106,"""Les Trois-Bassins"""


In [1074]:
df_commune.write_csv('./data_clean/commune.csv')

### Préparation du fichier 'region.csv'

In [1075]:
df_geo.null_count()

regrgp_nom,reg_nom,reg_nom_old,aca_nom,dep_nom,com_code,com_code1,com_code2,com_id,com_nom_maj_court,com_nom_maj,com_nom,uu_code,uu_id,uucr_id,uucr_nom,ze_id,dep_code,dep_id,dep_nom_num,dep_num_nom,aca_code,aca_id,reg_code,reg_id,reg_code_old,reg_id_old,fd_id,fr_id,fe_id,uu_id_99,au_code,au_id,auc_id,auc_nom,uu_id_10,geolocalisation
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,366,366,0,0,0,0,31432,0,0,0,0,366,0,0,0,0,0,0,0,0,0,0,0,0,0,17472,0,0,0,0,2171


In [1076]:
df_geo

regrgp_nom,reg_nom,reg_nom_old,aca_nom,dep_nom,com_code,com_code1,com_code2,com_id,com_nom_maj_court,com_nom_maj,com_nom,uu_code,uu_id,uucr_id,uucr_nom,ze_id,dep_code,dep_id,dep_nom_num,dep_num_nom,aca_code,aca_id,reg_code,reg_id,reg_code_old,reg_id_old,fd_id,fr_id,fe_id,uu_id_99,au_code,au_id,auc_id,auc_nom,uu_id_10,geolocalisation
str,str,str,str,str,str,i64,i64,str,str,str,str,i64,str,str,str,str,i64,str,str,str,i64,str,i64,str,i64,str,str,str,str,str,str,str,str,str,str,str
"""Province""","""Auvergne-Rhône-Alpes""","""Rhône-Alpes""","""Lyon""","""Ain""","""01001""",1001,1001,"""C01001""","""L ABERGEMENT CLEMENCIAT""","""L'ABERGEMENT-CLEMENCIAT""","""L'Abergement-Clémenciat""",,"""SO""","""CR01001""","""L'Abergement-Clémenciat""","""ZE8213""",1,"""D001""","""Ain (01)""","""01 - Ain""",10,"""A10""",84,"""R84""",82,"""R82""","""FD111""","""FR11""","""FE1""","""SO""",,"""AU997""","""C01001""","""L'Abergement-Clémenciat""","""SO""","""46.1534255214,4.92611354223"""
"""Province""","""Auvergne-Rhône-Alpes""","""Rhône-Alpes""","""Lyon""","""Ain""","""01002""",1002,1002,"""C01002""","""L ABERGEMENT DE VAREY""","""L'ABERGEMENT-DE-VAREY""","""L'Abergement-de-Varey""",,"""SO""","""CR01002""","""L'Abergement-de-Varey""","""ZE8201""",1,"""D001""","""Ain (01)""","""01 - Ain""",10,"""A10""",84,"""R84""",82,"""R82""","""FD111""","""FR11""","""FE1""","""SO""","""2""","""AU002""","""AU002""","""Lyon""","""SO""","""46.0091878776,5.42801696363"""
"""Province""","""Auvergne-Rhône-Alpes""","""Rhône-Alpes""","""Lyon""","""Ain""","""01003""",1003,1003,"""C01003""","""AMAREINS""","""AMAREINS""","""Amareins""",,"""SO""","""SO""","""Amareins""","""SO""",1,"""D001""","""Ain (01)""","""01 - Ain""",10,"""A10""",84,"""R84""",82,"""R82""","""FD111""","""FR11""","""FE1""","""SO""","""SO""","""SO""","""SO""","""SO""","""SO""",
"""Province""","""Auvergne-Rhône-Alpes""","""Rhône-Alpes""","""Lyon""","""Ain""","""01004""",1004,1004,"""C01004""","""AMBERIEU EN BUGEY""","""AMBERIEU-EN-BUGEY""","""Ambérieu-en-Bugey""",1303,"""UU01303""","""UU01303""","""Ambérieu-en-Bugey""","""ZE8201""",1,"""D001""","""Ain (01)""","""01 - Ain""",10,"""A10""",84,"""R84""",82,"""R82""","""FD111""","""FR11""","""FE1""","""UU01303""","""2""","""AU002""","""AU002""","""Lyon""","""UU01302""","""45.9608475114,5.3729257777"""
"""Province""","""Auvergne-Rhône-Alpes""","""Rhône-Alpes""","""Lyon""","""Ain""","""01005""",1005,1005,"""C01005""","""AMBERIEUX EN DOMBES""","""AMBERIEUX-EN-DOMBES""","""Ambérieux-en-Dombes""",,"""SO""","""CR01005""","""Ambérieux-en-Dombes""","""ZE8213""",1,"""D001""","""Ain (01)""","""01 - Ain""",10,"""A10""",84,"""R84""",82,"""R82""","""FD111""","""FR11""","""FE1""","""SO""","""2""","""AU002""","""AU002""","""Lyon""","""SO""","""45.9961799872,4.91227250796"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Province""","""Corse""","""Corse""","""Corse""","""Haute-Corse""","""2B356""",,,"""C2B356""","""ZALANA""","""ZALANA""","""Zalana""",,"""SO""","""CR2B356""","""Zalana""","""ZE9404""",,"""D02B""","""Haute-Corse (2B)""","""2B - Haute-Corse""",27,"""A27""",94,"""R94""",94,"""R94""","""FD111""","""FR11""","""FE1""","""SO""",,"""AU998""","""C2B356""","""Zalana""","""SO""","""42.2536105064,9.38216254299"""
"""Province""","""Corse""","""Corse""","""Corse""","""Haute-Corse""","""2B361""",,,"""C2B361""","""ZILIA""","""ZILIA""","""Zilia""",,"""SO""","""CR2B361""","""Zilia""","""ZE9405""",,"""D02B""","""Haute-Corse (2B)""","""2B - Haute-Corse""",27,"""A27""",94,"""R94""",94,"""R94""","""FD111""","""FR11""","""FE1""","""SO""","""391""","""AU391""","""AU391""","""Calvi""","""SO""","""42.5186011027,8.90376320006"""
"""Province""","""Corse""","""Corse""","""Corse""","""Haute-Corse""","""2B364""",,,"""C2B364""","""ZUANI""","""ZUANI""","""Zuani""",,"""SO""","""CR2B364""","""Zuani""","""ZE9404""",,"""D02B""","""Haute-Corse (2B)""","""2B - Haute-Corse""",27,"""A27""",94,"""R94""",94,"""R94""","""FD111""","""FR11""","""FE1""","""SO""",,"""AU998""","""C2B364""","""Zuani""","""SO""","""42.264826425,9.34126627348"""
"""Province""","""Corse""","""Corse""","""Corse""","""Haute-Corse""","""2B365""",,,"""C2B365""","""SAN GAVINO DI FIUMORBO""","""SAN-GAVINO-DI-FIUMORBO""","""San-Gavino-di-Fiumorbo""",,"""SO""","""CR2B365""","""San-Gavino-di-Fiumorbo""","""ZE9407""",,"""D02B""","""Haute-Corse (2B)""","""2B - Haute-Corse""",27,"""A27""",94,"""R94""",94,"""R94""","""FD111""","""FR11""","""FE1""","""SO""",,"""AU000""","""C2B365""","""San-Gavino-di-Fiumorbo""","""SO""","""41.9714498244,9.24775602009"""


In [1077]:
df_region = df_geo.select(
    "reg_code",
    "regrgp_nom",
    "reg_nom"
)

In [1078]:
df_region

reg_code,regrgp_nom,reg_nom
i64,str,str
84,"""Province""","""Auvergne-Rhône-Alpes"""
84,"""Province""","""Auvergne-Rhône-Alpes"""
84,"""Province""","""Auvergne-Rhône-Alpes"""
84,"""Province""","""Auvergne-Rhône-Alpes"""
84,"""Province""","""Auvergne-Rhône-Alpes"""
…,…,…
94,"""Province""","""Corse"""
94,"""Province""","""Corse"""
94,"""Province""","""Corse"""
94,"""Province""","""Corse"""


In [1079]:
df_region = df_region.rename({
    "reg_code":"id_reg",
    "regrgp_nom":"nom_regroupement",
    "reg_nom":"nom_region"
})

In [1080]:
df_region = df_region.select(
    "id_reg",
    "nom_region",
    "nom_regroupement"
    )

In [1081]:
df_region = df_region.unique()

In [1082]:
df_region

id_reg,nom_region,nom_regroupement
i64,str,str
3,"""Guyane""","""DROM-COM"""
2,"""Martinique""","""DROM-COM"""
4,"""La Réunion""","""DROM-COM"""
53,"""Bretagne""","""Province"""
93,"""Provence-Alpes-Côte d'Azur""","""Province"""
…,…,…
0,"""Collectivités d'outre-mer""","""DROM-COM"""
94,"""Corse""","""Province"""
6,"""Mayotte""","""DROM-COM"""
27,"""Bourgogne-Franche-Comté""","""Province"""


In [1083]:
df_region.write_csv('./data_clean/region.csv')

In [1095]:
df_vente.select(pl.col('date_vente').unique()).tail(10)

date_vente
date
2020-06-19
2020-06-20
2020-06-22
2020-06-23
2020-06-24
2020-06-25
2020-06-26
2020-06-27
2020-06-29
2020-06-30
