# Setup project

In [1]:
import pandas as pd
import numpy as np
import re

In [181]:
# Canton reference table, read with pandas' default CSV settings.
df_cantons = pd.read_csv(
    "cantons_cleaned.csv",
)

In [265]:
# Census year handled by this run: it is inserted into the input and output
# file names and written to the 'annee' column of the working frame.
annee = 2006

In [266]:
# Raw census extract for the selected year (semicolon-separated).
# low_memory=False lets pandas parse the file in one go instead of in chunks,
# which avoids mixed-type inference within a column.
# NOTE(review): the input prefix is 'reces_' while the output file uses
# 'recens_' -- confirm the input file really is spelled this way.
df_20xx = pd.read_csv(
    f"reces_{annee}_canton_ville_D.csv",
    sep=";",
    low_memory=False,
)

In [267]:
# Name of the CSV file the aggregated result is written to at the end.
csv_name = f"recens_{annee}.csv"

## Constants

In [22]:
# Department codes (as strings) used to filter the cantons CSV
# (the file can be found on the drive).
DEPARTEMENTS = [
    "16", "17", "19", "23", "24", "33",
    "40", "47", "64", "79", "86", "87",
]
# The same codes converted to integers, in the same order.
DEPARTEMENTS_INT = list(map(int, DEPARTEMENTS))

In [23]:
# Columns retained in each pass, in output order.
# Disabled entries (add them back to the list to enable them):
# DIPL, MATR, NA17, NENFR, TACTD16.
COLUMNS_TO_KEEP = [
    # keys used later for the group-by
    "annee", "departement",
    # census variables
    "AGED", "ASCEN", "CATL", "CATPC", "CHFL", "CMBL",
    "EMPL", "ETUD", "GARL", "HLML", "ILT", "IMMI",
    "INAI", "INFAM", "LIENF", "MOCO", "MODV", "NAIDT",
    "NBPI", "NPERR", "ORIDT", "RECH", "SANI", "SFM",
    "STOCD", "SURF", "TP", "TRANS", "TYPL", "VOIT",
]

In [24]:
# Columns that get one-hot encoded (turned into dummy columns).
# Disabled entries (add them back to the list to enable them):
# DIPL, NA17, NENFR.
TO_DUMMIES = [
    "AGED", "ASCEN", "CATL", "CATPC", "CHFL", "CMBL",
    "EMPL", "ETUD", "GARL", "HLML", "ILT", "IMMI",
    "INAI", "INFAM", "LIENF", "MOCO", "MODV", "NAIDT",
    "NBPI", "NPERR", "ORIDT", "RECH", "SANI", "SFM",
    "STOCD", "SURF", "TP", "TRANS", "TYPL", "VOIT",
]

# Study dataset

In [None]:
# Print a concise summary (columns, dtypes, memory usage) of the raw census frame.
df_20xx.info()

In [None]:
# Preview the first rows of the raw census frame.
df_20xx.head()

In [None]:
# Number of rows per distinct CANTVILLE value, most frequent first.
df_20xx['CANTVILLE'].value_counts()

In [None]:
# Print a concise summary (columns, dtypes, memory usage) of the canton reference table.
df_cantons.info()

# Work on dataset

## Test

In [None]:
# Reduced sample for trying the pipeline quickly: assign `test` to
# `current_df` (instead of `df_20xx`) in the "Work" section to use it.
# - n is capped at the number of available rows, because sample() raises a
#   ValueError when asked for more rows than exist (sampling without replacement).
# - random_state is fixed so that every run draws the same rows and the
#   trial results are reproducible.
test = df_20xx.sample(n=min(100000, len(df_20xx)), random_state=42)

## Work

In [268]:
# Frame processed by the rest of the pipeline; assign `test` here instead to
# try the pipeline on the sample.
# This is a plain assignment, not a copy: current_df and df_20xx refer to the
# same DataFrame object.
current_df = df_20xx

In [269]:
# Tag every row with the census year.
# When current_df is bound to df_20xx (no copy is made), this adds the
# 'annee' column to df_20xx as well.
current_df['annee'] = annee

In [270]:
# Keep only the rows whose CANTVILLE value appears in the 'Canton-ou-ville'
# column of the cantons table (the Nouvelle-Aquitaine cantons dataset).
# No filtering on the year happens here.
filtered_df = current_df.loc[
    current_df['CANTVILLE'].isin(df_cantons['Canton-ou-ville'])
]

In [271]:
# Attach the canton attributes (in particular the department) to every row of
# the filtered census frame, matching CANTVILLE against 'Canton-ou-ville'.
# validate='many_to_one' makes pandas raise a MergeError if a canton key
# appears more than once in df_cantons. Without that check, duplicated keys
# would silently duplicate census rows and inflate the sums computed later.
merged_df = pd.merge(
    filtered_df,
    df_cantons,
    how='left',
    left_on='CANTVILLE',
    right_on='Canton-ou-ville',
    validate='many_to_one',
)

In [272]:
# Rename the department column to the accent-free lowercase name used by
# COLUMNS_TO_KEEP and by the group-by.
merged_df = merged_df.rename(columns={'Département': 'departement'})

In [273]:
# Restrict the frame to the columns listed in COLUMNS_TO_KEEP, in that order.
# A column missing from the frame raises a KeyError.
merged_df = merged_df[COLUMNS_TO_KEEP]

In [274]:
# One-hot encode every column listed in TO_DUMMIES.
# drop_first=True omits the first level of each encoded column, so a column
# with k distinct levels yields k-1 indicator columns.
merged_df = pd.get_dummies(
    data=merged_df,
    columns=TO_DUMMIES,
    drop_first=True,
)

In [275]:
# Convert every boolean column to 0/1 integers (int64, the same dtype the
# former per-column {True: 1, False: 0} mapping produced).
# A single astype() call casts all boolean columns at once instead of looping
# over the columns in Python and mapping each value through a dict.
# Columns of any other dtype are left unchanged.
merged_df = merged_df.astype(
    {name: 'int64' for name, dtype in merged_df.dtypes.items() if dtype == bool}
)

In [276]:
# Aggregate to one row per (annee, departement) by summing every other column.
# The two grouping keys become the index of the result.
merged_df = (
    merged_df
    .groupby(by=['annee', 'departement'])
    .sum()
)

In [264]:
# Write the result to csv_name; index=True also writes the frame's index.
merged_df.to_csv(
    path_or_buf=csv_name,
    index=True,
)