# Setup project

In [1]:
import pandas as pd
import numpy as np
import re

In [13]:
# Read the cantons CSV into a DataFrame; later cells use its
# 'Canton-ou-ville' column to filter and join the census rows.
df_cantons = pd.read_csv(filepath_or_buffer='cantons_cleaned.csv')

In [142]:
# Read the semicolon-separated census extract. low_memory=False makes pandas
# infer each column's dtype from the whole file instead of chunk by chunk.
# NOTE(review): the year in this file name is hard-coded; keep it in sync
# with `annee`.
df_20xx = pd.read_csv(
    filepath_or_buffer='reces_2012_canton_ville_D.csv',
    sep=';',
    low_memory=False,
)

In [143]:
# Census year being processed: written into the 'annee' column of the working
# frame and embedded in the output CSV file name.
annee = 2012

In [144]:
# Output file name for the processed census data, e.g. "recens_2012.csv".
csv_name = f"recens_{annee}.csv"

## Constants

In [5]:
# Departement codes (as strings) used to filter the cantons dataset CSV
# (the source file is on the shared drive).
DEPARTEMENTS = [
    '16', '17', '19', '23', '24', '33',
    '40', '47', '64', '79', '86', '87',
]
# Same codes as integers, for columns that were parsed as numbers.
DEPARTEMENTS_INT = list(map(int, DEPARTEMENTS))

In [59]:
# First version of the column whitelist (still includes "DIPL" and "MATR").
# The two leading names are the columns added by this notebook; the rest are
# census variable codes.
COLUMNS_TO_KEEPv1 = [
    "annee", "departement",
    "AGED", "ASCEN", "CATL", "CATPC", "CHFL", "CMBL", "DIPL",
    "EMPL", "ETUD", "GARL", "HLML", "ILT", "IMMI", "INAI",
    "INFAM", "LIENF", "MATR", "MOCO", "MODV", "NA17", "NAIDT",
    "NBPI", "NENFR", "NPERR", "ORIDT", "RECH", "SANI", "SFM",
    "STOCD", "SURF", "TACTD16", "TP", "TRANS", "TYPL", "VOIT",
]

In [60]:
# Column whitelist applied to the merged frame before export.
# Same as COLUMNS_TO_KEEPv1 except that "DIPL" and "MATR" are left out.
COLUMNS_TO_KEEP = [
    "annee", "departement",
    "AGED", "ASCEN", "CATL", "CATPC", "CHFL", "CMBL",
    "EMPL", "ETUD", "GARL", "HLML", "ILT", "IMMI", "INAI",
    "INFAM", "LIENF", "MOCO", "MODV", "NA17", "NAIDT",
    "NBPI", "NENFR", "NPERR", "ORIDT", "RECH", "SANI", "SFM",
    "STOCD", "SURF", "TACTD16", "TP", "TRANS", "TYPL", "VOIT",
]

In [7]:
# Categorical columns that are meant to be expanded into dummy variables.
# NOTE(review): "DIPL" is listed here but is not part of COLUMNS_TO_KEEP;
# confirm which of the two lists is the intended one before using both.
TO_DUMMIES = [
    "ASCEN", "CATL", "CHFL", "CMBL", "DIPL", "EMPL",
    "GARL", "HLML", "ILT", "INFAM", "LIENF", "NA17",
    "NBPI", "NENFR", "NPERR", "RECH", "SANI", "SFM",
    "STOCD", "SURF", "TP", "TRANS", "TYPL", "VOIT",
]

# Study dataset

In [None]:
# Column names, dtypes and non-null counts of the raw census frame.
# (Uses df_20xx: the notebook never defines a `df_2010` frame.)
df_20xx.info()

In [None]:
# First rows of the raw census frame.
# (Uses df_20xx: the notebook never defines a `df_2010` frame.)
df_20xx.head()

In [None]:
# Number of census rows per CANTVILLE code in the raw census frame.
# (Uses df_20xx: the notebook never defines a `df_2010` frame.)
df_20xx['CANTVILLE'].value_counts()

In [None]:
# Column names, dtypes and non-null counts of the cantons frame.
df_cantons.info()

# Work on dataset

## Test

In [8]:
# Small sample of the census frame for trying the pipeline quickly:
# assign `test` (instead of `df_20xx`) to `current_df` to use it.
# random_state pins the draw so a test run is reproducible, and the sample
# size is capped so a frame with fewer than 100 rows does not raise.
test = df_20xx.sample(n=min(100, len(df_20xx)), random_state=42)

## Work

In [145]:
# Frame processed by the cells below; assign `test` here instead to try the
# pipeline on the small sample.
# Note: this is a plain assignment, so `current_df` is the same object as
# `df_20xx`, not a copy.
current_df = df_20xx

In [146]:
# Tag every row with the census year. `.assign` returns a new frame, so the
# frame `current_df` was bound to (the raw `df_20xx` or the `test` sample) is
# not modified, and re-running this cell gives the same result.
current_df = current_df.assign(annee=annee)

In [147]:
# Keep only the rows whose CANTVILLE code appears in the 'Canton-ou-ville'
# column of the Nouvelle-Aquitaine cantons dataset. No filtering on the year
# happens here.
filtered_df = current_df.loc[
    current_df['CANTVILLE'].isin(df_cantons['Canton-ou-ville'])
]

In [148]:
# Left-join the cantons table onto the filtered census rows
# (CANTVILLE == 'Canton-ou-ville') so each row carries its canton's columns,
# including the departement.
merged_df = filtered_df.merge(
    df_cantons,
    how='left',
    left_on='CANTVILLE',
    right_on='Canton-ou-ville',
)

In [149]:
# Give the departement column an ASCII, lower-case name so it matches the
# naming used in COLUMNS_TO_KEEP.
merged_df = merged_df.rename(columns={'Département': 'departement'})

In [150]:
# Restrict the merged frame to the whitelisted columns, in whitelist order.
# A KeyError here means one of the COLUMNS_TO_KEEP names is missing.
merged_df = merged_df[COLUMNS_TO_KEEP]

In [151]:
# Preview the first rows of the final frame before it is written to disk.
merged_df.head()

Unnamed: 0,annee,departement,AGED,ASCEN,CATL,CATPC,CHFL,CMBL,EMPL,ETUD,...,RECH,SANI,SFM,STOCD,SURF,TACTD16,TP,TRANS,TYPL,VOIT
0,2012,19,23,2,1,0,3,4,16,2,...,Z,2,11,21,2,111,1,2,2,1
1,2012,19,76,2,1,0,1,3,ZZ,2,...,0,2,12,21,1,210,Z,Z,2,0
2,2012,19,45,2,1,0,2,3,22,2,...,Z,2,32,10,3,111,1,4,1,2
3,2012,19,13,2,1,0,2,3,ZZ,1,...,Z,2,32,10,3,230,Z,Z,1,2
4,2012,19,16,2,1,0,2,3,ZZ,1,...,0,2,32,10,3,220,Z,Z,1,2


In [152]:
# Write the final frame to `csv_name`, without the row index.
merged_df.to_csv(path_or_buf=csv_name, index=False)