# Adding commune-level features and actionable levers

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
import time
import pickle

In [2]:
# Project layout: everything lives under "<home>/Google Drive/Felix".
path_project = Path.home() / 'Google Drive/Felix'
path_data = path_project / "data"   # CSV inputs are read from here
path_dump = path_project / "dump"   # pickled objects are read from / written to here

In [3]:
# Read MergeCommunesEnvi.csv (semicolon-separated, cp1252-encoded).
# The CSV column at position 1 is used as the DataFrame index.
file = path_data / "MergeCommunesEnvi.csv"
with file.open('rb') as fp:
    MergeCommunesEnvi = pd.read_csv(
        fp,
        sep=';',
        encoding='cp1252',
        index_col=1,
        low_memory=False,
    )

In [4]:
# Read felix.csv (default comma separator, cp1252-encoded) into `cdv`.
# The first CSV column is used as the DataFrame index.
file = path_data / "felix.csv"
with file.open('rb') as fp:
    cdv = pd.read_csv(
        fp,
        encoding='cp1252',
        index_col=0,
        low_memory=False,
    )

In [5]:
# Load the previously pickled dictionary of variable groups.
# NOTE: unpickling can execute arbitrary code; only load dumps you trust.
filename = path_dump / "dict_var_groups.sav"
with filename.open('rb') as fp:
    dict_var_groups = pickle.load(fp)

In [6]:
# Variable names registered under `indiv_act_var`
# (presumably "individual, actionable" - confirm against the source list).
# Source: file 'List-of-Actionable-Variables_v0.1_sp', September 01.
indiv_act_var = {
    "LIMVIAND", "VACANCES", "VISITFAM", "RECEP", "YOGA",
    # FREQ* variables
    "FREQSPOR", "FREQBIBL", "FREQCINE", "FREQTELE",
    # ASSO* variables
    "ASSOSPOR", "ASSOCULT", "ASSOCONF", "ASSOJEUN", "ASSOSYND", "ASSOENVI",
    "ASSOPARE", "ASSOCONS", "ASSOPOLI", "ASSOHUMA", "ASSOAUTR",
    # NOT_* variables
    "NOT_FAMI", "NOT_PROF", "NOT_AMIS", "NOT_COHE", "NOT_POLI", "NOT_LIBR",
    "NOT_LOG", "NOT_CAD",
    "RELIGION",
}

In [7]:
# Variable names registered under `indiv_semi_act_var`
# (presumably "individual, semi-actionable" - confirm against the source list).
# Source: file 'List-of-Actionable-Variables_v0.1_sp', September 01.
indiv_semi_act_var = {
    "SITUEMP5", "SITUEMP6", "TEMPSTRA",
    "nbheures", "NBHEUR39", "NBHEUR35",  # "nbheures" is lower-case in the data
    "IMAGTRAV", "COUPLE", "ENFANTS", "CADVIE", "CADVIE3", "MODCHAUF", "ETATSAN",
    "BANQMOB", "BANQEPA", "BANQVIE", "TELMOB",
    # CONF* variables (the same eight also appear in admin_semi_act_var)
    "CONFPUB", "CONFENTR", "CONFASSO", "CONFPOLI", "CONFBANK", "CONFPRES",
    "CONFECOL", "CONFKEUF",
    # INQ* variables
    "INQMALAD", "INQMALA3", "INQAGRES", "INQAGRE3", "INQROUTE", "INQROUT3",
    "INQCHOMA", "INQCHOM3", "INQGUERR", "INQGUER3", "INQNUCLE", "INQNUCL3",
    "INQALIM", "INQALIM3",
    "ECHPOL",
}

In [8]:
# Variable names registered under `admin_act_var`
# (presumably "administration, actionable" - confirm against the source list).
admin_act_var = {
    "AIDESUFF", "EFFORTPP", "CHOAVANT", "OPIRSA", "JUSTICE", "RELEG", "RADIQUOI",
    # RADWHY1 .. RADWHY14
    "RADWHY1", "RADWHY2", "RADWHY3", "RADWHY4", "RADWHY5",
    "RADWHY6", "RADWHY7", "RADWHY8", "RADWHY9", "RADWHY10",
    "RADWHY11", "RADWHY12", "RADWHY13", "RADWHY14",
    "ORDLIB", "PREOCCU1", "PREOCCU2", "CONFGOUV",
}

In [9]:
# Variable names registered under `admin_semi_act_var`
# (presumably "administration, semi-actionable" - confirm against the source list).
admin_semi_act_var = {
    "SECURITE", "SECUR3",
    "ADNSTIC", "ADNCB", "ADNORDI",
    "ROBOT1", "ROBOT2", "ROBOT3",
    "PRESTCAF", "REVPF",
    # CONF* variables (the same eight also appear in indiv_semi_act_var)
    "CONFPUB", "CONFENTR", "CONFASSO", "CONFPOLI", "CONFBANK", "CONFPRES",
    "CONFECOL", "CONFKEUF",
    "TRANSFST", "TRANSFO5", "PROGRAD", "OPIIMMIG",
}

## Data exploration

In [10]:
# Column names present in MergeCommunesEnvi but absent from cdv.
commune_var = set(MergeCommunesEnvi.columns).difference(cdv.columns)

In [11]:
# Report how many column names ended up in commune_var.
print(f"{len(commune_var)} additional features")

218 additional features


In [12]:
# Restrict MergeCommunesEnvi to the columns listed in commune_var.
# The set is turned into a sorted list because .loc rejects set indexers
# (TypeError since pandas 2.0; deprecated in 1.5) and because a sorted list
# gives the same column order on every run, unlike set iteration order.
df = MergeCommunesEnvi.loc[:, sorted(commune_var)]

In [13]:
# Missing-value count per column of df, largest first.
# DataFrame.sum() (column-wise by default) is used instead of np.sum(df):
# np.sum forwards axis=None to pandas, and that behaviour is deprecated for
# DataFrame reductions (it is slated to reduce over both axes to a scalar).
df.isnull().sum().sort_values(ascending=False)

TP6015                                                     2830
PIMP15                                                     2505
Part.protection.forte...2017....                            840
Superficie.protection.contractuelle...2017..ha.             840
Part.protection.contractuelle...2017....                    840
Superficie.protection.forte...2017..ha.                     840
Part.zones.humides.et.surfaces.en.eau...2012....            358
Superficie.zones.humides.et.surfaces.en.eau...2012..ha.     358
Part.forêts.et.milieux.semi.naturels...2012....             358
Superficie.forêts.et.milieux.semi.naturels...2012..ha.      358
communes                                                    358
MED15                                                       212
NBMENFISC15                                                 212
NB_F101_NB_ECL                                              164
NB_D101                                                     164
NB_F106_NB_AIREJEU                      

In [14]:
# Distinct column dtypes found in df.
df.dtypes.unique()

array([dtype('float64'), dtype('O')], dtype=object)

In [15]:
# Names of the object-dtype (non-numeric) columns of df.
df.select_dtypes(include=['O']).columns

Index(['LIBGEO', 'DEP', 'communes'], dtype='object')

## Updating var dictionary

In [16]:
# Add the three object-dtype commune columns to both categorical groups.
# A new set is built for each group (no in-place mutation of the stored set).
commune_cat_var = {'DEP', 'LIBGEO', 'communes'}
for group_name in ("cat_min10_var", "cat_var"):
    dict_var_groups[group_name] = dict_var_groups[group_name] | commune_cat_var

In [17]:
# Store the commune_var set in the dictionary under the key "commune_var".
dict_var_groups["commune_var"] = commune_var

In [18]:
# float64 columns of df form the commune quantitative variables; they are
# merged into the existing "quant_var" group (a new set is stored).
commune_quant_var = set(
    df.select_dtypes(include='float64').columns
)
dict_var_groups["quant_var"] = (
    dict_var_groups["quant_var"] | commune_quant_var
)

In [19]:
# For each scope stored in the dictionary, build its "extended" counterpart
# (scope | commune_var) and register it under "<scope>_ext_var".
# Every intermediate name is kept as a notebook-level variable.

# single-year scopes
scope_2015_var = dict_var_groups['scope_2015_var']
scope_2015_ext_var = dict_var_groups["scope_2015_ext_var"] = scope_2015_var | commune_var

scope_2016_var = dict_var_groups['scope_2016_var']
scope_2016_ext_var = dict_var_groups["scope_2016_ext_var"] = scope_2016_var | commune_var

scope_2017_var = dict_var_groups['scope_2017_var']
scope_2017_ext_var = dict_var_groups["scope_2017_ext_var"] = scope_2017_var | commune_var

scope_2018_var = dict_var_groups['scope_2018_var']
scope_2018_ext_var = dict_var_groups["scope_2018_ext_var"] = scope_2018_var | commune_var

# multi-year scopes
scope_2015_2018_var = dict_var_groups['scope_2015_2018_var']
scope_2015_2018_ext_var = dict_var_groups["scope_2015_2018_ext_var"] = scope_2015_2018_var | commune_var

scope_2016_2018_var = dict_var_groups['scope_2016_2018_var']
scope_2016_2018_ext_var = dict_var_groups["scope_2016_2018_ext_var"] = scope_2016_2018_var | commune_var

scope_2017_2018_var = dict_var_groups['scope_2017_2018_var']
scope_2017_2018_ext_var = dict_var_groups["scope_2017_2018_ext_var"] = scope_2017_2018_var | commune_var

In [20]:
# Register the four hand-curated variable sets in the dictionary.
dict_var_groups.update({
    "indiv_semi_act_var": indiv_semi_act_var,
    "indiv_act_var": indiv_act_var,
    "admin_semi_act_var": admin_semi_act_var,
    "admin_act_var": admin_act_var,
})

In [26]:
# Build "usual_common_scope": (cat_max9_var plus the quantitative variables
# with few missing values), restricted to the extended 2015-2018 scope, minus
# the excluded groups. Both results are stored back in dict_var_groups.
com_var = dict_var_groups['com_var']
tech_var = dict_var_groups['tech_var']
text_var = dict_var_groups['text_var']
bizz_var = dict_var_groups['bizz_var']

cat_max9_var = dict_var_groups['cat_max9_var']
quant_var = dict_var_groups['quant_var']

# Union of the com / tech / bizz / text groups; removed from the scope below.
exclusion = com_var | tech_var | bizz_var | text_var

# A quantitative variable is kept only if it has strictly fewer missing
# values than this threshold in MergeCommunesEnvi.
MAX_QUANT_NULLS = 200

# .loc needs a list: set indexers raise TypeError since pandas 2.0.
# DataFrame.sum() (column-wise) replaces np.sum(df), which relies on the
# deprecated axis=None behaviour of DataFrame reductions.
quant_null = MergeCommunesEnvi.loc[:, list(quant_var)].isnull().sum()
quant_var_kept = set(quant_null[quant_null < MAX_QUANT_NULLS].index)

usual_common_scope = ((cat_max9_var | quant_var_kept) & scope_2015_2018_ext_var) - exclusion

dict_var_groups["exclusion"] = exclusion
dict_var_groups["usual_common_scope"] = usual_common_scope

In [27]:
# Persist the updated dictionary. This writes to the same
# "dict_var_groups.sav" path the dictionary was loaded from, so the previous
# dump is overwritten.
filename = path_dump / "dict_var_groups.sav"
with filename.open('wb') as fp:
    pickle.dump(dict_var_groups, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [28]:
# (rows, columns) of the commune-feature frame df.
df.shape

(11131, 218)