# Felix - dataset preparation

A dataset is build based on "Etude condistions de vie" merged with "INSEE" communal data ...
....  
Categorial features are ... K-1 categ
...
Set of variables and featiures ...

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
import time
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV, RFE, SelectKBest, chi2, SelectFromModel
from sklearn.utils import resample

In [2]:
path_project = Path.home() / Path('Google Drive/Felix')
path_data = path_project / Path("data")
path_dump = path_project / Path("dump")

## Feature sets and engineering - Dataset preparation

In [3]:
# loading cdv data
file = path_data / Path("felix.csv")
with Path.open(file, 'rb') as fp:
    cdv = pd.read_csv(fp,  encoding='cp1252',low_memory=False, index_col = 0)

In [4]:
# loadind cdv data without format
file = path_data / Path("felix_ssfmt.csv")
with Path.open(file, 'rb') as fp:
    cdv_ssfmt = pd.read_csv(fp,  encoding='cp1252',low_memory=False, index_col = 0)

### Feature scope

In [5]:
# number of line per year in teh dataset
n_per_year = cdv["ANNEEFUZ"].value_counts()
# number of missing value per variable for a given year
na_2015 = np.sum(cdv.loc[cdv["ANNEEFUZ"] == 2015].isnull()) 
na_2016 = np.sum(cdv.loc[cdv["ANNEEFUZ"] == 2016].isnull()) 
na_2017 = np.sum(cdv.loc[cdv["ANNEEFUZ"] == 2017].isnull()) 
na_2018 = np.sum(cdv.loc[cdv["ANNEEFUZ"] == 2018].isnull()) 
# column scope per year 
cdv_2015_var = set(na_2015[na_2015 < n_per_year[2015]].index)
cdv_2016_var = set(na_2016[na_2016 < n_per_year[2016]].index)
cdv_2017_var = set(na_2017[na_2017 < n_per_year[2017]].index)
cdv_2018_var = set(na_2018[na_2018 < n_per_year[2018]].index)

In [6]:
cdv_2015_2018_var = (cdv_2015_var & cdv_2016_var & cdv_2017_var & cdv_2018_var)
cdv_2016_2018_var = (cdv_2016_var & cdv_2017_var & cdv_2018_var)
cdv_2017_2018_var = (cdv_2017_var & cdv_2018_var)

In [7]:
print(f"{len(cdv_2015_2018_var)} variables common to all study out of {cdv_ssfmt.shape[1]}")

267 variables common to all study out of 353


### Special variables 

In [8]:
cdv_var = set(cdv.columns)
pred_var = {"HEUREUX"}
tech_var = {
    "ANNEEFUZ", "ANNEFUZ2", "COLLECTE", "CHAMP", 
    "identifiant", "an_enq", "INTER"
}
com_var = {'COMINSEE', 'DEPCOM', 'com', 'inseel','inseenum','CP'}
text_var = {'RADIQUOI'}
bizz_var = {
    'NB0003','NB0306','NB0610','NB1016','NB1620','NB2099',
    'an_nais','decuc','decsqt','info','typodeg','refus2', 
    'cpt', 'prescaf', 'poptrpeu','REVUC','i','REVTOT', 
    'poppeud','popdense', 'popinter', 'pmun', 'agedip', 'age_OW',
    'REVsqt', 'NBUC', 'AGGLOINS', 'med', 'CSP6','REVTOT6',
    'ACM1','ACM2','ACM3','ACM4','ACM5', 'ACM6', 'ACM7',
    'ACM8','ACM9', 'ACM10', 'ACM11','ACM12'
}
calc_cdv_var = set()

In [9]:
non_redundant_cdv_var = {
    'zau2010','YOGA','WHYLIM','VOITURE','VISITFAM','VACANCES',
    'UNIONGAY','UDA10','TYPOSQT','TYPLOG2','TYPLOG',
    'TYPEMPL','type99','TYPCONT','TYPCHAUF','TRAVFEM','TRANSFST','TRANSFO5',
    'TEMPSTRA','TELMOB','TAXENV','statut99',
    'STATMAT','STATLOGB','SOUFFTET','SOUFFNER','SOUFFINS',
    'SOUFFDOS','SOUFFDEP','SITUFAM','SITUEMP5','SITUEMP',
    'SEXE_9','SEXE_8','SEXE_7','SEXE_6','SEXE_5','SEXE_4',
    'SEXE_3','SEXE_2','SEXE','SENSIENV','SECURITE','SALCOMPI',
    'SALCOMPC','RURAURBA','ROBOT3','ROBOT2','ROBOT1',
    'revtot7','REVTOT','REVPF','REVENQ','REVCONJ','REVAUTR',
    'REVAUON','REV_TR7','RESTRICT','RESIDALT','RELIGION','RELEG',
    'refus2','RECEP','RE_WEB','RE_VOIT','RE_VAC',
    'RE_TABAL','RE_MEDI','RE_LOG','RE_HABI','RE_EQUI',
    'RE_ENF','RE_ALIM','RAISPAUV','RADWHY9','RADWHY8','RADWHY7',
    'RADWHY6','RADWHY5','RADWHY4','RADWHY3','RADWHY2','RADWHY14',
    'RADWHY13','RADWHY12','RADWHY11','RADWHY10','RADWHY1',
    'RADI3','RADI2','RADI1','PROGRAD','PRIVPUB','PRESTCAF',
    'prescaf','PREOVIO','PREOTENS','PREOPAUM',
    'PREOPAUF','PREOMALA','PREOIMMI','PREOEURO','PREOENV',
    'PREODROG','PREOCONF','PREOCHOM','PREOCCU2','PREOCCU1',
    'PREFPALI','PRATCOLL','popdense','pmun','PCSENQ7','PCSENQ36',
    'PCSCON7','ORDLIB','OPIRSA','OPIIMMIG','OPICULT',
    'NOT_PROF','NOT_POLI','NOT_LOG','NOT_LIBR','NOT_FAMI',
    'NOT_COHE','NOT_CAD','NOT_AMIS','NIVPERSO','NIVPERS4',
    'NIVFRAN4','NBUC','NBPIECE6','NBPERS5','NBPERS','NBHEUR35',
    'NBENF6','NBCHOM','NB99_4','NB2099','NB20_4','NB1620',
    'NB16_4','NB1016','NB10_4','NB0610','NB06_4','NB0306','NB03_4',
    'NB0003','MONDIAL','med','LOGSUFFI','LIMVIAND',
    'LIEN_9','LIEN_8','LIEN_7','LIEN_6','LIEN_5','LIEN_4','LIEN_3',
    'LIEN_2','JUSTICE','ISEGO','INTERIM','INQROUTE',
    'INQNUCLE','INQMALAD','INQGUERR','INQCHOMA','INQALIM',
    'INQAGRES','INQ4SUR6','INNOVTEC','info','IMAGTRAV','i',
    'HEUREUX','HARVEY','HANDICAP','FREQTELE','FREQSPOR','FREQCINE',
    'FREQBIBL','FAMILLE','EXERCPRO','ETATSAN','ENFANTS','EFFORTPP',
    'ECHPOL','DIPLOME','DEPLOG3','cpt','CP','COUPLE','CONFWEB','CONFPUB',
    'CONFPRES','CONFPOLI','CONFMEFI',
    'CONFKEUF','CONFGOUV','CONFENTR','CONFECOL','CONFBANK',
    'CONFASSO','CONDUIT','COMMU8','COMMU7','COMMU6','COMMU5',
    'COMMU4','COMMU3','COMMU2','COMMU1','com','CLASSESO',
    'CHOVOLON','CHOIXNUC','CHOAVANT','CHERCHEM','CDV5_4','CDV5',
    'CADVIE','BANQVIE','BANQMOB','BANQEPA','AUTREREV','AUTREAL','ASSOSYND','ASSOSPOR',
    'ASSOPOLI','ASSOPARE','ASSOJEUN','ASSOHUMA','ASSOENVI',
    'ASSOCULT','ASSOCONS','ASSOCONF','ASSOAUTR','ASSO6_2','ASSO11_3','ASSO11_2','ASSO10_3',
    'ASSO10_2','AIDESUFF','AGGLOINS','AGGLO9','AGESEX12','agedip','AGE6',
    'AGE5','age_OW','AGE_9','AGE_8','AGE_7',
    'AGE_6','AGE_5','AGE_4','AGE_3','AGE_2','AGE','ADOPTGAY','ADNSTIC','ADNORDI',
    'ADNCB','ACM9','ACM8','ACM7','ACM6',
    'ACM5','ACM4','ACM3','ACM2','ACM12','ACM11','ACM10','ACM1', 'COMINSEE'
}

### Categorial variable

In [10]:
obj_cdv = cdv.select_dtypes(include=['object'])
obj_var = set(obj_cdv.columns)
cat_max9_var = set()
cat_min10_var = set()
for c in obj_var:
    obj_cdv_valcpt = obj_cdv[c].value_counts()
    if len(obj_cdv_valcpt) > 10:
        cat_min10_var.add(c)
    else:
        cat_max9_var.add(c)

In [11]:
ord_var = {
    "CONFPOLI", "AGE5","SECURITE", "ACM7","INQGUERR", "NBPIECE6","INNOVTEC",
    "JUSTICE","EFFORTPP","ACM10","NIVFRAN4","NBPERS5","INQCHOMA","CDV5_4",
    "CONFGOUV","ADOPTGAY","ACM8","FREQCINE","CONFPUB","FREQSPOR","INQALIM",
    "ASSO10_3", "FREQBIBL","DEPLOG","NBCHOM","CONFENTR","ORDLIB","ACM5",
    "INQMALAD","FREQTELE","NBENF6","ACM9","revtot7","INQROUTE","NIVPERS4",
    "ETATSAN","INQNUCLE","NIVPERSO","CONFASSO","ACM6","CDV5","UNIONGAY",
    "ACM4","INQAGRES","CADVIE","NIVFRAN","REV_TR7","ISEGO","RECEP","AGE6",
    "ADNCB","PRATCOLL","NBHEUR39","HARVEY","QUOTAAGE","NBHEUR35","RELEG",
    "CONFKEUF","CONFECOL","ADNSTIC","ADNORDI","CONFPRES","CONFWEB","CONFBANK"
}

In [12]:
# exclusion of features with order
cat_var = obj_var - ord_var
cat_max9_var = cat_max9_var - ord_var
cat_min10_var = cat_min10_var - ord_var

In [13]:
cdv_dtypes = cdv.dtypes

In [14]:
int_var = set(cdv_dtypes[cdv_dtypes == 'int64'].index)

In [15]:
int_cat_var = {
    'NB0003','NB0306','NB0610','NB1016',
    'NB1620','NB2099', 'REVTOT6','ANNEEFUZ','INTER'
}
int_cat_max9_var = {
    'NB0003','NB0306','NB0610','NB1016',
    'NB1620','NB2099', "REVTOT6","ANNEEFUZ"}
int_cat_min10_var = {
    'INTER'
}
int_quant_var = {
    'AGE','REVENQ', 'AUTREREV',
    'an_enq','an_nais' 
}

In [16]:
cat_var = cat_var | int_cat_var
cat_max9_var = cat_max9_var | int_cat_max9_var
cat_min10_var = cat_min10_var | int_cat_min10_var
quant_var = ord_var | int_quant_var

In [17]:
float_var = set(cdv_dtypes[cdv_dtypes == 'float64'].index)

In [18]:
float_cat_min10_var = {'CP','inseenum'}
float_cat_max9_var = {'refus2','cpt','prescaf','i','age_OW','TYPLOG','AGGLOINS','CSP6'}
float_cat_var = float_cat_min10_var | float_cat_max9_var
float_quant_var = float_var - float_cat_var

In [19]:
cat_var = cat_var | float_cat_var
cat_max9_var = cat_max9_var | float_cat_max9_var
cat_min10_var = cat_min10_var | float_cat_min10_var
quant_var = quant_var | float_quant_var

In [20]:
print(f"out of the {cdv.shape[1]} variable :")
print(f"{len(cat_var)} variables are categorial ")
print(f"{len(quant_var)} variables are quantitative ")

out of the 353 variable :
247 variables are categorial 
106 variables are quantitative 


In [21]:
print(f"out of the {len(cat_var)} variable categorial:")
print(f"{len(cat_max9_var)} variables have maximum 9 modalities  ")
print(f"{len(cat_min10_var)} variables have more ")

out of the 247 variable categorial:
221 variables have maximum 9 modalities  
26 variables have more 


### Adding communal features and levers

In [22]:
# loading MergeCommunesEnvi data
file = path_data / Path("MergeCommunesEnvi.csv")
with Path.open(file, 'rb') as fp:
    MergeCommunesEnvi = pd.read_csv(fp,  encoding='cp1252',low_memory=False, sep=';', index_col = 1)

In [23]:
insee_var = set(MergeCommunesEnvi.columns) - set(cdv.columns)

In [24]:
df = MergeCommunesEnvi.loc[:,insee_var]
insee_quant_var = set(df.select_dtypes(include=['float64']).columns)

### Adding work on features of September


In [25]:
# loadind xlsx file with agreement data 
file = path_data / Path("Base of Actionable Var. - Survey Data.xlsx")
with Path.open(file, 'rb') as fp:
    agreement = pd.read_excel(fp, 
                        sheetname='List 1 Actionable Individual',
                        parse_cols="C,H",
                        index_col=0
                       )

In [26]:
agreement.head()

Unnamed: 0_level_0,Agreement
Variables,Unnamed: 1_level_1
INTER6,4
INTER,4
ANNEEFUZ,4
ANNEFUZ2,4
COLLECTE,4


In [27]:
cdv_actionable_individual_1_var = set(agreement.loc[agreement.loc[:,"Agreement"]==1,:].index)
cdv_actionable_individual_2_var = set(agreement.loc[agreement.loc[:,"Agreement"]==2,:].index)
cdv_actionable_individual_3_var = set(agreement.loc[agreement.loc[:,"Agreement"]==3,:].index)
cdv_actionable_individual_4_var = set(agreement.loc[agreement.loc[:,"Agreement"]==4,:].index)

In [28]:
# loadind xlsx file with agreement data 
file = path_data / Path("Base of Actionable Var. - Survey Data.xlsx")
with Path.open(file, 'rb') as fp:
    agreement = pd.read_excel(fp, 
                        sheetname='List 2 Actionable Admin',
                        parse_cols="C,H",
                        index_col=0
                       )

In [29]:
agreement.head()

Unnamed: 0_level_0,Agreement
Variables,Unnamed: 1_level_1
INTER6,4
INTER,4
ANNEEFUZ,4
ANNEFUZ2,4
COLLECTE,4


In [30]:
cdv_actionable_admin_1_var = set(agreement.loc[agreement.loc[:,"Agreement"]==1,:].index)
cdv_actionable_admin_2_var = set(agreement.loc[agreement.loc[:,"Agreement"]==2,:].index)
cdv_actionable_admin_3_var = set(agreement.loc[agreement.loc[:,"Agreement"]==3,:].index)
cdv_actionable_admin_4_var = set(agreement.loc[agreement.loc[:,"Agreement"]==4,:].index)
cdv_actionable_admin_5_var = set(agreement.loc[agreement.loc[:,"Agreement"]==5,:].index)

In [31]:
# loadind xlsx file with agreement data 
file = path_data / Path("Base Admin Action. Var. - Recreation.xlsx")
with Path.open(file, 'rb') as fp:
    agreement = pd.read_excel(fp, 
                              sheetname='Actionable Variables',
                              parse_cols="A,E",
                              index_col=0,
                              skiprows=[0,1]
                             )

In [32]:
agreement.head()

Unnamed: 0_level_0,Agreement
VAR,Unnamed: 1_level_1
CODGEO,4
LIBGEO,4
REG,4
DEP,4
NB_F101,1


In [33]:
insee_recreation_var = set(agreement.index)

In [34]:
insee_recreation_actionable_admin_1_var = set(agreement.loc[agreement.loc[:,"Agreement"]==1,:].index)
insee_recreation_actionable_admin_2_var = set(agreement.loc[agreement.loc[:,"Agreement"]==2,:].index)
insee_recreation_actionable_admin_3_var = set(agreement.loc[agreement.loc[:,"Agreement"]==3,:].index)
insee_recreation_actionable_admin_4_var = set(agreement.loc[agreement.loc[:,"Agreement"]==4,:].index)
insee_recreation_actionable_admin_5_var = set(agreement.loc[agreement.loc[:,"Agreement"]==5,:].index)

In [35]:
# loadind xlsx file with agreement data 
file = path_data / Path("Base Admin Action. Var. - Environment.xlsx")
with Path.open(file, 'rb') as fp:
    agreement = pd.read_excel(fp, 
                              sheetname='Actionable Variables',
                              parse_cols="A,E",
                              index_col=0,
                              skiprows=[0,1,2,3,4]
                             )

In [36]:
agreement.head(8)

Unnamed: 0_level_0,Agreement
VAR,Unnamed: 1_level_1
code,4
communes,4
Superficie.protection.forte...2017..ha.,4
Part.protection.forte...2017....,1
Superficie.protection.contractuelle...2017..ha.,4
Part.protection.contractuelle...2017....,1
Superficie.forêts.et.milieux.semi.naturels...2012..ha.,4
Part.forêts.et.milieux.semi.naturels...2012....,2


In [37]:
insee_environment_var = set(agreement.index)

In [38]:
insee_environment_actionable_admin_1_var = set(agreement.loc[agreement.loc[:,"Agreement"]==1,:].index)
insee_environment_actionable_admin_2_var = set(agreement.loc[agreement.loc[:,"Agreement"]==2,:].index)
insee_environment_actionable_admin_3_var = set(agreement.loc[agreement.loc[:,"Agreement"]==3,:].index)
insee_environment_actionable_admin_4_var = set(agreement.loc[agreement.loc[:,"Agreement"]==4,:].index)
insee_environment_actionable_admin_5_var = set(agreement.loc[agreement.loc[:,"Agreement"]==5,:].index)

In [39]:
# loadind xlsx file with agreement data 
file = path_data / Path("Base Admin Action. Var. - Demographics.xlsx")
with Path.open(file, 'rb') as fp:
    agreement = pd.read_excel(fp, 
                              sheetname='Actionable Variables',
                              parse_cols="B,G",
                              index_col=0,
                              skiprows=[0,1,2,3,4,5,6]
                             )

In [40]:
agreement.head()

Unnamed: 0_level_0,Agreement
VAR_ID,Unnamed: 1_level_1
CODGEO,4
LIBGEO,4
REG,4
DEP,4
P15_POP,3


In [41]:
insee_demographics_var = set(agreement.index)

In [42]:
insee_demographics_actionable_admin_1_var = set(agreement.loc[agreement.loc[:,"Agreement"]==1,:].index)
insee_demographics_actionable_admin_2_var = set(agreement.loc[agreement.loc[:,"Agreement"]==2,:].index)
insee_demographics_actionable_admin_3_var = set(agreement.loc[agreement.loc[:,"Agreement"]==3,:].index)
insee_demographics_actionable_admin_4_var = set(agreement.loc[agreement.loc[:,"Agreement"]==4,:].index)
insee_demographics_actionable_admin_5_var = set(agreement.loc[agreement.loc[:,"Agreement"]==5,:].index)

In [43]:
MergeCommunesEnvi.shape

(11131, 571)

#### Adding calculated variable

In [44]:
# calculation of scores linked to environment
# score_protection_forte [3,6,40,60]
MergeCommunesEnvi['score_protection_forte']=1
MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,'Part.protection.forte...2017....']>3,
                      'score_protection_forte']=2
MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,'Part.protection.forte...2017....']>6,
                      'score_protection_forte']=3
MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,'Part.protection.forte...2017....']>40,
                      'score_protection_forte']=4
MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,'Part.protection.forte...2017....']>60,
                      'score_protection_forte']=5

# score_protection_contractuelle [1,5,10,15]
MergeCommunesEnvi['score_protection_contractuelle']=1
MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,'Part.protection.contractuelle...2017....']>1,
                      'score_protection_contractuelle']=2
MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,'Part.protection.contractuelle...2017....']>5,
                      'score_protection_contractuelle']=3
MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,'Part.protection.contractuelle...2017....']>10,
                      'score_protection_contractuelle']=4
MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,'Part.protection.contractuelle...2017....']>15,
                      'score_protection_contractuelle']=5


# score_part_naturel [15,20,25,30]
MergeCommunesEnvi['score_nature']=1
MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,'Part.forêts.et.milieux.semi.naturels...2012....']>15,
                      'score_nature']=2
MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,'Part.forêts.et.milieux.semi.naturels...2012....']>20,
                      'score_nature']=3
MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,'Part.forêts.et.milieux.semi.naturels...2012....']>25,
                      'score_nature']=4
MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,'Part.forêts.et.milieux.semi.naturels...2012....']>30,
                      'score_nature']=5



# score_protection_forte [1,5,10,20]
MergeCommunesEnvi['score_zone_humide']=1
MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,'Part.zones.humides.et.surfaces.en.eau...2012....']>1,
                      'score_zone_humide']=2
MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,'Part.zones.humides.et.surfaces.en.eau...2012....']>5,
                      'score_zone_humide']=3
MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,'Part.zones.humides.et.surfaces.en.eau...2012....']>10,
                      'score_zone_humide']=4
MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,'Part.zones.humides.et.surfaces.en.eau...2012....']>20,
                      'score_zone_humide']=5

# score_environnement
MergeCommunesEnvi['score_environnement']= MergeCommunesEnvi['score_zone_humide'] \
+ MergeCommunesEnvi['score_nature'] \
+ MergeCommunesEnvi['score_protection_contractuelle'] \
+ MergeCommunesEnvi['score_protection_forte']

In [45]:
insee_environment_score_var = {
    'score_environnement','score_zone_humide','score_nature',
    'score_protection_contractuelle','score_protection_forte'
}

In [46]:
#leasure infrastructure score

In [47]:
recreation_var = {
    'NB_F101_NB_AIREJEU',
    'NB_F102_NB_AIREJEU',
    'NB_F103_NB_AIREJEU',
    'NB_F104_NB_AIREJEU',
    'NB_F105_NB_AIREJEU',
    'NB_F106_NB_AIREJEU',
    'NB_F107_NB_AIREJEU',
    'NB_F108_NB_AIREJEU',
    'NB_F109_NB_AIREJEU',
    'NB_F110_NB_AIREJEU',
    'NB_F111_NB_AIREJEU',
    'NB_F112_NB_AIREJEU',
    'NB_F113_NB_AIREJEU',
    'NB_F114_NB_AIREJEU',
    'NB_F116_NB_AIREJEU',
    'NB_F117_NB_AIREJEU',
    'NB_F118_NB_AIREJEU',
    'NB_F119_NB_AIREJEU',
    'NB_F120_NB_AIREJEU',
    'NB_F121_NB_AIREJEU',
    'NB_F201_NB_AIREJEU',
    'NB_F202_NB_AIREJEU',
    'NB_F203_NB_AIREJEU',
    'NB_F302',
    'NB_F303',
    'NB_F304',
    'NB_F305'   
}
for col in recreation_var:
    MergeCommunesEnvi[col+'_FLG']= 0
    MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,col]>0,col+'_FLG']= 1
MergeCommunesEnvi['recreation_score']= 0
for col in recreation_var:
    MergeCommunesEnvi['recreation_score'] += MergeCommunesEnvi[col+'_FLG']

In [48]:
police_var = {'NB_A101', 'NB_A104'}
for col in police_var:
    MergeCommunesEnvi[col+'_FLG']= 0
    MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,col]>0,col+'_FLG']= 1
MergeCommunesEnvi['police_score']= 0
for col in police_var:
    MergeCommunesEnvi['police_score'] += MergeCommunesEnvi[col+'_FLG']

In [49]:
justice_var = {
    'NB_A105', 'NB_A106', 'NB_A107', 'NB_A108', 'NB_A109',
    'NB_A115','NB_A119','NB_A120','NB_A121','NB_A122','NB_A123','NB_A124','NB_A125','NB_A126'
}
for col in justice_var:
    MergeCommunesEnvi[col+'_FLG']= 0
    MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,col]>0,col+'_FLG']= 1
MergeCommunesEnvi['justice_score']= 0
for col in justice_var:
    MergeCommunesEnvi['justice_score'] += MergeCommunesEnvi[col+'_FLG']

In [50]:
service_var = {
    'NB_A203','NB_A205','NB_A206','NB_A207','NB_A208',
    'NB_A301','NB_A302','NB_A303','NB_A304',
    'NB_A401','NB_A402','NB_A403','NB_A404','NB_A405','NB_A406',
    'NB_A501','NB_A502','NB_A503','NB_A504','NB_A505','NB_A506','NB_A507'    
}
for col in service_var:
    MergeCommunesEnvi[col+'_FLG']= 0
    MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,col]>0,col+'_FLG']= 1
MergeCommunesEnvi['service_score']= 0
for col in service_var:
    MergeCommunesEnvi['service_score'] += MergeCommunesEnvi[col+'_FLG']

In [51]:
senior_var = {
    'NB_D401','NB_D402','NB_D403','NB_D404','NB_D405'
}
for col in senior_var:
    MergeCommunesEnvi[col+'_FLG']= 0
    MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,col]>0,col+'_FLG']= 1
MergeCommunesEnvi['senior_score']= 0
for col in senior_var:
    MergeCommunesEnvi['senior_score'] += MergeCommunesEnvi[col+'_FLG']

In [52]:
child_var = {
    'NB_D502','NB_D701','NB_D702'
}
for col in child_var:
    MergeCommunesEnvi[col+'_FLG']= 0
    MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,col]>0,col+'_FLG']= 1
MergeCommunesEnvi['child_score']= 0
for col in child_var:
    MergeCommunesEnvi['child_score'] += MergeCommunesEnvi[col+'_FLG']

In [53]:
disability_var = {
    'NB_D601','NB_D602','NB_D603','NB_D604','NB_D605','NB_D606'
}
for col in disability_var:
    MergeCommunesEnvi[col+'_FLG']= 0
    MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,col]>0,col+'_FLG']= 1
    
MergeCommunesEnvi['disability_score']= 0
for col in disability_var:
    MergeCommunesEnvi['disability_score'] += MergeCommunesEnvi[col+'_FLG']

In [54]:
rehabilitation_var = {
    'NB_D703','NB_D704','NB_D705','NB_D709'
}
for col in rehabilitation_var:
    MergeCommunesEnvi[col+'_FLG']= 0
    MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,col]>0,col+'_FLG']= 1
MergeCommunesEnvi['rehabilitation_score']= 0
for col in rehabilitation_var:
    MergeCommunesEnvi['rehabilitation_score'] += MergeCommunesEnvi[col+'_FLG']

In [55]:
health_var = {
    'NB_D101','NB_D102','NB_D103','NB_D104','NB_D105',
    'NB_D106','NB_D107','NB_D108','NB_D109','NB_D110',
    'NB_D111','NB_D112','NB_D113',
    'NB_D301','NB_D302','NB_D303','NB_D304'
}
for col in health_var:
    MergeCommunesEnvi[col+'_FLG']= 0
    MergeCommunesEnvi.loc[MergeCommunesEnvi.loc[:,col]>0,col+'_FLG']= 1
MergeCommunesEnvi['health_score']= 0
for col in health_var:
    MergeCommunesEnvi['health_score'] += MergeCommunesEnvi[col+'_FLG']

In [56]:
insee_recreation_score_var = {
    'recreation_score','police_score','justice_score',
    'service_score','senior_score','child_score',
    'disability_score','rehabilitation_score','health_score'
}

In [57]:
var = recreation_var | police_var | justice_var | service_var | senior_var
var = var | child_var | disability_var | rehabilitation_var| health_var
var_FLG = {v+'_FLG' for v in var}
additional_var = insee_environment_score_var | insee_recreation_score_var | var_FLG

In [58]:
len(quant_var)

106

In [59]:
all_var = cdv_var | insee_var | additional_var
scope_2015_var = cdv_2015_var | insee_var | additional_var
scope_2016_var = cdv_2016_var | insee_var | additional_var
scope_2017_var = cdv_2017_var | insee_var | additional_var
scope_2018_var = cdv_2018_var | insee_var | additional_var
scope_2015_2018_var = cdv_2015_2018_var | insee_var | additional_var
scope_2016_2018_var = cdv_2016_2018_var | insee_var | additional_var
scope_2017_2018_var = cdv_2017_2018_var | insee_var | additional_var

synthesis_var = non_redundant_cdv_var | insee_environment_score_var | insee_recreation_score_var

cat_var = cat_var | var_FLG | {'DEP', 'LIBGEO', 'communes'}
cat_max9_var = cat_max9_var | var_FLG
cat_min10_var = cat_min10_var | {'DEP', 'LIBGEO','communes'}
quant_var = quant_var | insee_quant_var | insee_environment_score_var | insee_recreation_score_var

exclusion_var = com_var | tech_var | bizz_var | text_var 

# usual scopes
# suppresion of quantitative variables with more than 200 NaN for usual scope
quant_null = np.sum(MergeCommunesEnvi.loc[:,quant_var].isnull())
quant_kept_var = set(quant_null[quant_null < 200].index)
usual_common_scope_var = ((cat_max9_var | quant_kept_var) & scope_2015_2018_var) - exclusion_var
usual_synthetic_scope_var = usual_common_scope_var & synthesis_var

In [60]:
dict_var_groups = {
    'insee_var' : insee_var,
    'cdv_var' : cdv_var,
    'additional_var' : additional_var,
    'all_var' : all_var,
    'insee_demographics_var' : insee_demographics_var,
    'insee_recreation_var' : insee_recreation_var,
    'insee_environment_var' : insee_environment_var,
    'cdv_2015_var' : cdv_2015_var,
    'cdv_2016_var' : cdv_2016_var,
    'cdv_2017_var' : cdv_2017_var,
    'cdv_2018_var' : cdv_2018_var,
    'cdv_2015_2018_var' : cdv_2015_2018_var,
    'cdv_2016_2018_var' : cdv_2016_2018_var,
    'cdv_2017_2018_var' : cdv_2017_2018_var,
    'pred_var' : pred_var,
    'tech_var' : tech_var,
    'com_var' : com_var,
    'text_var' : text_var,
    'bizz_var' : bizz_var,
    'calc_cdv_var' : calc_cdv_var,
    'non_redundant_cdv_var' : non_redundant_cdv_var,
    'scope_2015_var' : scope_2015_var,
    'scope_2016_var' : scope_2016_var,
    'scope_2017_var' : scope_2017_var,
    'scope_2018_var' : scope_2018_var,
    'scope_2015_2018_var' : scope_2015_2018_var,
    'scope_2016_2018_var' : scope_2016_2018_var,
    'scope_2017_2018_var' : scope_2017_2018_var,
    'cat_var' : cat_var,
    'cat_max9_var' : cat_max9_var,
    'cat_min10_var' : cat_min10_var,
    'quant_var' : quant_var,
    'exclusion_var' : exclusion_var,
    'quant_kept_var' : quant_kept_var,
    'cdv_actionable_individual_1_var' : cdv_actionable_individual_1_var,
    'cdv_actionable_individual_2_var': cdv_actionable_individual_2_var,
    'cdv_actionable_individual_3_var' : cdv_actionable_individual_3_var,
    'cdv_actionable_individual_4_var' : cdv_actionable_individual_4_var,
    'cdv_actionable_admin_1_var': cdv_actionable_admin_1_var,
    'cdv_actionable_admin_2_var' : cdv_actionable_admin_2_var,
    'cdv_actionable_admin_3_var' : cdv_actionable_admin_3_var,
    'cdv_actionable_admin_4_var' : cdv_actionable_admin_4_var,
    'cdv_actionable_admin_5_var' : cdv_actionable_admin_5_var,
    'insee_recreation_actionable_admin_1_var' : insee_recreation_actionable_admin_1_var,
    'insee_recreation_actionable_admin_2_var' : insee_recreation_actionable_admin_2_var,
    'insee_recreation_actionable_admin_3_var' : insee_recreation_actionable_admin_3_var,
    'insee_recreation_actionable_admin_4_var' : insee_recreation_actionable_admin_4_var,
    'insee_recreation_actionable_admin_5_var' : insee_recreation_actionable_admin_5_var,
    'insee_environment_actionable_admin_1_var' : insee_environment_actionable_admin_1_var,
    'insee_environment_actionable_admin_2_var' : insee_environment_actionable_admin_2_var,
    'insee_environment_actionable_admin_3_var' : insee_environment_actionable_admin_3_var,
    'insee_environment_actionable_admin_4_var' : insee_environment_actionable_admin_4_var,
    'insee_environment_actionable_admin_5_var' : insee_environment_actionable_admin_5_var,
    'insee_demographics_actionable_admin_1_var' : insee_demographics_actionable_admin_1_var,
    'insee_demographics_actionable_admin_2_var' : insee_demographics_actionable_admin_2_var,
    'insee_demographics_actionable_admin_3_var' : insee_demographics_actionable_admin_3_var,
    'insee_demographics_actionable_admin_4_var' : insee_demographics_actionable_admin_4_var,
    'insee_demographics_actionable_admin_5_var' : insee_demographics_actionable_admin_5_var,
    'insee_environment_score_var' : insee_environment_score_var,
    'insee_recreation_score_var' : insee_recreation_score_var,
    'usual_common_scope_var' : usual_common_scope_var,
    'usual_synthetic_scope_var' : usual_synthetic_scope_var    
}

In [61]:
[(k, len(dict_var_groups[k]),len(dict_var_groups[k]&usual_common_scope_var)) for k in dict_var_groups.keys()]

[('insee_var', 218, 203),
 ('cdv_var', 353, 204),
 ('additional_var', 114, 114),
 ('all_var', 685, 521),
 ('insee_demographics_var', 36, 29),
 ('insee_recreation_var', 178, 174),
 ('insee_environment_var', 10, 0),
 ('cdv_2015_var', 271, 204),
 ('cdv_2016_var', 280, 204),
 ('cdv_2017_var', 297, 204),
 ('cdv_2018_var', 352, 204),
 ('cdv_2015_2018_var', 267, 204),
 ('cdv_2016_2018_var', 280, 204),
 ('cdv_2017_2018_var', 296, 204),
 ('pred_var', 1, 1),
 ('tech_var', 7, 0),
 ('com_var', 6, 0),
 ('text_var', 1, 0),
 ('bizz_var', 42, 0),
 ('calc_cdv_var', 0, 0),
 ('non_redundant_cdv_var', 278, 169),
 ('scope_2015_var', 603, 521),
 ('scope_2016_var', 612, 521),
 ('scope_2017_var', 629, 521),
 ('scope_2018_var', 684, 521),
 ('scope_2015_2018_var', 599, 521),
 ('scope_2016_2018_var', 612, 521),
 ('scope_2017_2018_var', 628, 521),
 ('cat_var', 350, 251),
 ('cat_max9_var', 321, 251),
 ('cat_min10_var', 29, 0),
 ('quant_var', 335, 270),
 ('exclusion_var', 56, 0),
 ('quant_kept_var', 280, 270),
 ('c

In [62]:
filename = path_dump / Path("dict_var_groups.sav")
with open(filename, 'wb') as fp:
     pickle.dump(dict_var_groups,fp,pickle.HIGHEST_PROTOCOL)

In [63]:
MergeCommunesEnvi.shape

(11131, 685)

In [64]:
var_FLG in usual_common_scope_var

False

#### Dataset

In [65]:
df = MergeCommunesEnvi.loc[:,:]
df.loc[:,cdv_ssfmt.columns] = cdv_ssfmt.loc[:,:]
# scope limited to usual_common_scope_var
scope = usual_common_scope_var
print(f"scope of {len(scope)} variables")
df = df.loc[:,scope]
# getting string values for categorial variables of CVD to name columns
cat_cdv_scope = (cat_var & scope & cdv_var) - {'HEUREUX'}    
df.loc[:,cat_cdv_scope] = cdv.loc[:,cat_cdv_scope]

dataset = pd.get_dummies(
    df, 
    columns= cat_var & usual_common_scope_var - {"HEUREUX"},
    dummy_na = True,
    drop_first=1
)
print(f"{dataset.shape[1]} columns after encoding of {len((cat_var & usual_common_scope_var))-1} \
categorial variables in {len((cat_var & usual_common_scope_var))-1+dataset.shape[1]-df.shape[1]} \
binary variables (K-1 one hot encoding)")

scope of 521 variables
1024 columns after encoding of 250 categorial variables in 753 binary variables (K-1 one hot encoding)


In [66]:
# saving dataset data
file = path_data / Path("dataset.csv")
with Path.open(file, 'w') as fp:
    dataset.to_csv(fp,  encoding='utf-8')

In [67]:
idx_2017_2018 = MergeCommunesEnvi.loc[MergeCommunesEnvi['ANNEEFUZ'].isin([39,40]),:].index
df = MergeCommunesEnvi.loc[idx_2017_2018,:]
df.loc[:,cdv_ssfmt.columns] = cdv_ssfmt.loc[idx_2017_2018,:]


f_2017_2018 = ((cat_max9_var | quant_kept_var) & scope_2017_2018_var) - exclusion_var

df = df.loc[:,f_2017_2018]
df.loc[:,(cat_var & cdv_var & f_2017_2018 ) - {"HEUREUX"}] = cdv.loc[idx_2017_2018,
                                                                     (cat_var & cdv_var & f_2017_2018) - {"HEUREUX"}]


dataset_2017_2018 = pd.get_dummies(
    df, 
    columns=(cat_var & f_2017_2018) - {"HEUREUX"},
    dummy_na = True,
    drop_first=1
)

print(f"{dataset_2017_2018.shape[1]} columns after encoding of {len((cat_var & f_2017_2018))-1} \
categorial variables in {len((cat_var & f_2017_2018))-1+dataset_2017_2018.shape[1]-df.shape[1]} \
binary variables (K-1 one hot encoding)")

1036 columns after encoding of 259 categorial variables in 765 binary variables (K-1 one hot encoding)


In [68]:
# saving dataset data
file = path_data / Path("dataset_2017_2018.csv")
with Path.open(file, 'w') as fp:
    dataset_2017_2018.to_csv(fp,  encoding='utf-8')

#### Construction of usefull feature sets
Including ...
Lasso or other feature selection methods, ....


In [69]:
dict_features_sets = dict()

In [70]:
list_var_set = [    
    'all_var',
    'cdv_var',
    'additional_var',
    'insee_var',
    'insee_demographics_var',
    'insee_recreation_var',
    'insee_environment_var',
    'cat_var',
    'quant_var',
    'calc_cdv_var',
    'non_redundant_cdv_var',
    'cdv_actionable_individual_1_var',
    'cdv_actionable_individual_2_var',
    'cdv_actionable_individual_3_var',
    'cdv_actionable_individual_4_var',
    'cdv_actionable_admin_1_var',
    'cdv_actionable_admin_2_var',
    'cdv_actionable_admin_3_var',
    'cdv_actionable_admin_4_var',
    'cdv_actionable_admin_5_var',
    'insee_recreation_actionable_admin_1_var',
    'insee_recreation_actionable_admin_2_var',
    'insee_recreation_actionable_admin_3_var',
    'insee_recreation_actionable_admin_4_var',
    'insee_recreation_actionable_admin_5_var',
    'insee_environment_actionable_admin_1_var',
    'insee_environment_actionable_admin_2_var',
    'insee_environment_actionable_admin_3_var',
    'insee_environment_actionable_admin_4_var',
    'insee_environment_actionable_admin_5_var',
    'insee_demographics_actionable_admin_1_var',
    'insee_demographics_actionable_admin_2_var',
    'insee_demographics_actionable_admin_3_var',
    'insee_demographics_actionable_admin_4_var',
    'insee_demographics_actionable_admin_5_var',
    'insee_environment_score_var',
    'insee_recreation_score_var',
    'usual_common_scope_var',
    'usual_synthetic_scope_var'
]

In [71]:
# usefull subset of features of usual_common_scope_features (based on usual_common_scope_var)
for var_set in list_var_set:
    print(f"-----{var_set}------")
    # starting with 533 variable of MergeCommunesEnvi including additional var
    df = MergeCommunesEnvi.loc[:,:]
    df.loc[:,cdv_ssfmt.columns] = cdv_ssfmt.loc[:,:]
    scope = dict_var_groups[var_set] & usual_common_scope_var    
    print(f"scope of len {len(scope)}")
    df = df.loc[:,scope]
    # getting expliciyte names ...
    cat_cdv_scope = (cat_var & scope & cdv_var) - {'HEUREUX'}    
    df.loc[:,cat_cdv_scope] = cdv.loc[:,cat_cdv_scope]
    print(df.shape)
    if cat_var & scope - {'HEUREUX'}:
        print(f"{len(cat_var & scope)} categorial variables in scope")
        df_dummies = pd.get_dummies(
            df, 
            columns=(cat_var & scope) - {'HEUREUX'},
            dummy_na = True,
            drop_first=1
        )
        
    else:
        print(f"No categorial variables in scope")
        df_dummies = df 
    if df_dummies.shape[1] > 0:
        dict_features_sets[var_set[:-3]+'features'] = set(df_dummies.columns)
    else :
        print(f"empty scope for {var_set[:-3]+'features'}")
    print(f"{df_dummies.shape[1]} columns after encoding of {len((cat_var & scope - {'HEUREUX'}))} categorial \
variables in {len((cat_var & scope))+df_dummies.shape[1]-df.shape[1]} binary variables \
(K-1 one hot encoding)")    

-----all_var------
scope of len 521
(11131, 521)
251 categorial variables in scope
1024 columns after encoding of 250 categorial variables in 754 binary variables (K-1 one hot encoding)
-----cdv_var------
scope of len 204
(11131, 204)
151 categorial variables in scope
607 columns after encoding of 150 categorial variables in 554 binary variables (K-1 one hot encoding)
-----additional_var------
scope of len 114
(11131, 114)
100 categorial variables in scope
214 columns after encoding of 100 categorial variables in 200 binary variables (K-1 one hot encoding)
-----insee_var------
scope of len 203
(11131, 203)
No categorial variables in scope
203 columns after encoding of 0 categorial variables in 0 binary variables (K-1 one hot encoding)
-----insee_demographics_var------
scope of len 29
(11131, 29)
No categorial variables in scope
29 columns after encoding of 0 categorial variables in 0 binary variables (K-1 one hot encoding)
-----insee_recreation_var------
scope of len 174
(11131, 174)
N

In [72]:
[(k,len(dict_features_sets[k]), 
  len(dict_var_groups[k[:-8]+'var']&usual_common_scope_var),
  len(dict_var_groups[k[:-8]+'var'])) for k in dict_features_sets.keys()]

[('all_features', 1024, 521, 685),
 ('cdv_features', 607, 204, 353),
 ('additional_features', 214, 114, 114),
 ('insee_features', 203, 203, 218),
 ('insee_demographics_features', 29, 29, 36),
 ('insee_recreation_features', 174, 174, 178),
 ('cat_features', 754, 251, 350),
 ('quant_features', 270, 270, 335),
 ('non_redundant_cdv_features', 478, 169, 278),
 ('cdv_actionable_individual_1_features', 193, 87, 145),
 ('cdv_actionable_individual_2_features', 230, 74, 89),
 ('cdv_actionable_individual_3_features', 115, 30, 49),
 ('cdv_actionable_individual_4_features', 69, 13, 71),
 ('cdv_actionable_admin_1_features', 201, 92, 141),
 ('cdv_actionable_admin_2_features', 198, 61, 84),
 ('cdv_actionable_admin_3_features', 60, 18, 24),
 ('cdv_actionable_admin_4_features', 76, 14, 77),
 ('cdv_actionable_admin_5_features', 72, 19, 28),
 ('insee_recreation_actionable_admin_1_features', 166, 166, 167),
 ('insee_recreation_actionable_admin_3_features', 7, 7, 7),
 ('insee_recreation_actionable_admin_4_f

In [73]:
filename = path_dump / Path("dict_features_sets.sav")
with open(filename, 'wb') as fp:
     pickle.dump(dict_features_sets,fp,pickle.HIGHEST_PROTOCOL)

In [74]:
[k for k in dict_features_sets.keys()]

['all_features',
 'cdv_features',
 'additional_features',
 'insee_features',
 'insee_demographics_features',
 'insee_recreation_features',
 'cat_features',
 'quant_features',
 'non_redundant_cdv_features',
 'cdv_actionable_individual_1_features',
 'cdv_actionable_individual_2_features',
 'cdv_actionable_individual_3_features',
 'cdv_actionable_individual_4_features',
 'cdv_actionable_admin_1_features',
 'cdv_actionable_admin_2_features',
 'cdv_actionable_admin_3_features',
 'cdv_actionable_admin_4_features',
 'cdv_actionable_admin_5_features',
 'insee_recreation_actionable_admin_1_features',
 'insee_recreation_actionable_admin_3_features',
 'insee_recreation_actionable_admin_4_features',
 'insee_demographics_actionable_admin_1_features',
 'insee_demographics_actionable_admin_2_features',
 'insee_demographics_actionable_admin_3_features',
 'insee_demographics_actionable_admin_4_features',
 'insee_demographics_actionable_admin_5_features',
 'insee_environment_score_features',
 'insee_rec

#### Feature selection and and results recording 

In [75]:
# reducing problem to a 2 class classification problem
df = dataset.loc[:,:]
df["HEUREUX_CLF"] = 0
df.loc[df["HEUREUX"]==4, "HEUREUX_CLF"] = 1
df.loc[df["HEUREUX"]==3, "HEUREUX_CLF"] = 1
df.loc[df["HEUREUX"]==5, "HEUREUX_CLF"] = None

# treating remaining missing values
features = set(df.columns.drop(['HEUREUX', 'HEUREUX_CLF']))
df = df.loc[:,features | {"HEUREUX_CLF"}].dropna()

In [76]:
X = df.loc[:,features]
y = df["HEUREUX_CLF"]

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42
                                                   )

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(f"Number exemple: {y.shape[0]}\n- training set: \
{y_train.shape[0]}\n- test set: {y_test.shape[0]}")
print(f"Number of features: p={X_train.shape[1]}")
print(f"Number of class: {len(np.unique(y))}")
for c in np.unique(y):
    print(f"class {c:0.0f} : {100*np.sum(y==c)/len(y):0.1f}%")

Number exemple: 10445
- training set: 8356
- test set: 2089
Number of features: p=1023
Number of class: 2
class 0 : 35.1%
class 1 : 64.9%


In [77]:
clf = LinearSVC(C=0.01, 
                class_weight='balanced',
                dual=False,
                random_state=42 )
step = 0.05

In [78]:
for n_features_to_select in [100,50,20,10]:
    startTime = time.time()
    print(f"number of features to select : {n_features_to_select}")
    selector = RFE(estimator=clf, n_features_to_select=n_features_to_select, step=step)
    selector.fit(X_train, y_train)
    print(f"Optimal support of size {n_features_to_select} found in {time.time() - startTime:0.1f} s")
    key = "RFE_LinearSVC_" + str(n_features_to_select) + "_features" 
    dict_features_sets[key] = set(X.loc[:,selector.support_].columns)

number of features to select : 100
Optimal support of size 100 found in 168.4 s
number of features to select : 50
Optimal support of size 50 found in 213.5 s
number of features to select : 20
Optimal support of size 20 found in 195.1 s
number of features to select : 10
Optimal support of size 10 found in 217.7 s


In [79]:
params = {'max_features' :'sqrt', 'random_state' : 32, 
          'min_samples_split' : 2, 'class_weight' : 'balanced',
          'n_estimators' : 128,
          'max_depth' : 8}
clf = RandomForestClassifier(**params)
step = 0.05

In [80]:
for n_features_to_select in [100,50,20,10]:
    startTime = time.time()
    print(f"number of features to select : {n_features_to_select}")
    selector = RFE(estimator=clf, n_features_to_select=n_features_to_select, step=step)
    selector.fit(X_train, y_train)
    print(f"Optimal support of size {n_features_to_select} found in {time.time() - startTime:0.1f} s")
    key = "RFE_RandomForestClassifier_" + str(n_features_to_select) + "_features" 
    dict_features_sets[key] = set(X.loc[:,selector.support_].columns)

number of features to select : 100
Optimal support of size 100 found in 89.9 s
number of features to select : 50
Optimal support of size 50 found in 79.3 s
number of features to select : 20
Optimal support of size 20 found in 89.6 s
number of features to select : 10
Optimal support of size 10 found in 83.0 s


In [81]:
clf = LogisticRegression(C=0.01, 
                         penalty='l1', 
                         class_weight='balanced',
                         random_state=42)
step = 0.05

In [82]:
for n_features_to_select in [100,50,20,10]:
    startTime = time.time()
    print(f"number of features to select : {n_features_to_select}")
    selector = RFE(estimator=clf, n_features_to_select=n_features_to_select, step=step)
    selector.fit(X_train, y_train)
    print(f"Optimal support of size {n_features_to_select} found in {time.time() - startTime:0.1f} s")
    key = "RFE_LogisticRegression_" + str(n_features_to_select) + "_features" 
    dict_features_sets[key] = set(X.loc[:,selector.support_].columns)

number of features to select : 100
Optimal support of size 100 found in 11.2 s
number of features to select : 50
Optimal support of size 50 found in 12.3 s
number of features to select : 20
Optimal support of size 20 found in 8.9 s
number of features to select : 10
Optimal support of size 10 found in 8.7 s


#### SelectFromModel

In [83]:
clf = LinearSVC(C=0.01, penalty="l1", dual=False, class_weight='balanced' ).fit(X_train, y_train)
model = SelectFromModel(clf, prefit=True)
dict_features_sets['SelectFromModel_LinearSCV_features'] = set(X.loc[:,model.get_support()].columns)

In [84]:
clf = LogisticRegression(C=0.01, penalty="l1",class_weight='balanced',random_state=42 ).fit(X_train, y_train)
model = SelectFromModel(clf, prefit=True)
dict_features_sets['SelectFromModel_LogisticRegression_features'] = set(X.loc[:,model.get_support()].columns)

In [85]:
filename = path_dump / Path("dict_features_sets.sav")
with open(filename, 'wb') as fp:
     pickle.dump(dict_features_sets,fp,pickle.HIGHEST_PROTOCOL)

In [86]:
[k for k in dict_features_sets.keys()]

['all_features',
 'cdv_features',
 'additional_features',
 'insee_features',
 'insee_demographics_features',
 'insee_recreation_features',
 'cat_features',
 'quant_features',
 'non_redundant_cdv_features',
 'cdv_actionable_individual_1_features',
 'cdv_actionable_individual_2_features',
 'cdv_actionable_individual_3_features',
 'cdv_actionable_individual_4_features',
 'cdv_actionable_admin_1_features',
 'cdv_actionable_admin_2_features',
 'cdv_actionable_admin_3_features',
 'cdv_actionable_admin_4_features',
 'cdv_actionable_admin_5_features',
 'insee_recreation_actionable_admin_1_features',
 'insee_recreation_actionable_admin_3_features',
 'insee_recreation_actionable_admin_4_features',
 'insee_demographics_actionable_admin_1_features',
 'insee_demographics_actionable_admin_2_features',
 'insee_demographics_actionable_admin_3_features',
 'insee_demographics_actionable_admin_4_features',
 'insee_demographics_actionable_admin_5_features',
 'insee_environment_score_features',
 'insee_rec