In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
import time
import matplotlib.pyplot as plt
%matplotlib inline
#%pylab inline
import itertools
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV, RFE
from sklearn.utils import resample

In [2]:
# Project folders, all located under the current user's home directory.
path_project = Path.home().joinpath('Google Drive/Felix')
path_data = path_project.joinpath("data")  # CSV inputs and outputs
path_dump = path_project.joinpath("dump")  # pickled intermediate objects

In [3]:
# Read options shared by the three tables: cp1252 decoding, single-pass dtype inference.
csv_options = dict(encoding='cp1252', low_memory=False)

# Loading cdv data (index taken from the first CSV column).
file = path_data.joinpath("felix.csv")
with file.open('rb') as fp:
    cdv = pd.read_csv(fp, index_col=0, **csv_options)

# Loading cdv data without format (same layout as above).
file = path_data.joinpath("felix_ssfmt.csv")
with file.open('rb') as fp:
    cdv_ssfmt = pd.read_csv(fp, index_col=0, **csv_options)

# Loading MergeCommunesEnvi data: ';'-separated, index taken from the second column.
file = path_data.joinpath("MergeCommunesEnvi.csv")
with file.open('rb') as fp:
    MergeCommunesEnvi = pd.read_csv(fp, sep=';', index_col=1, **csv_options)

In [44]:
# Restore the pickled dictionary of variable groups.
# NOTE: unpickling can execute arbitrary code; only load dumps produced by this project.
filename = path_dump.joinpath("dict_var_groups.sav")
with filename.open('rb') as fp:
    dict_var_groups = pickle.load(fp)

# Groups pulled out as top-level names for the cells that follow.
usual_common_scope = dict_var_groups['usual_common_scope']
cat_var, cat_max9_var, quant_var = (
    dict_var_groups[key] for key in ('cat_var', 'cat_max9_var', 'quant_var')
)

## features scopes

In [30]:
# Start from an explicit copy so the assignments below can never write back into
# MergeCommunesEnvi (which other cells rebuild their frames from).
df = MergeCommunesEnvi.copy()
df.loc[:, cdv_ssfmt.columns] = cdv_ssfmt.loc[:, :]

# pandas label indexers and get_dummies(columns=...) expect ordered list-likes:
# a set indexer raises TypeError in pandas >= 2.0. Sorting also makes the column
# order reproducible from one session to the next.
scope_cols = sorted(usual_common_scope)
cat_cols = sorted((cat_var & usual_common_scope) - {"HEUREUX"})

df = df.loc[:, scope_cols]
# Categorical columns take their values from cdv; HEUREUX keeps its cdv_ssfmt values.
df.loc[:, cat_cols] = cdv.loc[:, cat_cols]

# K-1 one-hot encoding, with an extra indicator column for missing values.
df_dummies = pd.get_dummies(
    df,
    columns=cat_cols,
    dummy_na=True,
    drop_first=True,
)

# Counts are derived from the encoded column list itself rather than from "len - 1",
# so they stay right even if HEUREUX is not part of the categorical set.
n_cat = len(cat_cols)
n_binary = n_cat + df_dummies.shape[1] - df.shape[1]
print(
    f"{df_dummies.shape[1]} columns after encoding of {n_cat} categorial "
    f"variables in {n_binary} binary variables "
    "(K-1 one hot encoding)"
)

836 columns after encoding of 155 categorial variables in 572 binary variables (K-1 one hot encoding)


In [31]:
# Registry of named feature sets, filled in by the cells below.
dict_features_sets = {}

In [32]:
# Record every column name of the encoded frame under 'usual_common_features'.
usual_common_features = {column for column in df_dummies.columns}
dict_features_sets.update(usual_common_features=usual_common_features)

In [33]:
# Pull the four "act" / "semi_act" variable groups out of the loaded dictionary.
indiv_semi_act_var, indiv_act_var, admin_semi_act_var, admin_act_var = (
    dict_var_groups[group]
    for group in (
        "indiv_semi_act_var",
        "indiv_act_var",
        "admin_semi_act_var",
        "admin_act_var",
    )
)

In [49]:
# Variables present in both indiv_act_var and usual_common_scope.
scope = indiv_act_var.intersection(usual_common_scope)
scope

{'ASSOAUTR',
 'ASSOCONF',
 'ASSOCONS',
 'ASSOCULT',
 'ASSOENVI',
 'ASSOHUMA',
 'ASSOJEUN',
 'ASSOPARE',
 'ASSOPOLI',
 'ASSOSPOR',
 'ASSOSYND',
 'FREQBIBL',
 'FREQCINE',
 'FREQSPOR',
 'FREQTELE',
 'NOT_AMIS',
 'NOT_COHE',
 'NOT_FAMI',
 'NOT_LIBR',
 'NOT_POLI',
 'NOT_PROF',
 'RELIGION',
 'VACANCES'}

In [50]:
# Start from an explicit copy so the assignments below can never write back into
# MergeCommunesEnvi (which other cells rebuild their frames from).
df = MergeCommunesEnvi.copy()
df.loc[:, cdv_ssfmt.columns] = cdv_ssfmt.loc[:, :]

# pandas label indexers and get_dummies(columns=...) expect ordered list-likes:
# a set indexer raises TypeError in pandas >= 2.0. Sorting also makes the column
# order reproducible from one session to the next.
scope_cols = sorted(scope)
encoded_cols = sorted(cat_var & scope)
# Same columns minus HEUREUX, which is never overwritten with cdv values.
formatted_cols = [col for col in encoded_cols if col != "HEUREUX"]

df = df.loc[:, scope_cols]
df.loc[:, formatted_cols] = cdv.loc[:, formatted_cols]

# K-1 one-hot encoding, with an extra indicator column for missing values.
df_dummies = pd.get_dummies(
    df,
    columns=encoded_cols,
    dummy_na=True,
    drop_first=True,
)

n_cat = len(encoded_cols)
n_binary = n_cat + df_dummies.shape[1] - df.shape[1]
print(
    f"{df_dummies.shape[1]} columns after encoding of {n_cat} categorial "
    f"variables in {n_binary} binary variables "
    "(K-1 one hot encoding)"
)

50 columns after encoding of 13 categorial variables in 40 binary variables (K-1 one hot encoding)


In [51]:
# Record the encoded column names of the current scope under 'indiv_act_features'.
indiv_act_features = {column for column in df_dummies.columns}
dict_features_sets.update(indiv_act_features=indiv_act_features)

## scope corresponding to feature selection

In [61]:
# Start from an explicit copy so the assignments below can never write back into
# MergeCommunesEnvi (which other cells rebuild their frames from).
df = MergeCommunesEnvi.copy()
df.loc[:, cdv_ssfmt.columns] = cdv_ssfmt.loc[:, :]

# pandas label indexers and get_dummies(columns=...) expect ordered list-likes:
# a set indexer raises TypeError in pandas >= 2.0. Sorting also makes the column
# order reproducible from one session to the next.
scope_cols = sorted(usual_common_scope)
cat_cols = sorted((cat_var & usual_common_scope) - {"HEUREUX"})

df = df.loc[:, scope_cols]
# Categorical columns take their values from cdv; HEUREUX keeps its cdv_ssfmt values.
df.loc[:, cat_cols] = cdv.loc[:, cat_cols]

# K-1 one-hot encoding, with an extra indicator column for missing values.
# HEUREUX is not encoded: it stays as a single column in the resulting frame.
df = pd.get_dummies(
    df,
    columns=cat_cols,
    dummy_na=True,
    drop_first=True,
)

In [62]:
# Dimensions (rows, columns) of the frame produced by the encoding cell above.
df.shape

(11131, 836)

In [63]:
# Reduce HEUREUX to a two-class target:
#   codes 3 and 4 -> 1, code 5 ('[nsp]' per the original note) -> missing,
#   any other answered code -> 0.
# A missing HEUREUX also gives a missing target, so unanswered rows are dropped below
# instead of silently landing in class 0. The column is float from the start so that
# NaN can be stored without a dtype change.
heureux = df["HEUREUX"]
df["HEUREUX_CLF"] = heureux.isin([3, 4]).astype(float)
df.loc[heureux.isna() | heureux.eq(5), "HEUREUX_CLF"] = np.nan

# Feature columns in the frame's own (deterministic) order. `.loc` needs a list,
# since set indexers raise TypeError in pandas >= 2.0; `features` is still exposed
# as a set, as before.
feature_cols = list(df.columns.drop(['HEUREUX', 'HEUREUX_CLF']))
features = set(feature_cols)

# Treat remaining missing values: drop every row with a missing feature or target.
df_tmp = df.loc[:, feature_cols + ["HEUREUX_CLF"]].dropna()

X = df_tmp.loc[:, feature_cols]
y = df_tmp["HEUREUX_CLF"]

In [64]:
# Hold out 20% of the rows as a test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise both splits with statistics learned on the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Summary of the split.
n_total, n_train, n_test = y.shape[0], y_train.shape[0], y_test.shape[0]
print(f"Number exemple: {n_total}\n- training set: {n_train}\n- test set: {n_test}")
print(f"Number of features: p={X_train.shape[1]}")

# Class balance over the full target vector.
classes = np.unique(y)
print(f"Number of class: {len(classes)}")
for c in classes:
    share = 100 * np.sum(y == c) / len(y)
    print(f"class {c:0.0f} : {share:0.1f}%")

Number exemple: 10445
- training set: 8356
- test set: 2089
Number of features: p=835
Number of class: 2
class 0 : 35.1%
class 1 : 64.9%


In [65]:
# Recursive feature elimination driven by an L1-penalised logistic regression.
startTime = time.time()
n_features_to_select = 20
step = 0.05  # fractional step: share of the features removed at each RFE iteration

# penalty='l1' is only accepted by the 'liblinear' and 'saga' solvers. 'liblinear' used
# to be the library default; the current default ('lbfgs') raises a ValueError with an
# L1 penalty, so the solver is pinned explicitly.
clf = LogisticRegression(C=1,
                         penalty='l1',
                         solver='liblinear',
                         class_weight='balanced',
                         random_state=42)
selector = RFE(estimator=clf, n_features_to_select=n_features_to_select, step=step)
selector.fit(X_train, y_train)
print(f"Optimal support of size {n_features_to_select} found in {time.time() - startTime:0.1f} s")

Optimal support of size 20 found in 841.0 s


In [73]:
# Column names of X kept by the fitted selector (boolean support mask over X's columns).
lasso_20_features = X.columns[selector.support_]

In [75]:
# Turn the selected column names into a set.
lasso_20_features = {feature for feature in lasso_20_features}

In [76]:
# Register the selected features under 'lasso_20_features'.
dict_features_sets.update(lasso_20_features=lasso_20_features)

In [77]:
# Persist the feature-set dictionary in the dump folder.
filename = path_dump.joinpath("dict_features_sets.sav")
with filename.open('wb') as fp:
    pickle.dump(dict_features_sets, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [78]:
# Saving the cleaned dataset (features + HEUREUX_CLF, rows with missing values removed).
# The text handle itself is opened as UTF-8: pandas ignores its own `encoding` argument
# when it receives an already-open text handle, so the file would otherwise be written
# in the platform default encoding. newline='' disables newline translation, as the
# pandas documentation requires for file objects (avoids blank lines on Windows).
file = path_data / Path("dataset.csv")
with file.open('w', encoding='utf-8', newline='') as fp:
    df_tmp.to_csv(fp)