In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
import time
import matplotlib.pyplot as plt
%matplotlib inline
#%pylab inline
import itertools
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV, RFE
from sklearn.utils import resample

In [2]:
# Project directory layout: CSV inputs are read from data/, pickled objects from dump/.
path_project = Path.home() / 'Google Drive' / 'Felix'
path_data = path_project / "data"
path_dump = path_project / "dump"

In [3]:
# Load the cdv data.
file = path_data / "felix.csv"
with file.open('rb') as fp:
    cdv = pd.read_csv(fp, encoding='cp1252', low_memory=False, index_col=0)

# Load the cdv data without format.
file = path_data / "felix_ssfmt.csv"
with file.open('rb') as fp:
    cdv_ssfmt = pd.read_csv(fp, encoding='cp1252', low_memory=False, index_col=0)

# Load the MergeCommunesEnvi data (';'-separated, indexed on its second column).
file = path_data / "MergeCommunesEnvi.csv"
with file.open('rb') as fp:
    MergeCommunesEnvi = pd.read_csv(
        fp,
        encoding='cp1252',
        low_memory=False,
        sep=';',
        index_col=1,
    )

In [4]:
# Load the pickled dictionary holding the various variable sets.
# NOTE(review): pickle.load can execute arbitrary code; only load a dump you trust.
filename = path_dump / "dict_var_groups.sav"
with filename.open('rb') as fp:
    dict_var_groups = pickle.load(fp)

# Unpack the variable groups into module-level names.
usual_common_scope, cat_var, cat_max9_var, quant_var = (
    dict_var_groups[group_name]
    for group_name in ('usual_common_scope', 'cat_var', 'cat_max9_var', 'quant_var')
)

In [5]:
# Start from the MergeCommunesEnvi table. Work on an explicit copy so that the
# assignments below can never write through to MergeCommunesEnvi itself.
df = MergeCommunesEnvi.copy()
# Overwrite the cdv columns with the values of the unformatted cdv data.
df.loc[:, cdv_ssfmt.columns] = cdv_ssfmt.loc[:, :]

# The variable groups are combined with set operators below, so turn the
# selections into sorted lists before indexing: recent pandas versions reject
# sets as indexers, and a sorted list makes the column order reproducible
# from one kernel session to the next.
scope_cols = sorted(usual_common_scope)
cat_cols = sorted((cat_var & usual_common_scope) - {"HEUREUX"})

# Keep only the variables of the common scope.
df = df.loc[:, scope_cols].copy()
# Categorical variables (all but "HEUREUX") take their values from the cdv data.
df.loc[:, cat_cols] = cdv.loc[:, cat_cols]
print(f"\nFinal number of variable kept : {df.shape[1]}")


Final number of variable kept : 419


In [6]:
# Column count at this point, and size of the categorical scope minus one
# (presumably the "HEUREUX" column, which is excluded from the encoding).
p = df.shape[1]
n_cat_features = len(cat_var & usual_common_scope) - 1
print(f"{p} columns out of which {n_cat_features} "
      "are corresponding to categorial features")

419 columns out of which 155 are corresponding to categorial features


In [7]:
# One-hot encode every categorical variable of the scope except "HEUREUX".
# The selection is built with set operators; pass it as a sorted list because
# recent pandas versions reject sets as column indexers and because a fixed
# order makes the resulting dummy columns reproducible.
cat_cols_to_encode = sorted((cat_var & usual_common_scope) - {"HEUREUX"})

# Measure the column count right before encoding instead of relying on the
# global `p`, which another cell rebinds to the precision score.
n_cols_before = df.shape[1]

df = pd.get_dummies(df,
                    columns=cat_cols_to_encode,
                    dummy_na=True,    # missing values get their own indicator column
                    drop_first=True)  # K-1 encoding: first level of each variable dropped

q = df.shape[1]
n_cat_encoded = len(cat_var & usual_common_scope) - 1
# Encoded columns are removed and replaced by their dummies, hence the balance below.
n_binary = n_cat_encoded + q - n_cols_before
print(f"{q} columns after encoding of {n_cat_encoded} categorial "
      f"variables in {n_binary} binary variables "
      "(K-1 one hot encoding)")

836 columns after encoding of 155 categorial variables in 572 binary variables (K-1 one hot encoding)


In [8]:
# Encode the '[nsp]' answer of "HEUREUX" (code 5) as a missing value.
df.loc[df["HEUREUX"] == 5, "HEUREUX"] = np.nan
# Keep only the rows whose target is known. The boolean mask itself must be the
# indexer: indexing with the mask's `.index` selects every row and filters nothing.
df = df.loc[df["HEUREUX"].notna(), :]


# Treat the remaining missing values: drop every row with a missing value in
# any column (features and target together cover all columns of df).
features = df.columns.drop(['HEUREUX'])
df_tmp = df.dropna()

X = df_tmp.loc[:, features]
y = df_tmp["HEUREUX"]

In [9]:
def balanced_subsample(x, y, subsample_size=1.0):
    """Down-sample every class to the same number of rows.

    Each class found in ``y`` is reduced to the size of the smallest class
    (optionally scaled by ``subsample_size``). Classes that are larger than the
    target size are shuffled with the global NumPy random state before being
    truncated, so seed ``np.random`` beforehand for reproducible draws.

    Parameters
    ----------
    x : pandas.DataFrame or pandas.Series
        Samples; must support boolean-mask selection, ``reindex`` and
        positional slicing.
    y : array-like
        Class label of each row of ``x``. Labels must be convertible to float.
    subsample_size : float, default 1.0
        When lower than 1, fraction of the smallest class size to keep per class.

    Returns
    -------
    xs : same type as ``x``
        The selected rows, grouped class by class, keeping their original index.
    ys : pandas.Series
        Float labels named ``'target'``, sharing the index of ``xs`` so that
        both outputs stay aligned row by row.

    Raises
    ------
    ValueError
        If ``y`` contains no sample.
    """
    labels = np.unique(y)
    if len(labels) == 0:
        raise ValueError("balanced_subsample needs at least one sample")

    # Split the samples class by class and find the smallest class.
    per_class = [(label, x[(y == label)]) for label in labels]
    min_elems = min(rows.shape[0] for _, rows in per_class)

    use_elems = min_elems
    if subsample_size < 1:
        use_elems = int(min_elems * subsample_size)

    xs = []
    ys = []
    for label, rows in per_class:
        # Only classes that must be truncated are shuffled.
        if len(rows) > use_elems:
            rows = rows.reindex(np.random.permutation(rows.index))

        xs.append(rows[:use_elems])
        ys.append(np.full(use_elems, label, dtype=float))

    xs = pd.concat(xs)
    # Reuse the index of xs: a fresh RangeIndex would misalign labels and rows.
    ys = pd.Series(data=np.concatenate(ys), index=xs.index, name='target')

    return xs, ys

In [None]:
# Learning curve: for growing sample sizes, tune C by cross-validation on the
# training split and record the macro F1 score obtained on the test split.
n = y.shape[0]
nb_value = 20  # number of values tested for the hyperparameter C
C_log = np.logspace(-2, 2, nb_value)
cv = 6  # V-fold cross-validation, number of folds
scoring = 'f1_macro'
sizes = range(10, 110, 10)  # sample sizes, in percent of the available rows

score = np.empty(len(sizes))

np.random.seed(seed=42)

for j, size in enumerate(sizes):
    m = int(n*size/100)
    print(f"size :{m}")
    # Draw m distinct rows. A bootstrap draw (with replacement) would put
    # copies of the same row on both sides of the train/test split.
    Xs, ys = resample(X, y, replace=False, n_samples=m)
    X_train, X_test, y_train, y_test = train_test_split(Xs,
                                                        ys,
                                                        test_size=0.2,
                                                        random_state=42)

    # Standardise with statistics learnt on the training split only.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Cross-validation error (in %, 100 * (1 - macro F1)) for every candidate C.
    mean_score_l1 = np.empty(nb_value)

    for i, C in enumerate(C_log):
        # solver='liblinear' supports the l1 penalty; the current scikit-learn
        # default solver ('lbfgs') does not.
        clf = LogisticRegression(C=C, penalty='l1',
                                 solver='liblinear',
                                 random_state=42,
                                 class_weight='balanced')
        mean_score_l1[i] = 100*np.mean(1-cross_val_score(clf,
                                                         X_train,
                                                         y_train,
                                                         cv=cv,
                                                         scoring=scoring))

    # Learning on full training set with optimal hyperparameters and score on test set.
    # mean_score_l1 is an error, so the best C is the one that minimises it.
    clf = LogisticRegression(C=C_log[np.argmin(mean_score_l1)],
                             penalty='l1',
                             solver='liblinear',
                             random_state=42,
                             class_weight='balanced')
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)

    # Index with j (sample-size step); the inner-loop index i runs up to
    # nb_value - 1 and would overflow `score`.
    score[j] = f1_score(y_test, y_test_pred, average='macro')

size :1044


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [10]:
# Number of steps in range(10, 110, 10), i.e. the length of the `score` array above.
len(range(10,110,10))

10

In [None]:
# Choice of the hyperparameter C by cross-validation (misclassification rate).
# NOTE(review): X_train / y_train are not defined in this cell; it uses whatever
# the previously executed cell left in the kernel.
nb_value = 20  # number of values tested for the hyperparameter C
C_log = np.logspace(-2, 2, nb_value)
cv = 6  # V-fold cross-validation, number of folds

mean_score_l1 = np.empty(nb_value)
std_scores_l1 = np.empty(nb_value)

np.random.seed(seed=42)

startTime = time.time()

for i, C in enumerate(C_log):
    # solver='liblinear' supports the l1 penalty; the current scikit-learn
    # default solver ('lbfgs') does not.
    clf = LogisticRegression(C=C, penalty='l1',
                             solver='liblinear',
                             tol=0.01, random_state=42,
                             class_weight='balanced')
    # Run the cross-validation once per C and derive both statistics from the
    # same fold errors instead of fitting every fold twice.
    fold_errors = 1 - cross_val_score(clf,
                                      X_train,
                                      y_train,
                                      cv=cv,
                                      scoring='accuracy')
    mean_score_l1[i] = 100*np.mean(fold_errors)
    std_scores_l1[i] = 100*np.std(fold_errors)


# Mean cross-validation error with a +/- 0.5 standard deviation band.
plt.figure()
plt.semilogx(C_log, mean_score_l1[:], 'r', linewidth=2, label='moyenne (l1)')
plt.semilogx(C_log, mean_score_l1[:]-0.5*std_scores_l1[:],
             'r--', label=u'+/-0.5 écart type')
plt.semilogx(C_log, mean_score_l1[:]+0.5*std_scores_l1[:], 'r--')


plt.xlabel("Valeur de pénalisation C = 1/lambda")
plt.ylabel(u"Erreur de validation croisée (%)\n(Taux moyen d'erreur de classification)")
plt.title(u"Choix de l'hyperparamètre C\npar validation croisée \
(V-fold avec V = %s)" % (cv))
plt.legend(bbox_to_anchor=(1, 1))
plt.grid()
plt.show()
print("Détermination des paramètres optimaux en %0.1f s" % (time.time() - startTime))
print("Pénalisation l1, valeur optimale : C = %0.2f" % (C_log[np.argmin(mean_score_l1)]))

In [None]:
# Learning on full training set with optimal hyperparameters
# and score evaluation on test set.
# NOTE(review): X_train / X_test / y_train / y_test, C_log and mean_score_l1 are
# not defined in this cell; it uses whatever earlier cells left in the kernel.
clf = LogisticRegression(C=C_log[np.argmin(mean_score_l1)],
                         penalty='l1',
                         solver='liblinear',  # the default 'lbfgs' solver rejects penalty='l1'
                         tol=0.01,
                         random_state=42,
                         class_weight='balanced')
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
accuracy = clf.score(X_test, y_test)
print(f"Model score\n- Accuracy : {accuracy*100:0.1f} %")

# The scikit-learn default average='binary' raises on a target with more than
# two classes; fall back to the macro average in that case (binary targets keep
# the default behaviour).
n_labels = len(np.union1d(y_test, y_test_pred))
average = 'binary' if n_labels <= 2 else 'macro'
f1 = f1_score(y_test, y_test_pred, average=average)
p = precision_score(y_test, y_test_pred, average=average)
r = recall_score(y_test, y_test_pred, average=average)
print(f"- Precision : {p*100:0.1f} % (Happy # positive class)")
print(f"- Recall : {r*100:0.1f} %")
print(f"- F1 score : {f1*100:0.1f} %")