## Setup

In [1]:

import copy as cp
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
import statsmodels.stats.api as sms
import xgboost as xgb
from matplotlib_inline.backend_inline import set_matplotlib_formats
from scipy.stats import pearsonr
from sklearn.decomposition import PCA
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from statsmodels.compat import lzip
from statsmodels.formula.api import ols
from statsmodels.graphics.gofplots import ProbPlot
from statsmodels.stats.diagnostic import het_white
from statsmodels.stats.outliers_influence import variance_inflation_factor

set_matplotlib_formats("svg")
warnings.filterwarnings("ignore")



def corr_plot(df, upper_tri=None, threshold=None):
    """Draw an annotated heatmap of the pairwise correlations of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated with ``DataFrame.corr()``.
    upper_tri : bool, optional
        When truthy, the lower triangle and the diagonal are masked so that
        only the strict upper triangle is displayed.
    threshold : float, optional
        When given, every coefficient whose absolute value is below
        ``threshold`` is replaced by NaN before plotting.

    Returns
    -------
    pandas.DataFrame
        The correlation matrix that was plotted (after thresholding).
    """
    plt.figure(figsize=(12, 8))
    cmap = sns.diverging_palette(230, 20, as_cmap=True)

    corr = df.corr()

    # Identity test rather than `!= None`: the latter is evaluated
    # element-wise if an array-like threshold is ever passed.
    if threshold is not None:
        corr = corr.mask(np.abs(corr) < threshold, np.nan)

    # heatmap hides the cells where the mask is True, so masking the lower
    # triangle (diagonal included) leaves only the upper triangle visible.
    mask = np.tril(np.ones_like(corr, dtype=bool)) if upper_tri else None

    sns.heatmap(corr, mask=mask, annot=True, fmt='.2f', cmap=cmap)
    plt.show()

    return corr

# Function to check VIF
def checking_vif(train):
    """Return a two-column frame (``feature``, ``VIF``) for ``train``.

    One variance inflation factor is computed per column of ``train``, in
    column order, from the frame's underlying value matrix.
    """
    design = train.values
    scores = [
        variance_inflation_factor(design, position)
        for position in range(train.shape[1])
    ]
    return pd.DataFrame({"feature": train.columns, "VIF": scores})


In [2]:
# Read the training table and keep a reproducible 100,000-row random subsample.
df = pd.read_csv("./data/X_train_Hi5.csv").sample(n=100_000, random_state=42)
df.head()

Unnamed: 0,row_index,piezo_station_department_code,piezo_station_update_date,piezo_station_investigation_depth,piezo_station_department_name,piezo_station_commune_code_insee,piezo_station_pe_label,piezo_station_bdlisa_codes,piezo_station_altitude,piezo_station_bss_code,...,prelev_longitude_2,prelev_latitude_2,prelev_commune_code_insee_2,prelev_other_volume_sum,insee_%_agri,insee_pop_commune,insee_med_living_level,insee_%_ind,insee_%_const,piezo_groundwater_level_category
0,0,1,Sun Jul 14 13:00:02 CEST 2024,20.0,Ain,1073,PIEZOMETRE - MARAIS DE LAVOURS (CEYZERIEU - BR...,['712AH37'],232.0,07004X0046/D6-20,...,5.698947,45.725106,1454.0,1793055000.0,11.8,992.0,25250,2.9,16.2,High
1,1,1,Sun Jul 14 13:00:02 CEST 2024,35.6,Ain,1363,PIEZOMETRE - GRAVIERE (ST-JEAN-LE-VIEUX - BRGM...,['712GB05'],247.25,06754X0077/F1,...,5.464933,46.210734,1051.0,1085125000.0,0.6,1786.0,24660,44.5,11.0,Very High
2,2,1,Sun Jul 14 13:00:02 CEST 2024,35.22,Ain,1244,PIEZOMETRE - BORD AUTOROUTE (MEXIMIEUX - BRGM ...,['040AJ43'],218.77,06993X0226/MEXI_2,...,5.08506,45.812828,69266.0,381049200.0,0.0,8085.0,24890,8.4,7.8,High
3,3,1,Sun Jul 14 13:00:02 CEST 2024,34.2,Ain,1288,PIEZOMETRE - GRENY (PERON - BRGM 01) - BSH,"['516AA00', '516AF00']",499.85,06533X0070/F2,...,5.802841,46.366049,39286.0,380091100.0,1.5,2838.0,39700,2.4,5.2,Very High
4,4,1,Sun Jul 14 13:00:02 CEST 2024,37.3,Ain,1422,FORAGE - ENCLOS (TOSSIAT - BRGM 01) - BSH,['507AB00'],260.0,06518X0026/P2,...,5.377265,46.080989,1273.0,19666310.0,0.2,1352.0,26180,21.5,9.8,Very Low


## Preprocessing

In [4]:
target = "piezo_groundwater_level_category"

# Columns with more than this percentage of missing values are discarded
MAX_NAN_PERCENT = 19

# Percentage of missing values per column, computed once on the sampled frame
nan_percent = df.isnull().sum() * 100 / len(df)
drop_col = nan_percent[nan_percent > MAX_NAN_PERCENT].index.tolist()
new_df = df.drop(columns=drop_col)

# Remove duplicated rows
new_df = new_df.drop_duplicates()

# Drop the independent features that are highly correlated with each other (ids, ...)
features_to_drop = ["piezo_station_commune_code_insee",
                    "piezo_station_pe_label",
                    "piezo_station_bdlisa_codes",
                    "piezo_station_bss_code",
                    "piezo_station_commune_name",
                    "piezo_station_bss_id",
                    "piezo_bss_code",
                    "piezo_station_update_date",
                    "piezo_qualification",
                    "piezo_continuity_code",
                    "piezo_continuity_name",
                    "piezo_producer_name",
                    "piezo_measure_nature_name",
                    "meteo_name",
                    "hydro_station_code",
                    "hydro_method_code",
                    "hydro_method_label",
                    "insee_med_living_level",
                    "meteo_id",
                    "hydro_qualification_label",
                    "hydro_status_code",
                    "piezo_station_department_name"]

features = new_df.drop(features_to_drop, axis=1)

# Columns treated as numeric: cleaned, cast to float and imputed below
num_col_features =      ["piezo_station_investigation_depth", 
                        "piezo_station_altitude", 
                        "piezo_station_longitude", 
                        "piezo_station_latitude", 
                        "piezo_producer_code", 
                        "meteo_latitude", 
                        "meteo_longitude", 
                        "meteo_altitude", 
                        "meteo_rain_height", 
                        "meteo_temperature_min", 
                        "meteo_time_tn", 
                        "meteo_temperature_max", 
                        "meteo_time_tx", 
                        "meteo_temperature_avg", 
                        "meteo_temperature_avg_threshold",
                        "meteo_frost_duration", 
                        "meteo_amplitude_tn_tx", 
                        "meteo_temperature_avg_tntm", 
                        "meteo_evapotranspiration_grid", 
                        "distance_piezo_meteo", 
                        "hydro_observation_result_elab", 
                        "hydro_longitude",
                        "hydro_latitude", 
                        "distance_piezo_hydro", 
                        "prelev_other_volume_sum", 
                        "insee_%_agri", 
                        "insee_pop_commune", 
                        "insee_%_ind", 
                        "insee_%_const"]

# Columns treated as categorical (encoded further down in the notebook)
cat_col_features = ["piezo_station_department_code",  
                    "piezo_obtention_mode", 
                    "piezo_status", 
                    "piezo_measure_nature_code", 
                    "hydro_status_label", 
                    "hydro_qualification_code",  
                    "hydro_hydro_quantity_elab"]


new_features = features.copy()

# Turn the textual missing-value markers into real NaN, then cast to float.
# The cast must be assigned back: astype() returns a new object and leaves
# the frame untouched otherwise.
new_features[num_col_features] = (
    new_features[num_col_features]
    .replace('N/A - division par 0', np.nan)
    .replace('nan', np.nan)
    .astype("float")
)


# NOTE(review): with sample_posterior=True the imputed values are random draws;
# fit() followed by transform() is kept as two calls on purpose, since merging
# them into fit_transform() would presumably change the drawn values.
imputer_mean_posterior = IterativeImputer(random_state=42, sample_posterior=True)

imputer_mean_posterior.fit(new_features[num_col_features])
values_imputed = imputer_mean_posterior.transform(new_features[num_col_features])


new_features[num_col_features] = values_imputed

def parse_dates(new_features):
    """Expand the three raw date columns into datetime, year, month and day columns.

    For each of ``piezo_measurement_date``, ``meteo_date`` and
    ``hydro_observation_date_elab`` the function adds a parsed
    ``DATE_<source>`` column plus ``year_*``, ``month_*`` and ``day_*``
    columns, then removes the raw source column.

    Parameters
    ----------
    new_features : pandas.DataFrame
        Frame containing the three source columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns appended and the three source
        columns removed.

    Raises
    ------
    KeyError
        If one of the three source columns is missing.
    """
    # (source column, suffix of the derived year/month/day columns).
    # The piezo suffix intentionally has no trailing "_date".
    date_sources = [
        ("piezo_measurement_date", "piezo_measurement"),
        ("meteo_date", "meteo_date"),
        ("hydro_observation_date_elab", "hydro_observation_date_elab"),
    ]

    # Work on a copy so the caller's frame does not silently gain columns.
    parsed = new_features.copy()

    for source_col, suffix in date_sources:
        date_col = 'DATE_' + source_col
        parsed[date_col] = pd.to_datetime(parsed[source_col])

        parsed['year_' + suffix] = parsed[date_col].dt.year
        parsed['month_' + suffix] = parsed[date_col].dt.month
        parsed['day_' + suffix] = parsed[date_col].dt.day
        parsed = parsed.drop([source_col], axis=1)

    return parsed

new_features = parse_dates(new_features)
# Independent working copy, so new_features remains available as a fallback
new_features_2 = new_features.copy()

target_feature = new_features_2[target]

# Ordinal encoding: each listed column gets an "ORDINAL_<name>" twin, and the
# source columns (the raw target included) are removed afterwards.
# NOTE(review): OrdinalEncoder's default category order is the sorted unique
# values, i.e. alphabetical for string labels - confirm that this is acceptable
# for the groundwater level categories.
ord_cols = ["piezo_obtention_mode", 
            "piezo_status", 
            "piezo_measure_nature_code", 
            "hydro_status_label",
            "hydro_qualification_code",
            "piezo_groundwater_level_category"]

for col in ord_cols:
    temp = new_features_2[[col]]
    ordinal_encoder = OrdinalEncoder()
    ordinal_enc = ordinal_encoder.fit_transform(temp)
    new_features_2['ORDINAL_' + col] = ordinal_enc

new_features_2 = new_features_2.drop(columns=ord_cols)

# One-hot encoding; the first level of each column is dropped
onehot_cols = ["piezo_station_department_code", "hydro_hydro_quantity_elab"]
new_features_2 = pd.get_dummies(new_features_2, columns=onehot_cols, drop_first=True)

# The parsed datetime columns are no longer needed once year/month/day exist
date_cols = ["DATE_piezo_measurement_date", "DATE_meteo_date", "DATE_hydro_observation_date_elab"]
new_features_2 = new_features_2.drop(columns=date_cols)

# Cast every remaining column to float64 in one pass
new_features_2 = new_features_2.astype("float64")

# Remove the row identifier, then every row that still holds a missing value
new_features_2 = new_features_2.drop(columns="row_index").dropna()


['meteo_DRR', 'meteo_temperature_min_ground', 'meteo_temperature_min_50cm', 'meteo_pressure_avg', 'meteo_pression_maxi', 'meteo_wind_speed_avg_10m', 'meteo_wind_max', 'meteo_wind_direction_max_inst', 'meteo_time_wind_max', 'meteo_wind_avg', 'meteo_wind_direction_max_avg', 'meteo_time_wind_avg', 'meteo_wind_speed_avg_2m', 'meteo_wind_max_2m', 'meteo_wind_direction_max_inst_2m', 'meteo_time_wind_max_2m', 'meteo_wind_max_3s', 'meteo_time_wind_max_3s', 'meteo_humidity_min', 'meteo_time_humidity_min', 'meteo_humidity_max', 'meteo_time_humidity_max', 'meteo_humidity_duration_below_40%', 'meteo_humidity_duration_above_80%', 'meteo__pressure_saturation_avg', 'meteo_wetting_duration', 'meteo_humidity_avg', 'meteo_sunshine_duration', 'meteo_radiation', 'meteo_radiation_direct', 'meteo_sunshine_%', 'meteo_radiation_IR', 'meteo_radiation_UV_max', 'meteo_cloudiness', 'meteo_cloudiness_height', 'meteo_if_snow', 'meteo_if_fog', 'meteo_if_thunderstorm', 'meteo_if_sleet', 'meteo_if_hail', 'meteo_if_dew

### Extraire saison

In [7]:
# Names of the month columns derived from the three date fields
cols = new_features_2.columns
result = [name for name in cols if "month" in name]

# Keep only the rows whose meteo month is June to September (6-9)
new_features_3 = cp.deepcopy(new_features_2)
outside_months = ~new_features_3["month_meteo_date"].isin([6, 7, 8, 9])
new_features_3 = new_features_3.drop(index=new_features_3.index[outside_months])

['month_piezo_measurement', 'month_meteo_date', 'month_hydro_observation_date_elab']


## Data manipulation

In [6]:
def get_PCA(X_scaled, n_components=15):
    """Plot the cumulative explained variance of a full PCA, then reduce ``X_scaled``.

    Parameters
    ----------
    X_scaled : array-like of shape (n_samples, n_features)
        Already scaled feature matrix.
    n_components : int, default 15
        Number of principal components kept for the projection. The default
        is the cut-off that was read off the explained-variance plot.

    Returns
    -------
    pandas.DataFrame
        Projection of ``X_scaled`` onto the first ``n_components`` components,
        wrapped in a fresh DataFrame. When the projection is a plain array the
        frame has integer column labels and a default index, so re-align it
        with the target by position, not by label.
    """
    # Full decomposition, fitted only to draw the cumulative variance curve
    full_pca = PCA().fit(X_scaled)

    plt.plot(np.cumsum(full_pca.explained_variance_ratio_))
    plt.grid()
    plt.xlabel('Number of Principal Components')
    plt.ylabel('Explained Variance')
    sns.despine()

    # Reduced decomposition used for the actual projection
    reducer = PCA(n_components=n_components).fit(X_scaled)
    X_transformed = pd.DataFrame(reducer.transform(X_scaled))

    return X_transformed



In [10]:

from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

# The raw target column is part of ord_cols and is therefore already removed
# during the ordinal-encoding step; tolerate its absence instead of raising
# KeyError. This also makes the cell safe to re-run.
new_features_3 = new_features_3.drop(columns=target, errors="ignore")
ord_target = "ORDINAL_" + target
Y = new_features_3[ord_target]
X = new_features_3.drop(columns=ord_target)

# NOTE(review): the scaler is fitted on the full X before the train/test split,
# so test rows influence the scaling statistics - consider fitting on the
# training part only.
transformer = RobustScaler()
transformer.set_output(transform="pandas")
X_scaled = transformer.fit_transform(X)

# Constant column added for the VIF computation that follows
X_scaled = sm.add_constant(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.20, random_state=42)

## Feature Engineering & Selection

### VIF

In [140]:
# Variance inflation factor of every column of the training matrix
# (the constant column added before the split is included)
vif = checking_vif(X_train)
vif

Unnamed: 0,feature,VIF
0,const,317.523070
1,piezo_station_investigation_depth,1.349110
2,piezo_station_altitude,4.482999
3,piezo_station_longitude,1753.161725
4,piezo_station_latitude,1367.954093
...,...,...
133,piezo_station_department_code_90,1.437472
134,piezo_station_department_code_91,2.978994
135,piezo_station_department_code_93,1.481055
136,piezo_station_department_code_95,1.824318


On retire les features dont le VIF est ≥ 5 (on ne conserve que celles avec VIF < 5).

In [141]:
# Preview of the features whose VIF is strictly below 5
vif[vif["VIF"] < 5]

Unnamed: 0,feature,VIF
1,piezo_station_investigation_depth,1.349110
2,piezo_station_altitude,4.482999
5,piezo_producer_code,3.891758
8,meteo_altitude,4.581016
9,meteo_rain_height,1.027756
...,...,...
133,piezo_station_department_code_90,1.437472
134,piezo_station_department_code_91,2.978994
135,piezo_station_department_code_93,1.481055
136,piezo_station_department_code_95,1.824318


In [142]:
# Names of the features whose VIF is strictly below 5
low_vif = vif["VIF"] < 5
features_vif = vif.loc[low_vif, "feature"].to_numpy()

# Same column subset applied to both splits
X_train_2, X_test_2 = X_train[features_vif], X_test[features_vif]
X_train_2

Unnamed: 0,piezo_station_investigation_depth,piezo_station_altitude,piezo_producer_code,meteo_altitude,meteo_rain_height,meteo_temperature_min,meteo_time_tn,meteo_temperature_max,meteo_evapotranspiration_grid,hydro_observation_result_elab,distance_piezo_hydro,prelev_other_volume_sum,insee_%_agri,insee_pop_commune,insee_%_ind,insee_%_const,ORDINAL_piezo_obtention_mode,ORDINAL_piezo_status,ORDINAL_piezo_measure_nature_code,ORDINAL_hydro_status_label,ORDINAL_hydro_qualification_code,piezo_station_department_code_03,piezo_station_department_code_04,piezo_station_department_code_05,piezo_station_department_code_06,piezo_station_department_code_07,piezo_station_department_code_08,piezo_station_department_code_09,piezo_station_department_code_10,piezo_station_department_code_12,piezo_station_department_code_13,piezo_station_department_code_15,piezo_station_department_code_18,piezo_station_department_code_19,piezo_station_department_code_21,piezo_station_department_code_22,piezo_station_department_code_23,piezo_station_department_code_25,piezo_station_department_code_30,piezo_station_department_code_31,piezo_station_department_code_32,piezo_station_department_code_38,piezo_station_department_code_39,piezo_station_department_code_42,piezo_station_department_code_43,piezo_station_department_code_46,piezo_station_department_code_48,piezo_station_department_code_51,piezo_station_department_code_52,piezo_station_department_code_53,piezo_station_department_code_54,piezo_station_department_code_55,piezo_station_department_code_58,piezo_station_department_code_61,piezo_station_department_code_63,piezo_station_department_code_64,piezo_station_department_code_65,piezo_station_department_code_69,piezo_station_department_code_70,piezo_station_department_code_71,piezo_station_department_code_73,piezo_station_department_code_74,piezo_station_department_code_75,piezo_station_department_code_78,piezo_station_department_code_81,piezo_station_department_code_82,piezo_station_department_code_
83,piezo_station_department_code_84,piezo_station_department_code_87,piezo_station_department_code_88,piezo_station_department_code_89,piezo_station_department_code_90,piezo_station_department_code_91,piezo_station_department_code_93,piezo_station_department_code_95,hydro_hydro_quantity_elab_QmM
8573,-0.170629,1.173611,-1.120690e-01,1.10000,0.250000,-3.510638e-01,3.363071e+00,-0.707547,-0.555556,-0.205681,-0.333969,0.608706,0.250000,0.247380,-0.176471,1.614604,0.0,0.0,1.000000,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2766621,-0.457343,-0.493958,2.909483e-02,-0.50000,4.062500,2.659574e-01,-1.369295e-01,1.160377,0.703704,-0.195251,-0.032261,2.561342,1.612069,-0.223115,-0.390374,0.308316,0.0,0.0,1.000000,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2773464,0.030769,0.159722,-6.465517e-03,2.98125,1.125000,3.085106e-01,-8.589212e-01,0.339623,0.518519,-0.205264,-0.951988,38.476701,0.931034,-0.342551,-0.390374,-0.300203,0.0,0.0,0.000000,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
970406,0.618182,-0.348681,2.909483e-02,-0.57500,5.625000,-2.446809e-01,-1.068465e+00,-0.566038,-0.481481,0.845392,0.233358,0.667137,-0.301724,0.380997,1.331551,0.081136,0.0,0.0,0.000000,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
999664,3.544484,1.927083,-1.088362e-01,2.60625,13.750000,2.127660e-02,2.518303e+00,-0.613208,-0.617248,0.812850,-0.575779,1.011836,-0.301724,1.273458,0.010695,-0.389452,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2677297,0.827972,0.055556,-4.310345e-03,0.78750,0.000000,1.063830e-01,-1.035270e+00,0.424528,0.555556,-0.195807,0.293517,35.357218,-0.137931,9.319783,0.144385,-0.113590,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1167669,-0.346853,-0.514722,-3.771552e-02,-0.34375,0.000000,-3.404255e-01,-2.033195e-01,-0.783019,-0.037037,0.232521,0.419289,-0.082162,0.318966,0.122272,-0.299465,0.973631,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198307,2.338462,9.826389,-6.465517e-03,12.57500,2.430956,2.115885e+07,-5.678365e+07,-2.703204,0.713174,-0.035184,-0.232955,5.104275,-0.301724,0.184354,-0.192513,-0.324544,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
478468,-0.528671,-0.445139,2.413793e+10,0.13750,0.000000,-6.077695e+04,-1.899302e+06,0.106292,-0.012563,0.852901,-0.264598,0.550954,5.612069,-0.253683,-0.390374,-0.430020,0.0,-1.0,0.000000,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Predictive Modeling

### Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [17]:
# Shallow random forest (depth 2) fitted on the training split, then used to
# predict the held-out split
rd_forest = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
y_preds = rd_forest.predict(X_test)

In [18]:
# Accuracy of the random forest predictions on the held-out split
accuracy_score(y_test, y_preds)

0.2995463478452939

### Multinomial Logistic Regression

In [199]:
from sklearn.linear_model import LogisticRegression

# `multi_class` is left at its default: the argument is deprecated in recent
# scikit-learn releases, and with a non-liblinear solver and more than two
# classes the default already fits a multinomial model.
log_reg = LogisticRegression(solver='newton-cg').fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

# Accuracy of the multinomial logistic regression on the held-out split
accuracy_score(y_test, y_pred)

0.22675026123301986

### XGBoost multiclass classifier

#### Test naïf rapide

In [10]:
# Baseline XGBoost classifier: softprob multiclass objective, otherwise defaults
xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=42)
y_pred = xgb_model.fit(X_train, y_train).predict(X_test)

# Accuracy on the held-out split
accuracy_score(y_test, y_pred)

0.6900390435933013

#### RandomizedSearch CV

In [13]:
from scipy.stats import uniform, randint

def report_best_scores(results, n_top=3):
    """Print the candidates ranked 1 to ``n_top`` from a ``cv_results_``-style mapping.

    ``results`` must provide ``rank_test_score``, ``mean_test_score``,
    ``std_test_score`` and ``params``. Candidates sharing a rank are all
    printed, each followed by an empty line.
    """
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print(f"Model with rank: {rank}")
            print(f"Mean validation score: {mean_score:.3f} (std: {std_score:.3f})")
            print(f"Parameters: {results['params'][idx]}")
            print("")

# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low to high - 1.
params = dict(
    colsample_bytree=uniform(0.7, 0.3),   # 0.7 to 1.0
    gamma=uniform(0, 0.5),                # 0 to 0.5
    learning_rate=uniform(0.03, 0.3),     # 0.03 to 0.33
    max_depth=randint(2, 6),              # 2 to 5
    n_estimators=randint(100, 150),       # 100 to 149
    subsample=uniform(0.6, 0.4),          # 0.6 to 1.0
)

# 70 sampled candidates, each evaluated with 3-fold cross-validation
search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=params,
    n_iter=70,
    cv=3,
    random_state=42,
    n_jobs=2,
    verbose=1,
    return_train_score=True,
)

search.fit(X_train, y_train)


Fitting 3 folds for each of 70 candidates, totalling 210 fits


In [14]:
# Print only the best-ranked candidate of the randomized search
report_best_scores(search.cv_results_, 1)

Model with rank: 1
Mean validation score: 0.660 (std: 0.003)
Parameters: {'colsample_bytree': 0.8045997961875188, 'gamma': 0.04808827554571038, 'learning_rate': 0.31215697934688114, 'max_depth': 5, 'n_estimators': 138, 'subsample': 0.9746919954946938}

