In [24]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, accuracy_score

import tensorflow as tf
import tensorflow_addons as tfa

In [25]:
data_base = pd.read_excel('./big_merge_V2_meteo_SAT.xlsx').drop('Unnamed: 0', axis=1)

In [26]:
data_base['TAUX_COUV_RAJ'] = data_base['TAUX_COUV_RAJ'].apply(lambda v : v if v!=-1 else 3)

In [27]:
data_base['TAUX_COUV_RAJ'].value_counts()

3    4571
2    2313
4    1337
5     629
1     417
6     345
Name: TAUX_COUV_RAJ, dtype: int64

PREPROCESSING _ Code base for models temporal predictions

Ici, features engineering (création de nouvelles features à partir de la liste connues):

In [28]:
# adding aridity index
data_base["AI"] = data_base['PRCP_GROWTH'] / data_base['TAVE_GROWTH']
# adding H/D index
data_base["H_D"] = data_base['HAUTEUR_ARBRE'] / data_base['DBH']

### Choix TARGET et Features :

Target :

In [29]:
TARGET = ['TAUX_COUV_RAJ'] #exemple

Features :

In [131]:
# --- PAST --- 
# Attention : on peut, logiquement, inclure la TARGET... (connue dans le passé)
cat_strict = ['PRODREG', 'ESPECE_DOM', 'TYP_RAJ_PPL', 'DEG_FERMETURE', 'STR_PPL', 'RELIEF', 'ORIENTATION'] #exemple
cat_ord_miss = ['TAUX_COUV_RAJ', 'HT_VEG'] #exemple 'TAILLE_PPL', 'MELANGE', 'QUAL_STATION', 'SURF_TROU_AER'
numerics = ['H_D','AI','SDI', 'AGE_PPL','ALT', 'TIGES_VIV_H', 'SURF_TER_HA', 'FEUILL_PER', 'CONIF_PER', 'PERF_CROI', 'NDVI', 'EVI', 'NDMI', 'NDWI', 'DSWI'] #exemple

# --- FUTURE KNOWN ---
# Attention : logiquement les features, potentiellement connues dans le futur ci-dessous sont aussi répertoriées ci-dessus dans le passé
add_cat_IFN_stable = ['DEG_FERMETURE', 'STR_PPL'] # technos-substituts : 'DEG_FERMETURE', 'STR_PPL'
add_cat_ord_IFN_stable = [] # technos-substituts : 'DEG_FERMETURE', 'STR_PPL', 'SURF_TROU_AER' 'TAILLE_PPL', 'SURF_TROU_AER'
add_IFN_numerics_stable = ['SDI', 'H_D', 'TIGES_VIV_H'] # technos-substituts : , 'SDI'
# données potentiellement connues car stables par parcelles :
#['LAT', 'LON', 'ALT', 'PRODREG', 'HT_VEG', 'SLOPE25', 'ASPECT25', 'ORIENTATION', 'PERF_CROI', 'QUAL_STATION', 'UNIT_VEG_FINE',
# 'UNIT_VEG_GROS', 'PROCESS_SILVA', 'PERI_CHENAUX', 'PERI_COULEES','PERI_AVALANCH', 'PERI_CHUTES', 'ETAGE', 'ENDOMMAGEMENT','NB_DEGAT_ARBRE']
# + données extrapolées grâce à des technologies prometteuses (satellites ?)....

add_meteo_known = ['PRCP', 'TAVE_AVG',	'TAVE', 'TAVE_GROWTH', 'PRCP_S_S',	'PRCP_G_S', 'AI']

add_SAT_known = ['NDVI', 'EVI', 'NDMI', 'NDWI', 'DSWI']

In [132]:
features_past = numerics + cat_strict + cat_ord_miss
features_future = add_cat_IFN_stable + add_cat_ord_IFN_stable + add_IFN_numerics_stable + add_meteo_known + add_SAT_known

In [133]:
data_LFI1 = data_base.loc[data_base['LFI']=='LFI1',:].sort_values('PARCELLE')
data_LFI2 = data_base.loc[data_base['LFI']=='LFI2',:].sort_values('PARCELLE')
data_LFI3 = data_base.loc[data_base['LFI']=='LFI3',:].sort_values('PARCELLE')
data_LFI4 = data_base.loc[data_base['LFI']=='LFI4',:].sort_values('PARCELLE')

In [134]:
r2_score(data_LFI4['TAUX_COUV_RAJ'], data_LFI3['TAUX_COUV_RAJ'])

0.29646283832163833

In [135]:
accuracy_score(data_LFI4['TAUX_COUV_RAJ'], data_LFI3['TAUX_COUV_RAJ'])

0.4540158135663754

In [136]:
future_feat_names = []
add_cat_strict_feat_names = []
add_cat_ord_feat_names = []
add_numerics_feat_names = []

for cat in features_future:
    lfi2_list = data_LFI2[cat].to_list()
    lfi3_list = data_LFI3[cat].to_list()
    lfi4_list = data_LFI4[cat].to_list()
    data_LFI1[cat + "_f"] = lfi2_list
    data_LFI2[cat + "_f"] = lfi2_list
    data_LFI3[cat + "_f"] = lfi2_list
    future_feat_names.append(cat + '_f')
    if cat in add_cat_ord_IFN_stable:
        add_cat_ord_feat_names.append(cat + '_f')
    elif cat in add_cat_IFN_stable:
        add_cat_strict_feat_names.append(cat + '_f')
    else:
        add_numerics_feat_names.append(cat + '_f')


In [137]:
data_red = pd.concat([data_LFI1, data_LFI2, data_LFI3], axis=0)[features_past + future_feat_names]
Y = pd.concat([data_LFI2[TARGET], data_LFI3[TARGET], data_LFI4[TARGET]], axis = 0)

In [138]:
data_red.head()

Unnamed: 0,H_D,AI,SDI,AGE_PPL,ALT,TIGES_VIV_H,SURF_TER_HA,FEUILL_PER,CONIF_PER,PERF_CROI,...,TAVE_f,TAVE_GROWTH_f,PRCP_S_S_f,PRCP_G_S_f,AI_f,NDVI_f,EVI_f,NDMI_f,NDWI_f,DSWI_f
0,0.695652,,571,85.0,715.91897,590.0,27.79,,,4676,...,9.026806,13.84776,117.278617,51.835496,41.138213,0.4992,0.016,0.2705,-0.4653,0.6865
5,0.647727,,890,140.0,563.829759,400.0,53.38,,,3402,...,19.79349,13.83959,122.675652,55.827235,46.389611,0.5552,0.0149,0.2017,-0.4864,0.6996
9,0.666667,,489,80.0,564.885846,320.0,26.7,,,5617,...,11.67138,13.39843,128.067334,61.309991,50.289235,0.5633,0.0153,0.1727,-0.5002,0.6852
14,,,0,2.0,563.551602,0.0,0.0,,,4258,...,10.45987,13.1553,130.759873,63.219754,52.814083,0.6059,0.0197,0.2269,-0.5588,0.8384
19,0.676471,,377,130.0,539.769096,150.0,23.32,,,4330,...,11.54484,13.16893,131.143585,62.702111,52.327425,0.6012,0.0184,0.2029,-0.5486,0.7902


In [139]:
Y = Y.to_numpy().ravel()

Traitement des données catégorielles ordonnées en numériques (gestion des "-1" éventuels) :

In [140]:
feats_cat_ord = cat_ord_miss + add_cat_ord_feat_names
for cat in feats_cat_ord:
  data_red[cat] = data_red[cat].apply(lambda v : v if v!=-1 else np.nan)

LISTE DES FEATURES NUMERIQUES ET CATEGORIELLES EN VUE DU PREPROCESSING DE MODELE :

In [141]:
numerics_features = numerics + feats_cat_ord + add_numerics_feat_names
feats_cat_strict = cat_strict + add_cat_strict_feat_names

In [142]:
feats_cat_strict

['PRODREG',
 'ESPECE_DOM',
 'TYP_RAJ_PPL',
 'DEG_FERMETURE',
 'STR_PPL',
 'RELIEF',
 'ORIENTATION',
 'DEG_FERMETURE_f',
 'STR_PPL_f']

In [143]:
numerics_transforms = Pipeline(
    [('imputer',KNNImputer()),
    ('encoder',StandardScaler())
])
categorials_transforms = Pipeline([
    ('imputer',KNNImputer()),
    ('encoder',OneHotEncoder(drop="first"))
])

preprocessor = ColumnTransformer(
    [("num", numerics_transforms, numerics_features),
     ("cat", categorials_transforms, feats_cat_strict)])

In [144]:
data_red['ORIENTATION'] = data_red['ORIENTATION'].map({'N':0,'NE':1,'E':2,'SE':3,'S':4,'SO':5,'O':6,'NO':7})
#data_red['ORIENTATION_f'] = data_red['ORIENTATION_f'].map({'N':0,'NE':1,'E':2,'SE':3,'S':4,'SO':5,'O':6,'NO':7})


In [145]:
X_train, X_test, y_train, y_test = train_test_split(data_red, Y, test_size=0.2, random_state=2, stratify=Y)

In [146]:
np.shape(X_train)

(5767, 41)

In [147]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [148]:
np.shape(X_train)

(5767, 90)

In [149]:
np.shape(X_test)

(1442, 90)

In [150]:
model =  RidgeClassifier(max_iter=10000)

In [151]:
params = {
    'alpha':[18, 26, 36]
}

In [152]:
grid = GridSearchCV(model, param_grid=params, scoring='accuracy', verbose=2)

In [153]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ...........................................alpha=18; total time=   0.0s
[CV] END ...........................................alpha=18; total time=   0.0s
[CV] END ...........................................alpha=18; total time=   0.0s
[CV] END ...........................................alpha=18; total time=   0.0s
[CV] END ...........................................alpha=18; total time=   0.0s
[CV] END ...........................................alpha=26; total time=   0.0s
[CV] END ...........................................alpha=26; total time=   0.0s
[CV] END ...........................................alpha=26; total time=   0.0s
[CV] END ...........................................alpha=26; total time=   0.0s
[CV] END ...........................................alpha=26; total time=   0.0s
[CV] END ...........................................alpha=36; total time=   0.0s
[CV] END ........................................

GridSearchCV(estimator=RidgeClassifier(max_iter=10000),
             param_grid={'alpha': [18, 26, 36]}, scoring='accuracy', verbose=2)

In [154]:
grid.best_estimator_

RidgeClassifier(alpha=26, max_iter=10000)

In [155]:
train_scores = cross_val_score(grid.best_estimator_, X_train, y_train, cv=5)
test_scores = cross_val_score(grid.best_estimator_, X_test, y_test, cv=5)
print(f'Train score mean : {np.mean(train_scores)}')
print(f'Train score std : {np.std(train_scores)}')
print(f'Test score mean : {np.mean(test_scores)}')
print(f'Test score std : {np.std(test_scores)}')

Train score mean : 0.4317667271423654
Train score std : 0.010780147165774719
Test score mean : 0.39599913494809685
Test score std : 0.027566349970251182


In [156]:
list_features_in = []
for feat in numerics_features:
  list_features_in.append(feat)
for cat in feats_cat_strict:
  nb_lab = len(data_red[cat].unique())-1
  for i in range(nb_lab):
    list_features_in.append(f'{cat}_{i}')

In [157]:
df_coef_inter = pd.DataFrame(grid.best_estimator_.coef_)

In [158]:
len(abs(df_coef_inter).sum())

90

In [159]:
df_coef = pd.DataFrame(abs(df_coef_inter).sum(), columns=['Coeff'])

In [160]:
df_coef

Unnamed: 0,Coeff
0,0.058299
1,0.112665
2,0.518108
3,0.228557
4,0.170451
...,...
85,0.368844
86,0.696961
87,0.388909
88,0.313900


In [161]:
len(list_features_in)

90

In [162]:
df_coef['Features'] = list_features_in

In [163]:
df_coef = df_coef.set_index('Features')

In [164]:
fig = px.bar(df_coef['Coeff'], title=f"Features importance for target : {TARGET} with Lasso Linear Regression")
fig.show()

-----------------

In [165]:
y_train

array([4, 2, 3, ..., 1, 4, 2], dtype=int64)

In [166]:
processor_y = OneHotEncoder(sparse=False)
y_train = processor_y.fit_transform(y_train.reshape(-1, 1))
y_test = processor_y.transform(y_test.reshape(-1, 1))

In [167]:
train_batch = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(buffer_size=len(X_train)).batch(batch_size=32)
test_batch = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size=32)

In [168]:
for x, y in train_batch.take(1):
    print(x)
    print(y)

tf.Tensor(
[[-1.0610367   0.96361502 -1.1069141  ...  0.          0.
   0.        ]
 [-0.59457642 -1.00768412  2.20968862 ...  1.          0.
   0.        ]
 [-0.35231801 -1.67497877 -0.50688067 ...  1.          0.
   0.        ]
 ...
 [-0.92109862 -0.85050972 -0.71883278 ...  0.          1.
   0.        ]
 [-0.19963297  1.15108452 -0.88600627 ...  1.          0.
   0.        ]
 [ 0.47828225 -0.79277947  0.87827115 ...  1.          0.
   0.        ]], shape=(32, 90), dtype=float64)
tf.Tensor(
[[0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.

In [172]:
model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(90,)),
        tf.keras.layers.Dense(64,"relu"),
        tf.keras.layers.Dense(32,"relu"),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(16,"relu"),
        tf.keras.layers.Dense(6,"softmax")
    ])

In [173]:
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=tf.keras.metrics.CategoricalAccuracy())

In [174]:
model.fit(train_batch, epochs=20, validation_data=test_batch)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x20798817fa0>

In [177]:
coeff_mean = []
for i in range(90):
    coeff_mean.append(np.mean(model.layers[0].trainable_variables[0][i]))

In [178]:
df_coef = pd.DataFrame(coeff_mean, columns=['Coeff'], index= list_features_in)

In [179]:
fig = px.bar(df_coef['Coeff'], title=f"Features importance for target : {TARGET} with Lasso Linear Regression")
fig.show()