In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import copy

from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, accuracy_score, f1_score

import tensorflow as tf
import tensorflow_addons as tfa

 The versions of TensorFlow you are currently using is 2.11.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
TARGET = 'TAUX_COUV_RAJ'

In [147]:
data_base = pd.read_excel('./big_merge_V2_meteo_SAT.xlsx').drop('Unnamed: 0', axis=1)

In [148]:
data_base['LFI'] = data_base['LFI'].map({'LFI1' : 1,
                                               'LFI2' : 2,
                                               'LFI3' : 3,
                                               'LFI4' : 4 })

In [149]:
data_base['ORIENTATION'] = data_base['ORIENTATION'].map({'N':0,'NE':1,'E':2,'SE':3,'S':4,'SO':5,'O':6,'NO':7})
#data_red['ORIENTATION_f'] = data_red['ORIENTATION_f'].map({'N':0,'NE':1,'E':2,'SE':3,'S':4,'SO':5,'O':6,'NO':7})


In [150]:
data_base.sort_values(['PARCELLE', 'LFI'], inplace=True)

In [151]:
data_base.loc[(data_base[TARGET]==-1) & (data_base['LFI']==4),:]

Unnamed: 0,PARCELLE,LAT,LON,ALT,PRODREG,HT_VEG,DATE,SLOPE25,ASPECT25,ORIENTATION,...,TAVE_AVG,TAVE,TAVE_GROWTH,PRCP_S_S,PRCP_G_S,NDVI,EVI,NDMI,NDWI,DSWI


PREPROCESSING _ Code base for models temporal predictions

Ici, features engineering (création de nouvelles features à partir de la liste connues):

In [152]:
# adding aridity index
data_base["AI"] = data_base['PRCP_GROWTH'] / data_base['TAVE_GROWTH']
# adding H/D index
data_base["H_D"] = data_base['HAUTEUR_ARBRE'] / data_base['DBH']


In [153]:

# --- PAST ---
cat_strict_past = ['PRODREG', 'ESPECE_DOM', 'TYP_RAJ_PPL', 'ORIENTATION', 'RELIEF'] #exemple 'PRODREG', 'ESPECE_DOM', 'TYP_RAJ_PPL', 'DEG_FERMETURE', 'STR_PPL', 'RELIEF'
cat_ord_past = ['TAUX_COUV_RAJ', 'HT_VEG', 'DEGRAD_PPL', 'NIV_DEV', 'QUAL_STATION'] #exemple 'TAILLE_PPL', 'MELANGE', 'QUAL_STATION', 'TAUX_COUV_RAJ', 'SURF_TROU_AER', 'HT_VEG'
numerics_past = [ 'LFI', 'SLOPE25', '25_GRID_PER', 'UNIT_ACCR','H_D','AI','SDI', 'AGE_PPL','ALT', 'TIGES_VIV_H', 'SURF_TER_HA', 'FEUILL_PER', 'CONIF_PER','PERF_CROI'] #exemple

# --- FUTURE ---
cat_strict_future = ['ORIENTATION']
cat_ord_future = ['QUAL_STATION']
numerics_future = ['LFI', 'ALT', 'SLOPE25', 'PERF_CROI']
add_meteo_known = ['PRCP', 'TAVE_AVG',	'TAVE', 'TAVE_GROWTH', 'PRCP_S_S',	'PRCP_G_S', 'AI']
add_SAT_known = ['NDVI', 'EVI', 'NDMI', 'NDWI', 'DSWI']

In [154]:
feats_past = cat_strict_past + cat_ord_past + numerics_past
feats_future_base = cat_strict_future + cat_ord_future + numerics_future + add_meteo_known + add_SAT_known

In [221]:
feats_future_f_names = []
feats_future_f_ord = []
feats_future_f_num = []
feats_future_f_cat_strict  = []

for cat in feats_future_base:
    feat_list = data_base[cat].to_list()
    data_base[cat + "_f"] = feat_list
    feats_future_f_names.append(cat + '_f')
    if cat in cat_ord_future:
        feats_future_f_ord.append(cat + "_f")
    if cat in cat_strict_future:
        feats_future_f_cat_strict.append(cat + "_f")
    if cat in (numerics_future + add_SAT_known + add_meteo_known):
        feats_future_f_num.append(cat + "_f")

In [222]:
feats_total = feats_past + feats_future_f_names

In [223]:
data_red = data_base[feats_total]

Traitement des données catégorielles ordonnées en numériques (gestion des "-1" éventuels) :

In [224]:
for cat in (cat_ord_past + feats_future_f_ord):
  data_red[cat] = data_red[cat].apply(lambda v : int(v) if v!=-1 else np.nan)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Tentative préalable de missing value par la valeur t+1 pour LFI1, t-1 ou t+1 pour LFI2, t-1 pour LFI3 :

In [225]:
for i in range(len(data_red)):
    for j in range(len(data_red.columns)):
        if i%4==0:
            if np.isnan(data_red.iloc[i,j]) or data_red.iloc[i,j]==np.nan:
                next_value = copy.copy(data_red.iloc[i+1,j])
                data_red.iloc[i,j] = next_value
        elif i%4==1:
            if np.isnan(data_red.iloc[i,j]) or data_red.iloc[i,j]==np.nan:
                if np.isnan(data_red.iloc[i-1,j]) or data_red.iloc[i-1,j]==np.nan:
                    past_value = copy.copy(data_red.iloc[i-1,j])
                    data_red.iloc[i,j] = past_value
                else:
                    next_value = copy.copy(data_red.iloc[i+1,j])
                    data_red.iloc[i,j] = next_value
        elif i%4==2:
            if np.isnan(data_red.iloc[i,j]) or data_red.iloc[i,j]==np.nan:
                past_value = copy.copy(data_red.iloc[i-1,j])
                data_red.iloc[i,j] = past_value

In [226]:
ordinal_cat_tot = cat_ord_past + feats_future_f_ord
categorials_strict_tot = cat_strict_past + feats_future_f_cat_strict

In [227]:
numerics_transforms_past = Pipeline(
    [("imputer", KNNImputer()),
    ('encoder',StandardScaler())
])

numerics_transforms_future = Pipeline(
    [("imputer", SimpleImputer()),
    ('encoder',StandardScaler())
])

categorials_transforms = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(drop="first"))
])

ordinal_cat_transforms = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ('encoder',OrdinalEncoder())
])

preprocessor = ColumnTransformer(
    [("num_past", numerics_transforms_past, numerics_past),
    ('num_future', numerics_transforms_future, feats_future_f_num),
    ("ord_cat", ordinal_cat_transforms, ordinal_cat_tot),
     ("cat_strict", categorials_transforms, categorials_strict_tot)])

In [231]:
X_train, X_test = train_test_split(data_red, test_size=2000, shuffle=False, random_state=2)

In [232]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [233]:
np.shape(X_train)

(7612, 74)

In [234]:
list_features_in = []
list_cat_strict_past = []
list_cat_strict_future = []

for feat in (numerics_past + feats_future_f_num + ordinal_cat_tot):
  list_features_in.append(feat)

for cat in cat_strict_past:
  nb_lab = len(data_red[cat].unique())-1
  for i in range(nb_lab):
    list_features_in.append(f'{cat}_{i}')
    list_cat_strict_past.append(f'{cat}_{i}')
  
for cat in feats_future_f_cat_strict:
  nb_lab = len(data_red[cat].unique())-1
  for i in range(nb_lab):
    list_features_in.append(f'{cat}_{i}')
    list_cat_strict_future.append(f'{cat}_{i}')

In [235]:
df_train = pd.DataFrame(X_train, columns=list_features_in)
df_test = pd.DataFrame(X_test, columns=list_features_in)

In [236]:
len(feats_future_f_num + feats_future_f_ord + list_cat_strict_future)

24

In [237]:
list_feats_future = feats_future_f_num + feats_future_f_ord + list_cat_strict_future

In [239]:
df_train_past = df_train[numerics_past + cat_ord_past + list_cat_strict_past]
df_train_future = df_train[feats_future_f_num + feats_future_f_ord + list_cat_strict_future]

df_test_past = df_test[numerics_past + cat_ord_past + list_cat_strict_past]
df_test_future = df_test[feats_future_f_num + feats_future_f_ord + list_cat_strict_future]

In [243]:
X_train_past = df_train_past.iloc[[i for i in range(len(df_train_past)) if i%4!=3],:].to_numpy()
X_train_past_3D = X_train_past.reshape(len(X_train_past)//3, 3, np.shape(X_train_past)[1])
train_tensor_past = tf.convert_to_tensor(X_train_past_3D)

In [244]:
X_test_past = df_test_past.iloc[[i for i in range(len(df_test_past)) if i%4!=3],:].to_numpy()
X_test_past_3D = X_test_past.reshape(len(X_test_past)//3, 3, np.shape(X_test_past)[1])
test_tensor_past = tf.convert_to_tensor(X_test_past_3D)

In [245]:
X_train_future = df_train_future.iloc[[i for i in range(len(df_train_future)) if i%4==3],:].to_numpy()
train_tensor_future = tf.convert_to_tensor(X_train_future)

In [246]:
X_test_future = df_test_future.iloc[[i for i in range(len(df_test_future)) if i%4==3],:].to_numpy()
test_tensor_future = tf.convert_to_tensor(X_test_future)

In [247]:
targets_train = []
for i in range(len(X_train)//4):
    targets_train.append(df_train.iloc[i*4+3,:][TARGET])
y_train = tf.convert_to_tensor(targets_train)

In [248]:
targets_test = []
for i in range(len(X_test)//4):
    targets_test.append(df_test.iloc[i*4+3,:][TARGET])
y_test = tf.convert_to_tensor(targets_test)

In [249]:
GRU_past = tf.keras.models.Sequential([
        tf.keras.layers.GRU(64, input_shape=(3,np.shape(X_train_past)[1],), return_sequences=True),
        tf.keras.layers.GRU(32, return_sequences=True),
        tf.keras.layers.GRU(16, return_sequences=False),
        tf.keras.layers.Dropout(0.1)
    ])

In [250]:
GRU_past(tf.expand_dims(train_tensor_past[0], axis=0))

<tf.Tensor: shape=(1, 16), dtype=float32, numpy=
array([[-0.01906813,  0.02465295, -0.18239641,  0.10592219,  0.00374953,
         0.0020334 , -0.0029473 ,  0.08160238,  0.16122487,  0.03177267,
         0.15925013,  0.12936552,  0.08152311, -0.01394751,  0.031809  ,
        -0.10060655]], dtype=float32)>

In [251]:
MLD_future = tf.keras.models.Sequential([
        tf.keras.layers.Dense(64, input_shape=(np.shape(X_train_future)[1],), activation='relu'),
        tf.keras.layers.Dense(32, 'relu'),
        tf.keras.layers.Dense(16, 'relu'),
        tf.keras.layers.Dropout(0.1)
    ])

In [252]:
MLD_future(tf.expand_dims(train_tensor_future[0], axis=0))

<tf.Tensor: shape=(1, 16), dtype=float32, numpy=
array([[0.        , 0.        , 0.        , 0.6695769 , 0.        ,
        0.29599252, 0.5569536 , 1.6155452 , 0.5588567 , 0.        ,
        0.20568062, 0.        , 0.6619757 , 0.89234275, 0.03911621,
        0.        ]], dtype=float32)>

In [253]:
input_GRU = tf.keras.layers.Input(shape=(3,np.shape(X_train_past)[1],))

output_GRU = GRU_past(input_GRU)

input_MLD = tf.keras.layers.Input(shape=(np.shape(X_train_future)[1],))

output_MLD = MLD_future(input_MLD)

x = tf.keras.layers.Concatenate(axis=1)([output_GRU, output_MLD])

output = tf.keras.layers.Dense(6, 'softmax')(x)

In [254]:
model = tf.keras.models.Model(inputs=[input_GRU, input_MLD], outputs=output)

In [255]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_8 (InputLayer)           [(None, 3, 50)]      0           []                               
                                                                                                  
 input_9 (InputLayer)           [(None, 24)]         0           []                               
                                                                                                  
 sequential_5 (Sequential)      (None, 16)           34080       ['input_8[0][0]']                
                                                                                                  
 sequential_6 (Sequential)      (None, 16)           4208        ['input_9[0][0]']                
                                                                                            

In [256]:
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=tf.keras.metrics.SparseCategoricalAccuracy())

In [258]:
model.fit(
    x=[train_tensor_past, train_tensor_future],
     y=y_train,
     epochs=5,
     batch_size=128,
     validation_data=([test_tensor_past, test_tensor_future],
     y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1683f6a8760>

In [259]:
coeff_mean_GRU = []
for i in range(np.shape(X_train_past)[1]):
    coeff_mean_GRU.append(np.mean(model.layers[2].trainable_variables[0][i]))

In [260]:
df_coef = pd.DataFrame(coeff_mean_GRU, columns=['Coeff_GRU'], index= (numerics_past + cat_ord_past + list_cat_strict_past) )

In [261]:
fig = px.bar(df_coef['Coeff_GRU'], title=f"Features importance for target : {TARGET} in GRU Layers")
fig.show()

In [262]:
coeff_mean_MLP = []
for i in range(np.shape(X_train_future)[1]):
    coeff_mean_MLP.append(np.mean(model.layers[3].trainable_variables[0][i]))

In [263]:
X_train_future[0]

array([ 1.34164079, -1.3152496 ,  0.02018139,  1.48765271, -2.3138333 ,
        0.87323521, -2.41662254,  0.2591452 ,  0.65870923,  0.76074013,
       -1.35714306, -1.92562728, -2.61876463, -1.12007659,  1.9589205 ,
       -1.59354667,  3.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ])

In [264]:
df_coef_2 = pd.DataFrame(coeff_mean_MLP, columns=['Coeff_MLP'], index= feats_future_f_num + feats_future_f_ord + list_cat_strict_future)

In [265]:
fig = px.bar(df_coef_2['Coeff_MLP'], title=f"Features importance for target : {TARGET} in MLP Dense Layers")
fig.show()