In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
import copy

from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, accuracy_score, f1_score

import tensorflow as tf
import tensorflow_addons as tfa

 The versions of TensorFlow you are currently using is 2.11.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [4]:
TARGET = 'TAUX_COUV_RAJ'

In [5]:
data_base = pd.read_excel('./big_merge_V2_meteo_SAT.xlsx').drop('Unnamed: 0', axis=1)

In [6]:
data_base['LFI'] = data_base['LFI'].map({'LFI1' : 1,
                                               'LFI2' : 2,
                                               'LFI3' : 3,
                                               'LFI4' : 4 })

In [7]:
data_base.sort_values(['PARCELLE', 'LFI'], inplace=True)

In [8]:
data_base.loc[(data_base[TARGET]==-1) & (data_base['LFI']==1),:]

Unnamed: 0,PARCELLE,LAT,LON,ALT,PRODREG,HT_VEG,DATE,SLOPE25,ASPECT25,ORIENTATION,...,TAVE_AVG,TAVE,TAVE_GROWTH,PRCP_S_S,PRCP_G_S,NDVI,EVI,NDMI,NDWI,DSWI
0,51,47.781623,8.612822,715.918970,1,3,1984-04-10,58.407726,67.342415,E,...,,,,0.0,0.0,0.3779,0.0146,0.2264,-0.3325,0.4705
5,384,47.736528,8.625154,563.829759,1,2,1984-04-09,55.683254,253.354935,O,...,,,,0.0,0.0,0.2861,0.0071,0.3293,-0.2327,0.3518
9,1239,47.668637,9.036432,564.885846,2,2,1985-04-01,43.496788,356.177185,N,...,,,,0.0,0.0,0.6749,0.0219,0.2021,-0.5865,0.8626
14,1419,47.660188,8.996234,563.551602,2,2,1985-03-27,29.557123,15.000126,NE,...,,,,0.0,0.0,0.6092,0.0201,0.1112,-0.5143,0.6521
19,1431,47.659089,9.076087,539.769096,2,2,1985-04-19,53.450974,57.142380,E,...,,,,0.0,0.0,0.7039,0.0245,0.1949,-0.5983,0.8834
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9595,164918,45.878794,8.997103,628.697500,5,2,1985-03-13,25.137402,288.773682,O,...,,,,0.0,0.0,0.6280,0.0208,0.1838,-0.5329,0.7604
9598,164922,45.878434,9.022856,653.469928,5,2,1985-03-15,70.188095,202.628372,SO,...,,,,0.0,0.0,0.6796,0.0253,0.2015,-0.5812,0.8639
9602,164999,45.869621,9.009721,685.404591,5,2,1985-03-11,60.882717,192.627838,S,...,,,,0.0,0.0,0.6270,0.0211,0.1165,-0.5465,0.6925
9607,165003,45.869258,9.035470,668.776978,5,2,1985-03-18,57.909958,319.882141,NO,...,,,,0.0,0.0,0.6385,0.0184,0.2485,-0.5291,0.8416


PREPROCESSING _ Code base for models temporal predictions

Ici, features engineering (création de nouvelles features à partir de la liste connues):

In [9]:
# adding aridity index
data_base["AI"] = data_base['PRCP_GROWTH'] / data_base['TAVE_GROWTH']
# adding H/D index
data_base["H_D"] = data_base['HAUTEUR_ARBRE'] / data_base['DBH']


In [10]:
cat_strict = ['PRODREG', 'ESPECE_DOM', 'TYP_RAJ_PPL', 'DEG_FERMETURE', 'STR_PPL', 'RELIEF'] #exemple 'PRODREG', 'ESPECE_DOM', 'TYP_RAJ_PPL', 'DEG_FERMETURE', 'STR_PPL', 'RELIEF'
cat_ord_miss = ['TAUX_COUV_RAJ', 'HT_VEG', 'DEGRAD_PPL'] #exemple 'TAILLE_PPL', 'MELANGE', 'QUAL_STATION', 'TAUX_COUV_RAJ', 'SURF_TROU_AER', 'HT_VEG'
numerics = ['PARCELLE', 'LFI', 'UNIT_ACCR','H_D','AI','SDI', 'AGE_PPL','ALT', 'TIGES_VIV_H', 'SURF_TER_HA', 'FEUILL_PER', 'CONIF_PER','PERF_CROI'] #exemple

add_meteo_known = ['PRCP', 'TAVE_AVG',	'TAVE', 'TAVE_GROWTH', 'PRCP_S_S',	'PRCP_G_S']

add_SAT_known = ['NDVI', 'EVI', 'NDMI', 'NDWI', 'DSWI']

In [11]:
data_red = data_base[numerics + add_meteo_known + add_SAT_known + cat_ord_miss + cat_strict]

Traitement des données catégorielles ordonnées en numériques (gestion des "-1" éventuels) :

In [12]:
for cat in cat_ord_miss:
  data_red[cat] = data_red[cat].apply(lambda v : int(v) if v!=-1 else np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_red[cat] = data_red[cat].apply(lambda v : int(v) if v!=-1 else np.nan)


In [13]:
for i in range(len(data_red)):
    for j in range(len(data_red.columns)):
        if np.isnan(data_red.iloc[i,j]) or data_red.iloc[i,j]==np.nan:
            next_value = copy.copy(data_red.iloc[i+1,j])
            data_red.iloc[i,j] = next_value

In [14]:
numerics_features = numerics + add_meteo_known + add_SAT_known

In [15]:
preprocessor_past = ColumnTransformer(
    [("num", KNNImputer(), numerics_features),
    ('ord_cat', SimpleImputer(strategy='most_frequent'), cat_ord_miss),
     ("cat_strict", SimpleImputer(strategy='most_frequent'), cat_strict)])

In [16]:
data_red_past = data_red.loc[data_red['LFI']!=4,:]
data_red_future = data_red.loc[data_red['LFI']==4,:]

In [17]:
data_part_past = pd.DataFrame(preprocessor_past.fit_transform(data_red_past), columns=data_red.columns)

In [18]:
data_red_clean = pd.concat([data_part_past, data_red_future], axis=0)

In [19]:
data_red_clean.sort_values(['PARCELLE','LFI'], inplace=True)

In [20]:
data_red_clean.drop('PARCELLE', axis=1, inplace=True)

In [21]:
numerics_features.pop(0)

'PARCELLE'

In [22]:
numerics_transforms_tot = Pipeline(
    [("imputer", SimpleImputer()),
    ('encoder',StandardScaler())
])
categorials_transforms_tot = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(drop="first"))
])

ordinal_cat_transforms_tot = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ('encoder',OrdinalEncoder())
])

preprocessor_tot = ColumnTransformer(
    [("num", numerics_transforms_tot, numerics_features),
    ("ord_cat", ordinal_cat_transforms_tot, cat_ord_miss),
     ("cat_strict", categorials_transforms_tot, cat_strict)])

In [19]:
#data_red['ORIENTATION'] = data_red['ORIENTATION'].map({'N':0,'NE':1,'E':2,'SE':3,'S':4,'SO':5,'O':6,'NO':7})
#data_red['ORIENTATION_f'] = data_red['ORIENTATION_f'].map({'N':0,'NE':1,'E':2,'SE':3,'S':4,'SO':5,'O':6,'NO':7})


In [23]:
X_train, X_test = train_test_split(data_red_clean, test_size=2000, shuffle=False, random_state=2)

In [24]:
X_train = preprocessor_tot.fit_transform(X_train)
X_test = preprocessor_tot.transform(X_test)

In [25]:
list_features_in = []
for feat in (numerics_features + cat_ord_miss):
  list_features_in.append(feat)
for cat in cat_strict:
  nb_lab = len(data_red[cat].unique())-1
  for i in range(nb_lab):
    list_features_in.append(f'{cat}_{i}')

In [26]:
df_train = pd.DataFrame(X_train, columns=list_features_in)
df_test = pd.DataFrame(X_test, columns=list_features_in)

In [27]:
df_train

Unnamed: 0,LFI,UNIT_ACCR,H_D,AI,SDI,AGE_PPL,ALT,TIGES_VIV_H,SURF_TER_HA,FEUILL_PER,...,DEG_FERMETURE_10,STR_PPL_0,STR_PPL_1,STR_PPL_2,STR_PPL_3,RELIEF_0,RELIEF_1,RELIEF_2,RELIEF_3,RELIEF_4
0,-1.341641,0.389518,-2.357521e-01,0.595565,-0.109457,-0.399088,-1.315250,0.450609,-0.284712,0.846990,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.447214,0.389518,7.387722e-01,0.595565,0.278931,-0.312550,-1.315250,0.643619,0.095678,0.763643,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.447214,0.182919,8.797411e-01,0.749904,0.741438,-0.572165,-1.315250,0.868796,0.557136,0.763643,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.341641,0.630551,8.857496e-01,-1.357143,0.782945,-0.485626,-1.315250,0.707955,0.646517,1.180378,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-1.341641,0.263263,-4.114060e-01,1.183067,0.836311,0.552834,-1.699120,-0.160587,1.045096,0.819208,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7607,1.341641,-0.310624,1.429520e+00,-0.453480,-0.269555,0.000000,1.300293,0.675787,-0.487380,-0.903298,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
7608,-1.341641,-0.253236,-1.957827e+00,0.226291,0.320438,0.795141,1.296270,0.442341,0.191296,-0.819951,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
7609,-0.447214,-0.253236,-1.444781e+00,0.226291,0.044713,2.456678,1.296270,-0.192610,0.059302,-0.903298,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
7610,0.447214,-0.069592,-1.143951e+00,0.474640,1.088318,2.456678,1.296270,0.289287,1.155784,-0.903298,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [28]:
ds_train = tf.keras.utils.timeseries_dataset_from_array(
    data=X_train,
    targets=df_train[TARGET][3:],
    sequence_length=3,
    sequence_stride=4,
    shuffle=False,
    batch_size=32)

  dataset = tf.data.Dataset.from_tensors(array[start_index:end_index])


In [29]:
for batch in ds_train.take(1):
  inputs, targets = batch

In [30]:
inputs[0]

<tf.Tensor: shape=(3, 65), dtype=float64, numpy=
array([[-1.34164079,  0.38951802, -0.23575207,  0.595565  , -0.10945651,
        -0.39908786, -1.3152496 ,  0.4506092 , -0.28471247,  0.84699012,
        -0.76347084,  1.48765271, -0.48237455,  0.64459185, -1.30976505,
        -0.75013904, -1.70349493, -1.65952795,  0.40569241,  0.86018684,
         0.79631398, -0.41554427,  0.4174749 ,  1.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ],
 

In [31]:
targets

<tf.Tensor: shape=(32,), dtype=float64, numpy=
array([1., 5., 4., 4., 3., 4., 2., 3., 4., 2., 5., 1., 4., 2., 4., 2., 2.,
       3., 2., 1., 3., 3., 5., 5., 1., 5., 3., 3., 4., 3., 2., 4.])>

In [32]:
true_values = []
for i in range(len(X_train)//4):
    true_values.append(df_train.iloc[i*4+3,:][TARGET])
true_values[0:10]

[1.0, 5.0, 4.0, 4.0, 3.0, 4.0, 2.0, 3.0, 4.0, 2.0]

In [33]:
ds_test = tf.keras.utils.timeseries_dataset_from_array(
    data=X_test,
    targets=df_test[TARGET][3:],
    sequence_length=3,
    sequence_stride=4,
    shuffle=False,
    batch_size=32)

In [34]:
model = tf.keras.models.Sequential([
        tf.keras.layers.GRU(64, input_shape=(3,65,), return_sequences=True),
        tf.keras.layers.GRU(32, return_sequences=True),
        tf.keras.layers.GRU(16, return_sequences=False),
        tf.keras.layers.Dense(6, "softmax")
    ])

In [35]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, 3, 64)             25152     
                                                                 
 gru_1 (GRU)                 (None, 3, 32)             9408      
                                                                 
 gru_2 (GRU)                 (None, 16)                2400      
                                                                 
 dense (Dense)               (None, 6)                 102       
                                                                 
Total params: 37,062
Trainable params: 37,062
Non-trainable params: 0
_________________________________________________________________


In [36]:
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=tf.keras.metrics.SparseCategoricalAccuracy())

In [37]:
model.fit(ds_train, epochs=5, validation_data=ds_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x20b84339a00>

In [38]:
true_values = []
predictions = []
for i in range(int(len(X_test)/4)):
    true_values.append(df_test.iloc[i*4+3,:][TARGET])
    predictions.append(np.argmax(np.array(model(tf.expand_dims(X_test[i*4:i*4+3], axis=0)))))


In [39]:
accuracy_score(true_values, predictions)

0.584

In [40]:
f1_score(true_values, predictions, average='micro')

0.584

In [41]:
model.layers[0].trainable_variables[0]

<tf.Variable 'gru/gru_cell/kernel:0' shape=(65, 192) dtype=float32, numpy=
array([[ 0.01048008, -0.04823827,  0.01822506, ..., -0.10715173,
        -0.07404789, -0.02912361],
       [-0.07027055,  0.1111234 ,  0.18567605, ...,  0.0129995 ,
        -0.04067235,  0.07406255],
       [ 0.03629403,  0.02296891,  0.00043701, ..., -0.05024128,
         0.11198583,  0.11958723],
       ...,
       [ 0.0945673 ,  0.04020444, -0.20314421, ..., -0.08203019,
        -0.03287517, -0.02107593],
       [ 0.04529237,  0.1396778 ,  0.0677603 , ...,  0.00675721,
        -0.01603221, -0.04973365],
       [-0.06834095,  0.04153753,  0.19447212, ..., -0.0162204 ,
         0.06791436,  0.02335494]], dtype=float32)>

In [42]:
coeff_mean = []
for i in range(65):
    coeff_mean.append(np.mean(model.layers[0].trainable_variables[0][i]))

In [43]:
df_coef = pd.DataFrame(coeff_mean, columns=['Coeff'], index= list_features_in)

In [44]:
fig = px.bar(df_coef['Coeff'], title=f"Features importance for target : {TARGET} with Lasso Linear Regression")
fig.show()