In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import copy

from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, accuracy_score, f1_score

import tensorflow as tf
import tensorflow_addons as tfa

 The versions of TensorFlow you are currently using is 2.11.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
TARGET = 'UNIT_ACCR'

In [3]:
data_base = pd.read_excel('./big_merge_V2_meteo_SAT.xlsx').drop('Unnamed: 0', axis=1)

In [4]:
data_base['LFI'] = data_base['LFI'].map({'LFI1' : 1,
                                               'LFI2' : 2,
                                               'LFI3' : 3,
                                               'LFI4' : 4 })

In [5]:
data_base.sort_values(['PARCELLE', 'LFI'], inplace=True)

In [6]:
data_base.loc[(data_base[TARGET]==-1) & (data_base['LFI']==3),:]

Unnamed: 0,PARCELLE,LAT,LON,ALT,PRODREG,HT_VEG,DATE,SLOPE25,ASPECT25,ORIENTATION,...,TAVE_AVG,TAVE,TAVE_GROWTH,PRCP_S_S,PRCP_G_S,NDVI,EVI,NDMI,NDWI,DSWI
5662,121680,46.493994,8.728201,1119.142614,5,4,2006-08-16,43.458138,74.709961,E,...,8.9336,15.19438,15.41815,102.8456,50.950124,0.2388,0.0037,0.0668,-0.2424,0.3222
6826,139882,46.328846,8.984039,505.677744,5,2,2006-07-17,102.830132,252.158859,O,...,7.7581,21.58064,14.82531,111.764364,54.721718,0.5708,0.0161,0.1554,-0.5078,0.6716
6993,141361,46.321275,7.607429,916.033298,4,2,2006-09-26,59.640934,160.228271,S,...,7.7967,16.13763,14.52905,115.40524,46.698399,0.3995,0.0123,0.0118,-0.388,0.3978
7167,143006,46.302158,7.970794,990.744351,4,3,2006-04-18,43.76609,349.334534,NO,...,9.4048,8.387802,15.11201,110.164174,50.445878,0.3338,0.0081,0.1419,-0.305,0.3759
7737,146688,46.247722,8.994704,840.439572,5,2,2006-11-23,83.602562,128.077576,SE,...,8.61,6.20239,14.6746,112.858573,53.017572,0.6496,0.0229,0.1078,-0.5694,0.713


PREPROCESSING _ Code base for models temporal predictions

Ici, features engineering (création de nouvelles features à partir de la liste connues):

In [7]:
# adding aridity index
data_base["AI"] = data_base['PRCP_GROWTH'] / data_base['TAVE_GROWTH']
# adding H/D index
data_base["H_D"] = data_base['HAUTEUR_ARBRE'] / data_base['DBH']


In [8]:
cat_strict = ['PRODREG', 'ESPECE_DOM', 'TYP_RAJ_PPL', 'DEG_FERMETURE', 'STR_PPL', 'RELIEF'] #exemple 'PRODREG', 'ESPECE_DOM', 'TYP_RAJ_PPL', 'DEG_FERMETURE', 'STR_PPL', 'RELIEF'
cat_ord_miss = ['TAUX_COUV_RAJ', 'HT_VEG'] #exemple 'TAILLE_PPL', 'MELANGE', 'QUAL_STATION', 'TAUX_COUV_RAJ', 'SURF_TROU_AER', 'HT_VEG'
numerics = ['PARCELLE', 'LFI', 'UNIT_ACCR','H_D','AI','SDI', 'AGE_PPL','ALT', 'TIGES_VIV_H', 'SURF_TER_HA', 'FEUILL_PER', 'CONIF_PER','PERF_CROI'] #exemple

add_meteo_known = ['PRCP', 'TAVE_AVG',	'TAVE', 'TAVE_GROWTH', 'PRCP_S_S',	'PRCP_G_S']

add_SAT_known = ['NDVI', 'EVI', 'NDMI', 'NDWI', 'DSWI']

In [9]:
data_red = data_base[numerics + add_meteo_known + add_SAT_known + cat_ord_miss + cat_strict]

Traitement des données catégorielles ordonnées en numériques (gestion des "-1" éventuels) :

In [10]:
for cat in cat_ord_miss:
  data_red[cat] = data_red[cat].apply(lambda v : int(v) if v!=-1 else np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_red[cat] = data_red[cat].apply(lambda v : int(v) if v!=-1 else np.nan)


In [11]:
for i in range(len(data_red)):
    for j in range(len(data_red.columns)):
        if np.isnan(data_red.iloc[i,j]) or data_red.iloc[i,j]==np.nan:
            next_value = copy.copy(data_red.iloc[i+1,j])
            data_red.iloc[i,j] = next_value

In [12]:
numerics_features = numerics + add_meteo_known + add_SAT_known

In [13]:
preprocessor_past = ColumnTransformer(
    [("num", KNNImputer(), numerics_features),
    ('ord_cat', SimpleImputer(strategy='most_frequent'), cat_ord_miss),
     ("cat_strict", SimpleImputer(strategy='most_frequent'), cat_strict)])

In [14]:
data_red_past = data_red.loc[data_red['LFI']!=4,:]
data_red_future = data_red.loc[data_red['LFI']==4,:]

In [15]:
data_part_past = pd.DataFrame(preprocessor_past.fit_transform(data_red_past), columns=data_red.columns)

In [16]:
data_red_clean = pd.concat([data_part_past, data_red_future], axis=0)

In [17]:
data_red_clean.sort_values(['PARCELLE','LFI'], inplace=True)

In [18]:
data_red_clean.drop('PARCELLE', axis=1, inplace=True)

In [19]:
numerics_features.pop(0)

'PARCELLE'

In [20]:
numerics_transforms_tot = Pipeline(
    [("imputer", SimpleImputer()),
    ('encoder',StandardScaler())
])
categorials_transforms_tot = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(drop="first"))
])

ordinal_cat_transforms_tot = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ('encoder',OrdinalEncoder())
])

preprocessor_tot = ColumnTransformer(
    [("num", numerics_transforms_tot, numerics_features),
    ("ord_cat", ordinal_cat_transforms_tot, cat_ord_miss),
     ("cat_strict", categorials_transforms_tot, cat_strict)])

In [19]:
#data_red['ORIENTATION'] = data_red['ORIENTATION'].map({'N':0,'NE':1,'E':2,'SE':3,'S':4,'SO':5,'O':6,'NO':7})
#data_red['ORIENTATION_f'] = data_red['ORIENTATION_f'].map({'N':0,'NE':1,'E':2,'SE':3,'S':4,'SO':5,'O':6,'NO':7})


In [21]:
X_train, X_test = train_test_split(data_red_clean, test_size=2000, shuffle=False, random_state=2)

In [22]:
X_train = preprocessor_tot.fit_transform(X_train)
X_test = preprocessor_tot.transform(X_test)

In [23]:
list_features_in = []
for feat in (numerics_features + cat_ord_miss):
  list_features_in.append(feat)
for cat in cat_strict:
  nb_lab = len(data_red[cat].unique())-1
  for i in range(nb_lab):
    list_features_in.append(f'{cat}_{i}')

In [24]:
df_train = pd.DataFrame(X_train, columns=list_features_in)
df_test = pd.DataFrame(X_test, columns=list_features_in)

In [25]:
ds_train = tf.keras.utils.timeseries_dataset_from_array(
    data=X_train,
    targets=df_train[TARGET][3:],
    sequence_length=3,
    sequence_stride=4,
    shuffle=False,
    batch_size=32)

  dataset = tf.data.Dataset.from_tensors(array[start_index:end_index])


In [26]:
for batch in ds_train.take(1):
  inputs, targets = batch

In [27]:
inputs[0]

<tf.Tensor: shape=(3, 64), dtype=float64, numpy=
array([[-1.34164079,  0.38951802, -0.23575207,  0.595565  , -0.10945651,
        -0.39908786, -1.3152496 ,  0.4506092 , -0.28471247,  0.84699012,
        -0.76347084,  1.48765271, -0.48237455,  0.64459185, -1.30976505,
        -0.75013904, -1.70349493, -1.65952795,  0.40569241,  0.86018684,
         0.79631398, -0.41554427,  0.4174749 ,  1.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ],
       [-0.447

In [28]:
targets

<tf.Tensor: shape=(32,), dtype=float64, numpy=
array([ 0.63055065, -0.058114  , -2.0896747 ,  2.01935768, -0.59756797,
       -1.17145517, -0.16141369,  0.36656254, -0.88451157, -0.72382315,
        0.87158328,  0.86010553, -0.87303383, -0.40244632, -0.68938992,
        1.05522718,  0.18291863,  1.05522718,  0.45838449,  1.44547048,
        0.74532809,  1.08966041,  0.09109668, -0.82712285,  3.14417661,
        0.11405217,  1.0093162 , -1.65352042, -0.10402497, -0.17289144,
       -0.19584692, -1.32066585])>

In [29]:
true_values = []
for i in range(len(X_train)//4):
    true_values.append(df_train.iloc[i*4+3,:][TARGET])
true_values[0:10]

[0.6305506497883541,
 -0.05811399566571488,
 -2.089674699755218,
 2.019357684787393,
 -0.5975679679380689,
 -1.1714551724831261,
 -0.16141369248382526,
 0.36656253569762753,
 -0.8845115702105976,
 -0.7238231529379815]

In [30]:
ds_test = tf.keras.utils.timeseries_dataset_from_array(
    data=X_test,
    targets=df_test[TARGET][3:],
    sequence_length=3,
    sequence_stride=4,
    shuffle=False,
    batch_size=32)

In [31]:
model = tf.keras.models.Sequential([
        tf.keras.layers.GRU(128, input_shape=(3,64,), return_sequences=True),
        tf.keras.layers.GRU(64, return_sequences=True),
        tf.keras.layers.GRU(32, return_sequences=False),
        tf.keras.layers.Dense(8, "linear"),
        tf.keras.layers.Dense(1, "linear")
    ])

In [32]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, 3, 128)            74496     
                                                                 
 gru_1 (GRU)                 (None, 3, 64)             37248     
                                                                 
 gru_2 (GRU)                 (None, 32)                9408      
                                                                 
 dense (Dense)               (None, 8)                 264       
                                                                 
 dense_1 (Dense)             (None, 1)                 9         
                                                                 
Total params: 121,425
Trainable params: 121,425
Non-trainable params: 0
_________________________________________________________________


In [33]:
model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=tfa.metrics.RSquare())

In [35]:
model.fit(ds_train, epochs=5, validation_data=ds_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1b31b1bfe80>

In [99]:
true_values = []
predictions = []
for i in range(int(len(X_test)/4)):
    true_values.append(df_test.iloc[i*4+3,:][TARGET])
    predictions.append(np.array(model(tf.expand_dims(X_test[i*4:i*4+3], axis=0))))


In [103]:
r2_score(true_values, np.array(predictions).ravel())

0.767219143553783

In [104]:
model.layers[0].trainable_variables[0]

<tf.Variable 'gru_32/gru_cell_32/kernel:0' shape=(64, 384) dtype=float32, numpy=
array([[ 0.06008945, -0.0208231 ,  0.05647784, ..., -0.0252794 ,
         0.07640408, -0.04780112],
       [-0.10494728, -0.06508765,  0.11623695, ..., -0.03886441,
        -0.00513382, -0.04098175],
       [ 0.02636033, -0.07488547, -0.03994165, ...,  0.0994485 ,
        -0.06447626,  0.08275823],
       ...,
       [ 0.05040737, -0.09723269,  0.10408894, ..., -0.04169362,
         0.0207759 , -0.09330749],
       [-0.07891459, -0.1026121 ,  0.11228173, ..., -0.07103606,
         0.04577089,  0.11043668],
       [-0.04790527, -0.14083073, -0.03386193, ..., -0.00904732,
        -0.09307924,  0.09162017]], dtype=float32)>

In [108]:
coeff_mean = []
for i in range(64):
    coeff_mean.append(np.mean(model.layers[0].trainable_variables[0][i]))

In [109]:
df_coef = pd.DataFrame(coeff_mean, columns=['Coeff'], index= list_features_in)

In [110]:
fig = px.bar(df_coef['Coeff'], title=f"Features importance for target : {TARGET} with Lasso Linear Regression")
fig.show()