In [62]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.linear_model import Lasso, LogisticRegression, RidgeClassifier

In [4]:
data_merge = pd.read_excel('big_merge_V2_meteo_SAT.xlsx')

In [5]:
data_merge.drop('Unnamed: 0', axis=1, inplace=True)

Target definition :

In [87]:
TARGET =  'DEGRAD_PPL'

In [88]:
fig = px.histogram(data_merge[TARGET], nbins=100)
fig.show()

In [89]:
data_red = data_merge.loc[data_merge[TARGET]!=np.nan,:]
data_red = data_red.loc[data_merge[TARGET]!=-1,:]

In [90]:
len(data_red)

7111

In [91]:
Y = data_red[TARGET]

In [92]:
Y.value_counts()

2    2894
1    2601
3    1060
4     378
5     111
6      67
Name: DEGRAD_PPL, dtype: int64

Particular preprocessing for features :

In [93]:
data_red['LFI'] = data_red['LFI'].map({'LFI1':1,
                                       'LFI2':2,
                                       'LFI3':3,
                                       'LFI4':4})

preprocess LFI

Selection of features :

In [156]:
targets_cat__ord_feat = ['TAUX_COUV_RAJ'] #'TAUX_COUV_RAJ'

In [157]:
targets_numeric_feat = ['PERF_CROI','SURF_TER_HA', '25_GRID_PER', 'UNIT_VOL_BOIS_MANQUANT']

In [158]:
cat_strict = ['PRODREG','MODE_REGEN','INTENSITE_EXPLOIT','TRACES_FEU','NIV_DEV','RELIEF','STR_PPL', 'TYPE_FORET305', 'ESPECE_DOM'] #'TYP_RAJ_PPL','TAUX_COUV_RAJ_ASS'

In [159]:
cat_ord_miss = ['TAILLE_PPL', 'MELANGE', 'HT_VEG'] # enlever : 'LFI'

In [160]:
numerical = ['ALT','SLOPE25','QUAL_STATION','AGE_PPL','DIV_STR_PPL','DDOM','TIGES_VIV_H', 'SDI','ARBUST_PER', 'FEUILL_PER', 'CONIF_PER', 'DBH', 'HAUTEUR_ARBRE', 'AGE_ARBRE', 'PRCP', 'TAVE_AVG',	'TAVE', 'TAVE_GROWTH', 'PRCP_S_S',	'PRCP_G_S', 'NDVI', 'EVI', 'NDMI', 'NDWI', 'DSWI'] # enlever : 

Preprocessing for cat_ord_miss :

In [161]:
for cat in cat_ord_miss:
  data_red[cat] = data_red[cat].apply(lambda v : v if v!=-1 else np.nan)

In [162]:
for cat in cat_ord_miss:
  print(data_red[cat].dtypes)

float64
float64
int64


In [163]:
numerics_feats = numerical + targets_numeric_feat + cat_ord_miss
categorical_feats = cat_strict + targets_cat__ord_feat

In [164]:
len(numerics_feats)

32

In [165]:
len(categorical_feats)

10

In [166]:
data_red = data_red[[cat for cat in numerics_feats + categorical_feats]]

In [167]:
numerics_transforms = Pipeline(
    [('imputer',KNNImputer()),
    ('encoder',StandardScaler())
])
categorials_transforms = Pipeline([
    ('imputer',KNNImputer()),
    ('encoder',OneHotEncoder(drop="first"))
])

preprocessor = ColumnTransformer(
    [("num", numerics_transforms, numerics_feats),
     ("cat", categorials_transforms, categorical_feats)])

In [168]:
X_train, X_test, y_train, y_test = train_test_split(data_red, Y, test_size=0.2, random_state=2, stratify=Y)

In [169]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [170]:
np.shape(X_train)

(5688, 89)

In [171]:
y_train

5164    2
1983    3
5679    1
8542    3
6104    4
       ..
4461    2
3034    1
8668    3
9281    3
8108    1
Name: DEGRAD_PPL, Length: 5688, dtype: int64

In [172]:
model =  Lasso(max_iter=10000)

In [179]:
params = {
    'alpha':[0.0001, 0.005, 0.001]
}

In [180]:
grid = GridSearchCV(model, param_grid=params, scoring='r2', verbose=1)

In [181]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


GridSearchCV(estimator=Lasso(max_iter=10000),
             param_grid={'alpha': [0.0001, 0.005, 0.001]}, scoring='r2',
             verbose=1)

In [182]:
grid.best_estimator_

Lasso(alpha=0.0001, max_iter=10000)

In [183]:
y_pred = grid.best_estimator_.predict(X_train)
y_pred_test = grid.best_estimator_.predict(X_test)

In [184]:
r2_score(y_train, y_pred)

0.3666446909240325

In [185]:
r2_score(y_test, y_pred_test)

0.3201670220904377

In [186]:
train_scores = cross_val_score(grid.best_estimator_, X_train, y_train, cv=5, scoring='r2')
test_scores = cross_val_score(grid.best_estimator_, X_test, y_test, cv=5, scoring='r2')
print(f'Train score mean : {np.mean(train_scores)}')
print(f'Train score std : {np.std(train_scores)}')
print(f'Test score mean : {np.mean(test_scores)}')
print(f'Test score std : {np.std(test_scores)}')

Train score mean : 0.3343349367510312
Train score std : 0.045258343605944766
Test score mean : 0.29890991826795726
Test score std : 0.047839878358685826


In [187]:
len(grid.best_estimator_.coef_)

89

In [188]:
preprocessor.feature_names_in_

array(['ALT', 'SLOPE25', 'QUAL_STATION', 'AGE_PPL', 'DIV_STR_PPL', 'DDOM',
       'TIGES_VIV_H', 'SDI', 'ARBUST_PER', 'FEUILL_PER', 'CONIF_PER',
       'DBH', 'HAUTEUR_ARBRE', 'AGE_ARBRE', 'PRCP', 'TAVE_AVG', 'TAVE',
       'TAVE_GROWTH', 'PRCP_S_S', 'PRCP_G_S', 'NDVI', 'EVI', 'NDMI',
       'NDWI', 'DSWI', 'PERF_CROI', 'SURF_TER_HA', '25_GRID_PER',
       'UNIT_VOL_BOIS_MANQUANT', 'TAILLE_PPL', 'MELANGE', 'HT_VEG',
       'PRODREG', 'MODE_REGEN', 'INTENSITE_EXPLOIT', 'TRACES_FEU',
       'NIV_DEV', 'RELIEF', 'STR_PPL', 'TYPE_FORET305', 'ESPECE_DOM',
       'TAUX_COUV_RAJ'], dtype=object)

In [189]:
list_features_in = []
for feat in numerics_feats:
  list_features_in.append(feat)
for cat in categorical_feats:
  nb_lab = len(data_red[cat].unique())-1
  for i in range(nb_lab):
    list_features_in.append(f'{cat}_{i}')

In [190]:
df_coef_inter = pd.DataFrame(grid.best_estimator_.coef_)

In [191]:
df_coef = pd.DataFrame(abs(df_coef_inter).sum(), columns=['Coeff'])

In [192]:
df_coef['Features'] = list_features_in

ValueError: Length of values (89) does not match length of index (1)

In [193]:
df_coef = pd.DataFrame(grid.best_estimator_.coef_, index=list_features_in, columns=['Coeff'])

In [194]:
fig = px.bar(df_coef['Coeff'], title=f"Features importance for target : {TARGET} with Lasso Linear Regression")
fig.show()