In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.linear_model import Lasso, LogisticRegression

In [244]:
# Load the merged dataset from the 'Sheet1' worksheet of the Excel workbook.
# NOTE(review): the path is an absolute location under /content/drive — presumably
# a mounted Google Drive; confirm the drive is mounted before running this cell.
data_merge = pd.read_excel('/content/drive/MyDrive/big_merge_V2_meteo_SAT.xlsx', sheet_name='Sheet1')

In [245]:
# Remove the 'Unnamed: 0' column from the loaded frame.
# Done without inplace mutation and with errors='ignore' so that re-running this
# cell (when the column is already gone) is a no-op instead of a KeyError.
data_merge = data_merge.drop(columns='Unnamed: 0', errors='ignore')

Target definition:

In [246]:
# Name of the column used as the regression target (read into Y further down).
TARGET =  'SURF_TER_HA'

In [247]:
# Histogram (100 bins) of the target column, drawn on the unfiltered frame.
fig = px.histogram(data_merge[TARGET], nbins=100)
fig.show()

In [248]:
# Keep only the rows whose target is usable: not missing and not the -1 sentinel.
# `x != np.nan` is True for every value (NaN never compares equal to anything),
# so missing targets have to be detected with notna() instead.
valid_target = data_merge[TARGET].notna() & (data_merge[TARGET] != -1)
# .copy() gives an independent frame, so later column assignments on data_red
# neither touch data_merge nor trigger chained-assignment warnings.
data_red = data_merge.loc[valid_target, :].copy()

In [249]:
# Number of rows left after the target filter.
len(data_red)

9612

In [250]:
# Target vector; extracted here, before data_red is narrowed to the feature columns.
Y = data_red[TARGET]

In [251]:
# Display the target series.
Y

0       27.79
1       35.11
2       45.71
3       43.99
4       53.44
        ...  
9607    24.94
9608    17.05
9609    11.48
9610    26.58
9611    26.87
Name: SURF_TER_HA, Length: 9612, dtype: float64

Feature-specific preprocessing:

In [252]:
# Recode the 'LFI' labels as integers 1-4.
# Series.map leaves NaN for any value that is not a key of the mapping, so this
# cell must be run exactly once on the string-valued column.
lfi_codes = {
    'LFI1': 1,
    'LFI2': 2,
    'LFI3': 3,
    'LFI4': 4,
}
data_red['LFI'] = data_red['LFI'].map(lfi_codes)

Preprocess LFI: the labels 'LFI1'–'LFI4' are mapped to the integers 1–4.

Feature selection:

In [282]:
# Columns that are added to the categorical (one-hot encoded) feature group.
targets_cat__ord_feat = [
    'DEGRAD_PPL',
    'TAUX_COUV_RAJ',
]  # remove:

In [283]:
# Columns that are added to the numeric feature group.
# 'SURF_TER_HA' is deliberately left out of this list.
targets_numeric_feat = [
    'PERF_CROI',
    '25_GRID_PER',
    'UNIT_VOL_BOIS_MANQUANT',
    'UNIT_ACCR',
]

In [284]:
# Categorical columns that go to the one-hot encoding branch.
cat_strict = [
    'PRODREG',
    'UNIT_VEG_GROS',
    'MODE_REGEN',
    'INTENSITE_EXPLOIT',
    'TRACES_FEU',
    'NIV_DEV',
    'RELIEF',
    'DEG_FERMETURE',
    'STR_PPL',
    'TYPE_FORET305',
    'TYP_RAJ_PPL',
    'TAUX_COUV_RAJ_ASS',
    'ESPECE_DOM',
    'TYPE_FORET',
]

In [285]:
# Columns whose -1 values are recoded to NaN and that are then handled as numeric features.
cat_ord_miss = [
    'TAILLE_PPL',
    'MELANGE',
    'HT_VEG',
    'LFI',
]  # remove:

In [315]:
# Numeric feature columns (order matters: it fixes the column order of the model matrix).
numerical = [
    'ALT', 'SLOPE25', 'QUAL_STATION', 'AGE_PPL', 'DIV_STR_PPL', 'DDOM', 'TIGES_VIV_H',
    'EPIC_PER', 'SAPIN_PER', 'PIN_PER', 'MELEZ_PER', 'ARO_PER', 'OTHER_CONIF_PER',
    'HETR_PER', 'ERAB_PER', 'FREN_PER', 'CHEN_PER', 'DGANTCA', 'OTHER_FEUILL_PER',
    'CONIF_PER_STD', 'ARBUST_PER', 'FEUILL_PER', 'CONIF_PER', 'PROCESS_SILVA',
    'V_ALL', 'V_FICHT', 'V_TANNE', 'V_FOEHR', 'V_LARCH', 'V_ARVEN', 'V_UENDH',
    'V_BUCHE', 'V_AHORN', 'V_ESCHE', 'V_EICHE', 'V_CASTA', 'V_UELBH',
    'DBH', 'HAUTEUR_ARBRE', 'AGE_ARBRE',
    'PRCP', 'TAVE_AVG', 'TAVE', 'TAVE_GROWTH', 'PRCP_S_S', 'PRCP_G_S',
    'NDVI', 'EVI', 'NDMI', 'NDWI', 'DSWI',
]  # removed: 'SDI'

Preprocessing for cat_ord_miss: the -1 sentinel is replaced with NaN.

In [316]:
# Recode the -1 sentinel as a real missing value (NaN) in every cat_ord_miss column.
# Series.replace does this in one vectorised pass instead of calling a Python
# lambda once per cell; columns that contain no -1 keep their original dtype.
for cat in cat_ord_miss:
  data_red[cat] = data_red[cat].replace(-1, np.nan)

In [317]:
# Print the dtype of each recoded column, one per line, in cat_ord_miss order.
for col_name in cat_ord_miss:
  col_dtype = data_red[col_name].dtypes
  print(col_dtype)

float64
float64
int64
int64


In [318]:
# Final feature groups fed to the two branches of the preprocessing:
# numeric branch = numerical + targets_numeric_feat + cat_ord_miss (in that order),
# categorical branch = cat_strict + targets_cat__ord_feat (in that order).
numerics_feats = [*numerical, *targets_numeric_feat, *cat_ord_miss]
categorical_feats = [*cat_strict, *targets_cat__ord_feat]

In [319]:
# Number of columns in the numeric feature group.
len(numerics_feats)

59

In [320]:
# Number of columns in the categorical feature group.
len(categorical_feats)

16

In [321]:
# Narrow data_red to the model features only: numeric columns first, then categorical.
feature_columns = numerics_feats + categorical_feats
data_red = data_red[feature_columns]

In [322]:
# Numeric branch: fill missing values from the nearest neighbours, then standardise.
numerics_transforms = Pipeline([
    ('imputer', KNNImputer()),
    ('encoder', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent level, then one-hot encode.
# KNNImputer is not used here: it averages the neighbours' values, which for category
# codes can produce fractional codes that the encoder would treat as brand-new levels.
categorials_transforms = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop="first")),  # drop one level per feature to avoid collinearity
])

# Route each column group to its branch; output columns are the numeric block
# followed by the one-hot block.
preprocessor = ColumnTransformer([
    ("num", numerics_transforms, numerics_feats),
    ("cat", categorials_transforms, categorical_feats),
])

In [323]:
# Hold out 20% of the rows as a test set; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(data_red, Y, test_size=0.2, random_state=2)

In [324]:
# Fit the preprocessing on the training rows only, then apply the fitted transform to the test rows.
# NOTE(review): X_train / X_test are overwritten with the transformed output, so re-running
# this cell without re-running the split would feed already-transformed data to the preprocessor.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [325]:
# (rows, columns) of the transformed training matrix.
np.shape(X_train)

(7689, 166)

In [326]:
# Lasso regressor with the iteration cap set to 10000; alpha is tuned by the grid search below.
model =  Lasso(max_iter=10000)

In [327]:
# List the parameter names the estimator exposes (candidates for the search grid).
model.get_params().keys()

dict_keys(['alpha', 'copy_X', 'fit_intercept', 'max_iter', 'normalize', 'positive', 'precompute', 'random_state', 'selection', 'tol', 'warm_start'])

In [353]:
# Search grid: candidate values for the Lasso 'alpha' parameter.
alpha_candidates = [0.001, 0.005, 0.01]
params = dict(alpha=alpha_candidates)

In [354]:
# Grid search over `params` for `model`, ranking candidates by R² ('r2' scoring).
grid = GridSearchCV(model, param_grid=params, scoring='r2', verbose=1)

In [355]:
# Run the search on the transformed training data.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


GridSearchCV(estimator=Lasso(max_iter=10000),
             param_grid={'alpha': [0.001, 0.005, 0.01]}, scoring='r2',
             verbose=1)

In [356]:
# Show the estimator selected by the search.
grid.best_estimator_

Lasso(alpha=0.001, max_iter=10000)

In [357]:
# Predict with the selected estimator on both the training and the held-out matrices.
best_lasso = grid.best_estimator_
y_pred = best_lasso.predict(X_train)
y_pred_test = best_lasso.predict(X_test)

In [358]:
# R² of the selected model on the training rows.
r2_score(y_train, y_pred)

0.9672918837883745

In [359]:
# R² of the selected model on the held-out test rows.
r2_score(y_test, y_pred_test)

0.9674569003815918

In [360]:
# 5-fold cross-validation of the selected estimator on the training matrix.
train_scores = cross_val_score(grid.best_estimator_, X_train, y_train, cv=5)
# NOTE(review): this call re-fits the estimator on folds of the held-out test rows instead of
# scoring the already-fitted model on them — confirm this is the intended "test" metric.
test_scores = cross_val_score(grid.best_estimator_, X_test, y_test, cv=5)
# Mean and standard deviation of the per-fold scores.
print(f'Train score mean : {np.mean(train_scores)}')
print(f'Train score std : {np.std(train_scores)}')
print(f'Test score mean : {np.mean(test_scores)}')
print(f'Test score std : {np.std(test_scores)}')


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.675e+01, tolerance: 5.493e+01



Train score mean : 0.964953244104159
Train score std : 0.012627911622180658
Test score mean : 0.9597048828733581
Test score std : 0.018146197911048507


In [361]:
# Number of fitted coefficients (one per column of the transformed matrix).
len(grid.best_estimator_.coef_)

166

In [362]:
# Build one label per column of the transformed matrix, in the order the fitted
# ColumnTransformer emits them: numeric features first, then the one-hot columns.
list_features_in = list(numerics_feats)

# Take the one-hot levels from the fitted encoder rather than re-counting unique values
# in data_red: the encoder is the source of truth for which levels exist after imputation
# and which one was dropped, and labels carry the actual level value, not a position.
fitted_encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
dropped_positions = getattr(fitted_encoder, 'drop_idx_', None)
for feat_pos, (cat, levels) in enumerate(zip(categorical_feats, fitted_encoder.categories_)):
  dropped = None if dropped_positions is None else dropped_positions[feat_pos]
  for level_pos, level in enumerate(levels):
    if dropped is not None and level_pos == dropped:
      continue  # this level has no output column (drop="first")
    list_features_in.append(f'{cat}_{level}')

In [363]:
# One-column table of the fitted coefficients, indexed by feature label.
df_coef = pd.DataFrame(grid.best_estimator_.coef_, index=list_features_in, columns=['Coeff'])

In [364]:
# Bar chart of the coefficients, one bar per feature label.
fig = px.bar(df_coef['Coeff'], title=f"Features importance for target : {TARGET} with Lasso Linear Regression")
fig.show()