In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.linear_model import Lasso, LogisticRegression

In [2]:
# Read sheet 'Sheet1' of the merged Excel workbook into a DataFrame.
data_merge = pd.read_excel(
    io='/content/drive/MyDrive/big_merge_V2_meteo_SAT.xlsx',
    sheet_name='Sheet1',
)

In [3]:
# Remove the leftover 'Unnamed: 0' column (presumably a saved index — TODO confirm).
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises a KeyError.
data_merge = data_merge.drop(columns='Unnamed: 0', errors='ignore')

Target definition :

In [89]:
# Name of the column used as the prediction target throughout the notebook.
TARGET = "TAUX_COUV_RAJ"

In [90]:
# Histogram of the raw target column, using 100 bins.
fig = px.histogram(
    data_frame=data_merge[TARGET],
    nbins=100,
)
fig.show()

In [91]:
# Keep only the rows that carry a usable target value.
# `!= np.nan` is always True (NaN never compares equal to anything, itself
# included), so the previous first filter removed nothing; `notna()` performs
# the intended missing-value check.
# Rows whose target equals -1 are excluded as well (presumably a
# "not recorded" code — TODO confirm).
# The mask is built once from data_merge and applied to data_merge, instead of
# indexing data_red with a mask computed on a different frame.
# `.copy()` makes data_red an independent frame, so the column assignments made
# on it later do not write into a slice of data_merge.
target_is_valid = data_merge[TARGET].notna() & (data_merge[TARGET] != -1)
data_red = data_merge.loc[target_is_valid, :].copy()

In [92]:
# Number of rows left in data_red after filtering on the target.
len(data_red)

7174

In [93]:
# Target vector: the TARGET column of the filtered frame.
Y = data_red.loc[:, TARGET]

In [94]:
# Display the target series (values, length and dtype).
Y

1       2
2       2
3       2
4       6
6       4
       ..
9605    2
9606    4
9608    2
9610    2
9611    2
Name: TAUX_COUV_RAJ, Length: 7174, dtype: int64

Feature-specific preprocessing:

In [95]:
# Encode the labels 'LFI1'..'LFI4' as the integers 1..4.
# Series.map yields NaN for any value that is not one of these four keys.
data_red['LFI'] = data_red['LFI'].map({f'LFI{n}': n for n in range(1, 5)})

Preprocess `LFI`: map the labels `LFI1`–`LFI4` to the integers 1–4.

Selection of features :

In [103]:
# Target-like ordinal categorical columns used as features.
# 'TAUX_COUV_RAJ' is commented out of this list.
targets_cat__ord_feat = [
    'DEGRAD_PPL',
    # 'TAUX_COUV_RAJ',
]

In [104]:
# Target-like numeric columns used as features.
targets_numeric_feat = [
    'PERF_CROI',
    'SURF_TER_HA',
    '25_GRID_PER',
    'UNIT_VOL_BOIS_MANQUANT',
    'UNIT_ACCR',
]

In [105]:
# Strictly categorical columns (one-hot encoded later on).
# 'TYP_RAJ_PPL' and 'TAUX_COUV_RAJ_ASS' are commented out of this list.
cat_strict = [
    'PRODREG',
    'UNIT_VEG_GROS',
    'MODE_REGEN',
    'INTENSITE_EXPLOIT',
    'TRACES_FEU',
    'NIV_DEV',
    'RELIEF',
    'DEG_FERMETURE',
    'STR_PPL',
    'TYPE_FORET305',
    'ESPECE_DOM',
    'TYPE_FORET',
    # 'TYP_RAJ_PPL',
    # 'TAUX_COUV_RAJ_ASS',
]

In [106]:
# Ordinal categorical columns in which -1 is later replaced by NaN.
# Removed from this list: 'LFI'.
cat_ord_miss = [
    'TAILLE_PPL',
    'MELANGE',
    'HT_VEG',
]

In [107]:
# Plain numeric feature columns, grouped several per line in the original order.
numerical = [
    'ALT', 'SLOPE25', 'QUAL_STATION', 'AGE_PPL', 'DIV_STR_PPL', 'DDOM',
    'TIGES_VIV_H', 'SDI',
    'EPIC_PER', 'SAPIN_PER', 'PIN_PER', 'MELEZ_PER', 'ARO_PER',
    'OTHER_CONIF_PER', 'HETR_PER', 'ERAB_PER', 'FREN_PER', 'CHEN_PER',
    'DGANTCA', 'OTHER_FEUILL_PER', 'CONIF_PER_STD', 'ARBUST_PER',
    'FEUILL_PER', 'CONIF_PER', 'PROCESS_SILVA',
    'V_ALL', 'V_FICHT', 'V_TANNE', 'V_FOEHR', 'V_LARCH', 'V_ARVEN',
    'V_UENDH', 'V_BUCHE', 'V_AHORN', 'V_ESCHE', 'V_EICHE', 'V_CASTA',
    'V_UELBH',
    'DBH', 'HAUTEUR_ARBRE', 'AGE_ARBRE',
    'PRCP', 'TAVE_AVG', 'TAVE', 'TAVE_GROWTH', 'PRCP_S_S', 'PRCP_G_S',
    'NDVI', 'EVI', 'NDMI', 'NDWI', 'DSWI',
]

Preprocessing for `cat_ord_miss` (replace the -1 code with NaN):

In [108]:
# In each ordinal column, turn the -1 code into NaN so it is treated as missing.
for column in cat_ord_miss:
  data_red[column] = data_red[column].replace(-1, np.nan)

In [109]:
# Print the dtype of every ordinal column, one per line.
for dtype in data_red[cat_ord_miss].dtypes:
  print(dtype)

float64
float64
int64


In [110]:
# Columns routed to the numeric branch of the preprocessing
# (the ordinal columns are treated as numeric here).
numerics_feats = [*numerical, *targets_numeric_feat, *cat_ord_miss]
# Columns routed to the categorical (one-hot) branch.
categorical_feats = [*cat_strict, *targets_cat__ord_feat]

In [111]:
# Number of columns sent to the numeric preprocessing branch.
len(numerics_feats)

60

In [112]:
# Number of columns sent to the categorical preprocessing branch.
len(categorical_feats)

13

In [113]:
# Restrict data_red to the selected features: numeric columns first, then categorical.
# The target column is not part of either list, so it is dropped from data_red here.
data_red = data_red[numerics_feats + categorical_feats]

In [114]:
# Numeric branch: KNN imputation of missing values, then standardisation.
numerics_transforms = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('encoder', StandardScaler()),
])

# Categorical branch: KNN imputation, then one-hot encoding with the first
# category of every feature dropped.
# NOTE(review): KNNImputer fills gaps with an average over neighbours, which can
# produce non-integer codes that the encoder then treats as extra categories —
# confirm this is acceptable for these columns.
categorials_transforms = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('encoder', OneHotEncoder(drop="first")),
])

# Route each column group to its branch; output columns are "num" then "cat".
preprocessor = ColumnTransformer(transformers=[
    ("num", numerics_transforms, numerics_feats),
    ("cat", categorials_transforms, categorical_feats),
])

In [115]:
# Hold out 20 % of the rows for testing, stratified on the target,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data_red,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [116]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformation to the test split.
# Both names are rebound to the transformed output, so re-running this cell on
# its own would feed already-transformed data back into the preprocessor;
# re-run the train/test split cell first.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [117]:
# (rows, columns) of the transformed training matrix.
X_train.shape

(5739, 147)

In [118]:
# Display the training targets.
y_train

4854    2
8044    2
6661    2
3066    1
1183    2
       ..
6704    3
8640    3
7449    3
7168    2
4207    3
Name: TAUX_COUV_RAJ, Length: 5739, dtype: int64

In [119]:
# Base classifier for the grid search; max_iter is raised to 10000
# (presumably to let the solver converge — TODO confirm).
model =  LogisticRegression(max_iter=10000)

In [139]:
# Candidate values of the LogisticRegression parameter C explored by the grid search.
params = dict(C=[0.08, 0.1, 0.12])

In [140]:
# Grid search over `params`, scored by accuracy, with progress messages enabled.
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='accuracy',
    verbose=1,
)

In [141]:
# Run the grid search on the transformed training data.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


GridSearchCV(estimator=LogisticRegression(max_iter=10000),
             param_grid={'C': [0.08, 0.1, 0.12]}, scoring='accuracy',
             verbose=1)

In [142]:
# Show the estimator selected by the grid search.
grid.best_estimator_

LogisticRegression(C=0.1, max_iter=10000)

In [143]:
# Class predictions of the selected estimator on the training and test matrices.
y_pred, y_pred_test = (
    grid.best_estimator_.predict(features) for features in (X_train, X_test)
)

In [144]:
# R² between the training targets and the predicted classes.
# NOTE(review): r2_score is a regression metric, while the estimator is a
# LogisticRegression classifier tuned on accuracy — confirm that R² on class
# labels is really the intended measure.
r2_score(y_train, y_pred)

0.3646601116018672

In [145]:
# R² between the test targets and the predicted classes.
# NOTE(review): r2_score is a regression metric, while the estimator is a
# LogisticRegression classifier tuned on accuracy — confirm that R² on class
# labels is really the intended measure.
r2_score(y_test, y_pred_test)

0.330869982639466

In [146]:
# 5-fold cross-validation scores of the selected estimator on each split.
# NOTE(review): the second call re-fits copies of the estimator on folds of the
# test split; it does not score the model that was trained on X_train — confirm
# this is the intended "test" figure.
train_scores = cross_val_score(grid.best_estimator_, X_train, y_train, cv=5)
test_scores = cross_val_score(grid.best_estimator_, X_test, y_test, cv=5)
for label, scores in (('Train', train_scores), ('Test', test_scores)):
  print(f'{label} score mean : {np.mean(scores)}')
  print(f'{label} score std : {np.std(scores)}')

Train score mean : 0.4361372949885932
Train score std : 0.00799429109109187
Test score mean : 0.41602787456445994
Test score std : 0.02002802489644588


In [147]:
# Number of rows in the coefficient matrix of the selected estimator.
grid.best_estimator_.coef_.shape[0]

6

In [148]:
# Column names the preprocessor saw when it was fitted (its inputs, not its outputs).
preprocessor.feature_names_in_

array(['ALT', 'SLOPE25', 'QUAL_STATION', 'AGE_PPL', 'DIV_STR_PPL', 'DDOM',
       'TIGES_VIV_H', 'SDI', 'EPIC_PER', 'SAPIN_PER', 'PIN_PER',
       'MELEZ_PER', 'ARO_PER', 'OTHER_CONIF_PER', 'HETR_PER', 'ERAB_PER',
       'FREN_PER', 'CHEN_PER', 'DGANTCA', 'OTHER_FEUILL_PER',
       'CONIF_PER_STD', 'ARBUST_PER', 'FEUILL_PER', 'CONIF_PER',
       'PROCESS_SILVA', 'V_ALL', 'V_FICHT', 'V_TANNE', 'V_FOEHR',
       'V_LARCH', 'V_ARVEN', 'V_UENDH', 'V_BUCHE', 'V_AHORN', 'V_ESCHE',
       'V_EICHE', 'V_CASTA', 'V_UELBH', 'DBH', 'HAUTEUR_ARBRE',
       'AGE_ARBRE', 'PRCP', 'TAVE_AVG', 'TAVE', 'TAVE_GROWTH', 'PRCP_S_S',
       'PRCP_G_S', 'NDVI', 'EVI', 'NDMI', 'NDWI', 'DSWI', 'PERF_CROI',
       'SURF_TER_HA', '25_GRID_PER', 'UNIT_VOL_BOIS_MANQUANT',
       'UNIT_ACCR', 'TAILLE_PPL', 'MELANGE', 'HT_VEG', 'PRODREG',
       'UNIT_VEG_GROS', 'MODE_REGEN', 'INTENSITE_EXPLOIT', 'TRACES_FEU',
       'NIV_DEV', 'RELIEF', 'DEG_FERMETURE', 'STR_PPL', 'TYPE_FORET305',
       'ESPECE_DOM', 'TYPE_FORET', 

In [149]:
# Names of the columns produced by `preprocessor`, in output order: the numeric
# features (one column each), followed by the one-hot columns.
# The categories are read from the fitted OneHotEncoder rather than counted with
# data_red[cat].unique(): that count also included test rows and NaN, and only
# matched the encoder's output by coincidence. Each one-hot column is now named
# after the category value it encodes instead of a positional index.
fitted_encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
list_features_in = list(numerics_feats)
for cat, categories in zip(categorical_feats, fitted_encoder.categories_):
  # The encoder uses drop="first": the first category of each feature has no
  # column, so only the remaining ones are named.
  for category in categories[1:]:
    list_features_in.append(f'{cat}_{category}')

In [150]:
# Coefficient matrix of the selected estimator as a DataFrame
# (rows follow coef_, columns follow the transformed features).
df_coef_inter = pd.DataFrame(data=grid.best_estimator_.coef_)

In [151]:
# Sum of absolute coefficients per transformed feature, as a one-column frame.
df_coef = df_coef_inter.abs().sum().to_frame(name='Coeff')

In [152]:
# Attach the transformed-feature names; the list length must equal the number of rows.
df_coef = df_coef.assign(Features=list_features_in)

In [153]:
# Use the feature names as the index so they label the bars of the chart.
# Re-running this cell alone fails: 'Features' is no longer a column afterwards.
df_coef = df_coef.set_index('Features')

In [154]:
# Bar chart of the summed absolute coefficients per transformed feature.
# The title names the estimator that was actually fitted (LogisticRegression);
# it previously read "Lasso Linear Regression", which did not match the model.
fig = px.bar(df_coef['Coeff'], title=f"Features importance for target : {TARGET} with Logistic Regression")
fig.show()