In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.linear_model import Lasso, LogisticRegression

In [2]:
data_merge = pd.read_excel('/content/drive/MyDrive/big_merge_V2_meteo_SAT.xlsx', sheet_name='Sheet1')

In [3]:
data_merge.drop('Unnamed: 0', axis=1, inplace=True)

Target definition :

In [40]:
TARGET =  'NIV_DEV'

In [41]:
fig = px.histogram(data_merge[TARGET], nbins=100)
fig.show()

In [42]:
data_red = data_merge.loc[data_merge[TARGET]!=np.nan,:]
data_red = data_red.loc[data_merge[TARGET]!='NaN',:]
data_red = data_red.loc[data_merge[TARGET]!=-1,:]

In [43]:
len(data_red)

9364

In [44]:
Y = data_red[TARGET]

In [45]:
Y

0       3
1       3
2       4
3       4
4       5
       ..
9607    4
9608    5
9609    3
9610    5
9611    5
Name: NIV_DEV, Length: 9364, dtype: int64

Particular preprocessing for features :

In [46]:
data_red['LFI'] = data_red['LFI'].map({'LFI1':1,
                                       'LFI2':2,
                                       'LFI3':3,
                                       'LFI4':4})

preprocess LFI

Selection of features :

In [84]:
targets_cat__ord_feat = ['DEGRAD_PPL'] # enlever : 

In [85]:
targets_numeric_feat = ['PERF_CROI', '25_GRID_PER', 'UNIT_VOL_BOIS_MANQUANT', 'SURF_TER_HA', 'TAUX_COUV_RAJ', 'UNIT_ACCR'] # 

In [86]:
cat_strict = ['PRODREG','UNIT_VEG_GROS','MODE_REGEN','INTENSITE_EXPLOIT','TRACES_FEU','RELIEF','DEG_FERMETURE','STR_PPL', 'TYPE_FORET305', 'TYP_RAJ_PPL','TAUX_COUV_RAJ_ASS', 'ESPECE_DOM', 'TYPE_FORET'] # enlevé : , 'NIV_DEV'

In [87]:
cat_ord_miss = ['TAILLE_PPL', 'MELANGE', 'HT_VEG', 'LFI'] # enlever : 

In [88]:
numerical = ['ALT','SLOPE25','QUAL_STATION','AGE_PPL','DIV_STR_PPL','DDOM','TIGES_VIV_H', 'EPIC_PER',	'SAPIN_PER', 'PIN_PER', 'MELEZ_PER'	,'ARO_PER', 'OTHER_CONIF_PER'	,'HETR_PER' ,'ERAB_PER', 'FREN_PER', 'CHEN_PER', 'DGANTCA', 'OTHER_FEUILL_PER',	'CONIF_PER_STD', 'ARBUST_PER', 'FEUILL_PER', 'CONIF_PER', 'PROCESS_SILVA', 'V_ALL',	'V_FICHT', 'V_TANNE',	'V_FOEHR', 'V_LARCH',	'V_ARVEN', 'V_UENDH',	'V_BUCHE', 'V_AHORN',	'V_ESCHE',	'V_EICHE', 'V_CASTA',	'V_UELBH', 'DBH', 'HAUTEUR_ARBRE', 'AGE_ARBRE', 'PRCP', 'TAVE_AVG',	'TAVE', 'TAVE_GROWTH', 'PRCP_S_S',	'PRCP_G_S', 'NDVI', 'EVI', 'NDMI', 'NDWI', 'DSWI'] # enlever : 'SDI'

Preprocessing for cat_ord_miss :

In [89]:
for cat in cat_ord_miss:
  data_red[cat] = data_red[cat].apply(lambda v : v if v!=-1 else np.nan)

In [90]:
for cat in cat_ord_miss:
  print(data_red[cat].dtypes)

float64
int64
int64
int64


In [91]:
numerics_feats = numerical + targets_numeric_feat + cat_ord_miss
categorical_feats = cat_strict + targets_cat__ord_feat

In [92]:
len(numerics_feats)

61

In [93]:
len(categorical_feats)

14

In [94]:
data_red = data_red[[cat for cat in numerics_feats + categorical_feats]]

In [95]:
numerics_transforms = Pipeline(
    [('imputer',KNNImputer()),
    ('encoder',StandardScaler())
])
categorials_transforms = Pipeline([
    ('imputer',KNNImputer()),
    ('encoder',OneHotEncoder(drop="first"))
])

preprocessor = ColumnTransformer(
    [("num", numerics_transforms, numerics_feats),
     ("cat", categorials_transforms, categorical_feats)])

In [96]:
X_train, X_test, y_train, y_test = train_test_split(data_red, Y, test_size=0.2, random_state=2)

In [97]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [98]:
np.shape(X_train)

(7491, 154)

In [99]:
model =  LogisticRegression(max_iter=10000)

In [100]:
params = {
    'C':[0.01, 0.05, 0.1]
}

In [101]:
grid = GridSearchCV(model, param_grid=params, scoring='r2', verbose=1)

In [102]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


GridSearchCV(estimator=LogisticRegression(max_iter=10000),
             param_grid={'C': [0.01, 0.05, 0.1]}, scoring='r2', verbose=1)

In [103]:
grid.best_estimator_

LogisticRegression(C=0.1, max_iter=10000)

In [104]:
y_pred = grid.best_estimator_.predict(X_train)
y_pred_test = grid.best_estimator_.predict(X_test)

In [105]:
r2_score(y_train, y_pred)

0.8828277652954506

In [106]:
r2_score(y_test, y_pred_test)

0.85596968736496

In [107]:
train_scores = cross_val_score(grid.best_estimator_, X_train, y_train, cv=5)
test_scores = cross_val_score(grid.best_estimator_, X_test, y_test, cv=5)
print(f'Train score mean : {np.mean(train_scores)}')
print(f'Train score std : {np.std(train_scores)}')
print(f'Test score mean : {np.mean(test_scores)}')
print(f'Test score std : {np.std(test_scores)}')

Train score mean : 0.9411289769503657
Train score std : 0.005531248243912271
Test score mean : 0.9369896613190731
Test score std : 0.014730702088888057


In [108]:
len(grid.best_estimator_.coef_)

6

In [109]:
list_features_in = []
for feat in numerics_feats:
  list_features_in.append(feat)
for cat in categorical_feats:
  nb_lab = len(data_red[cat].unique())-1
  for i in range(nb_lab):
    list_features_in.append(f'{cat}_{i}')

In [113]:
coeff_arr = pd.DataFrame(abs(grid.best_estimator_.coef_)).sum().to_numpy()

In [114]:
df_coef = pd.DataFrame(coeff_arr, index=list_features_in, columns=['Coeff'])

In [115]:
fig = px.bar(df_coef['Coeff'], title=f"Features importance for target : {TARGET} with Lasso Linear Regression")
fig.show()