In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Lasso, Ridge, LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_percentage_error, accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

In [2]:
data = pd.read_excel("/Users/lamothemarjory/Downloads/Projet Full Stack/big_merge_V2_meteo.xlsx")

In [3]:
to_drop = ["ETAGE", "ENDOMMAGEMENT", "NB_DEGAT_ARBRE", 'PERF_CROI', 'SURF_TER_HA', 'DEGRAD_PPL', '25_GRID_PER', 
            'VOL_BOIS_MANQUANT', 'ACCR', 'UNIT_ACCR', 'UNIT_VOL_BOIS_MANQUANT']
variables_to_keep = [col for col in data.columns if col not in to_drop]
data = data.loc[:,variables_to_keep]

In [4]:
numeric_features = ['ALT', 'SLOPE25', 'AGE_PPL', 'DIV_STR_PPL', 'DDOM', 'DEG_SUR_PER', 'FEUILL_PER', 'CONIF_PER', 'SURF_TROU_AER', 'DBH', 'HAUTEUR_ARBRE', 'AGE_ARBRE',
                    'PRCP_SUM', 'PRCP', 'PRCP_GROWTH', 'TAVE_AVG', 'TAVE', 'TAVE_GROWTH']
categorical_features = ['LFI','ORIENTATION', 'RELIEF', 'PRODREG', 'PERI_CHENAUX', 'PERI_COULEES', 'PERI_AVALANCH', 'PERI_CHUTES',
                        'MODE_REGEN', 'NIV_DEV', 'MELANGE', 'TYP_RAJ_PPL', 'ESPECE_DOM', 'FEU_RES', 'TYPE_FORET',
                        'TRACES_FEU', 'INTENSITE_EXPLOIT', 'INT_IFN2_IFN3', 'INT_IFN3-IFN4']
ordinal_categorical_features = ['HT_VEG', 'QUAL_STATION', 'REBOISEMENT_AN', 'TAILLE_PPL']

In [5]:
features_list = ['ALT', 'SLOPE25', 'AGE_PPL', 'DIV_STR_PPL', 'DDOM', 'DEG_SUR_PER', 'FEUILL_PER', 'CONIF_PER', 'SURF_TROU_AER', 'DBH', 'HAUTEUR_ARBRE', 'AGE_ARBRE',
                'PRCP_SUM', 'PRCP', 'PRCP_GROWTH', 'TAVE_AVG', 'TAVE', 'TAVE_GROWTH',
                
                'HT_VEG', 'QUAL_STATION', 'REBOISEMENT_AN', 'TAILLE_PPL',

                'LFI','ORIENTATION', 'RELIEF', 'PRODREG', 'PERI_CHENAUX', 'PERI_COULEES', 'PERI_AVALANCH', 'PERI_CHUTES',
                'MODE_REGEN', 'NIV_DEV', 'MELANGE', 'TYP_RAJ_PPL', 'ESPECE_DOM', 'FEU_RES', 'TYPE_FORET',
                'TRACES_FEU', 'INTENSITE_EXPLOIT', 'INT_IFN2_IFN3', 'INT_IFN3-IFN4']

target_variable = 'TAUX_COUV_RAJ'

X = data.drop(target_variable, axis = 1)
Y = data.loc[:,target_variable]


In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify = Y)

In [7]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(drop='first'))
    ])

ordinal_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordi_encoder', OrdinalEncoder(encoded_missing_value=0))
    ])

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('ordi', ordinal_transformer, ordinal_categorical_features)
    ])

In [9]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [10]:
regressor = Lasso()
params = {
    'alpha': [0.000531, 0.000533, 0.000535, 0.000538, 0.00540]
}
gridsearch = GridSearchCV(regressor, param_grid = params, cv = 3)
gridsearch.fit(X_train, Y_train)
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best R2 score : ", gridsearch.best_score_)

Best hyperparameters :  {'alpha': 0.000538}
Best R2 score :  0.7994220579111561


In [11]:
model=Lasso(alpha=0.00538)
model.fit(X_train,Y_train)
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

In [12]:
print("R2 score on training set : ", model.score(X_train, Y_train))
print("R2 score on test set : ", model.score(X_test, Y_test))
print("MAPE score on training set : ", mean_absolute_percentage_error(Y_train, Y_train_pred))
print("MAPE score on test set : ", mean_absolute_percentage_error(Y_test, Y_test_pred))

R2 score on training set :  0.7976407594109435
R2 score on test set :  0.8044082000293653
MAPE score on training set :  0.35182544103407776
MAPE score on test set :  0.3422940041185154


In [13]:
#column_names = []
#for name, pipeline, features_list in preprocessor.transformers_:
#    if name == 'num':
#        features = features_list
#    elif name == 'cat':
#        features = pipeline.named_steps['encoder'].get_feature_names_out()
#    else:
#        features = pipeline.named_steps['ordi_encoder'].get_feature_names_out()
#    column_names.extend(features)


In [14]:
#coefs = pd.DataFrame(index = column_names, data = model.coef_.transpose(), columns=["coefficients"])
#coefs

In [15]:
#feature_importance = abs(coefs).sort_values(by = 'coefficients')
#feature_importance

In [26]:
classifier = RandomForestClassifier(n_jobs=4)

params = {
    'max_depth': [9, 10, 11, 12, 13],
    'min_samples_leaf': [1, 2, 5],
    'min_samples_split': [1, 2, 3],
    'n_estimators': [90, 100, 110, 120, 130, 140]
}
model2 = GridSearchCV(classifier, param_grid = params, cv = 3)
model2.fit(X_train, Y_train)

print("Best hyperparameters : ", model2.best_params_)
print("Best validation accuracy : ", model2.best_score_)

270 fits failed out of a total of 810.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/Users/lamothemarjory/opt/anaconda3/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 436, in _process_worker
    r = call_item()
  File "/Users/lamothemarjory/opt/anaconda3/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 288, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "/Users/lamothemarjory/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 595, in __call__
    return self.func(*args, **kwargs)
  File "/

Best hyperparameters :  {'max_depth': 13, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 100}
Best validation accuracy :  0.6055403823644167


In [27]:
Y_train_pred2 = model2.predict(X_train)
Y_train_proba2 = model2.predict_proba(X_train)
Y_test_pred2 = model2.predict(X_test)
Y_test_proba2 = model2.predict_proba(X_test)

In [28]:
print("accuracy on training set : ", accuracy_score(Y_train, Y_train_pred2))
print("accuracy on test set : ", accuracy_score(Y_test, Y_test_pred2))
print()

print("f1-score on training set : ", f1_score(Y_train, Y_train_pred2, average='weighted'))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred2, average='weighted'))
print()

accuracy on training set :  0.94914813369749
accuracy on test set :  0.6068642745709828

f1-score on training set :  0.9487866555268218
f1-score on test set :  0.5807057713339199

