### Import des Bibliothéques
---

In [None]:
import pandas as pd
import numpy as np
import pickle
import os
import boto3
import pickle # moyen le plus simple et direct pour sauvegarder et recharger un modèle Python entier.
import statsmodels.api as sm

from dotenv import load_dotenv
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error



### Chargement fichier df_reduced.csv du S3 
---

In [80]:
load_dotenv("secrets.env")

AWS_KEY = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET = os.getenv("AWS_SECRET_ACCESS_KEY")
BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
NEON_CONN = os.getenv("NEON_CONN")

s3 = boto3.client(
    "s3",
    aws_access_key_id=AWS_KEY,
    aws_secret_access_key=AWS_SECRET
)

# chargement fichier du S3
s3.download_file(BUCKET_NAME, "df_reduced.csv", "df_reducedS3.csv")

# Lecture des fichiers 
df_reducedS3 = pd.read_csv("df_reducedS3.csv")

display(df_reducedS3.head())



Unnamed: 0,Département,année,densité,pop_globale,tranche_age_0-24,tranche_age_25-59,tranche_age_60+,csp1_agriculteurs,csp2_artisans_commerçant_chef_entreprises,csp3_cadres_professions_intellectuelles,...,nb_salaries_secteur_industrie,nb_salaries_secteur_service,nbre_entreprises_agricole,nbre_entreprises_industrie,nbre_entreprises_service,Total_autres_dechets,Déblais_gravats,Déchets_verts,Encombrants,Matériaux_recyclables
0,Ain,2009,101.8,588857,188068,281744,119045,4017,17226,37333,...,68343,141408,3092,6263,15605,4803,37593,45013,34729,25405
1,Aisne,2009,73.4,539547,170369,249910,119268,5250,11668,19234,...,44128,129961,3720,3913,11215,1180,24380,24539,31593,10542
2,Allier,2009,46.6,342559,86913,152570,103076,6019,9743,12045,...,31537,92422,3325,3011,8474,1867,14374,18612,11932,8564
3,Alpes-de-Haute-Provence,2009,23.0,159510,43500,70222,45788,2136,6704,7210,...,10803,44575,1473,2096,6415,803,11228,8280,9006,7327
4,Hautes-Alpes,2009,24.5,135883,37235,62716,35932,2214,5658,6070,...,8607,46547,1333,1869,5980,1048,8509,6121,7938,10344


### Modéle OLS
---

In [81]:

# --- Targets à prédire ---
targets = [
    'Total_autres_dechets',
    'Déblais_gravats',
    'Déchets_verts',
    'Encombrants',
    'Matériaux_recyclables'
]

# --- DataFrame pour stocker les résultats ---
df_results = pd.DataFrame(columns=['cible','R2_train','R2_test','MAE','RMSE'])

# --- Boucle sur chaque target ---
for target in targets:
    print(f"--- Traitement de {target} ---")
    
    X = df_reducedS3.drop(columns=[target])
    y = df_reducedS3[target]
    
    # Diviser train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Identifier colonnes numériques et catégorielles
    numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
    categorical_columns = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

    # --- ColumnTransformer pour standardiser et encoder ---
    preprocessor = ColumnTransformer(transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_columns)
    ])
    
    # Encoder train et test (converti en dense)
    X_train_enc = preprocessor.fit_transform(X_train)
    X_test_enc = preprocessor.transform(X_test)
    
    # Ajouter constante pour OLS
    X_train_enc_const = sm.add_constant(X_train_enc)
    X_test_enc_const = sm.add_constant(X_test_enc, has_constant='add')
    
    # --- Modèle OLS ---
    ols_model = sm.OLS(y_train, X_train_enc_const).fit()
    
    # Prédictions
    y_train_pred = ols_model.predict(X_train_enc_const)
    y_test_pred = ols_model.predict(X_test_enc_const)
    
    # Calcul des métriques
    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)
    mae = mean_absolute_error(y_test, y_test_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    mape = round(mean_absolute_percentage_error(y_test, y_test_pred),4) * 100  # en %
    
    # Stocker les résultats
    df_results = pd.concat([df_results, pd.DataFrame([{
        'cible': target,
        'R2_train': r2_train,
        'R2_test': r2_test,
        'MAE': mae,
        'RMSE': rmse,
        'MAPE': mape 
    }])], ignore_index=True)
    
    # Sauvegarder le modèle
    with open(f'model_ols_{target}.pkl', 'wb') as f:
        pickle.dump(ols_model, f)

df_results.head()

--- Traitement de Total_autres_dechets ---
--- Traitement de Déblais_gravats ---
--- Traitement de Déchets_verts ---


  df_results = pd.concat([df_results, pd.DataFrame([{


--- Traitement de Encombrants ---
--- Traitement de Matériaux_recyclables ---


Unnamed: 0,cible,R2_train,R2_test,MAE,RMSE,MAPE
0,Total_autres_dechets,0.852505,0.763521,2945.929513,4790.515765,49.37
1,Déblais_gravats,0.973181,0.974606,4248.225994,6408.328741,19.43
2,Déchets_verts,0.987394,0.933627,4522.437305,9102.058983,14.84
3,Encombrants,0.970889,0.901533,5023.831558,8634.605841,13.04
4,Matériaux_recyclables,0.982392,0.963132,2166.528217,2995.279314,13.58


In [82]:
ols_model.summary()


0,1,2,3
Dep. Variable:,Matériaux_recyclables,R-squared:,0.982
Model:,OLS,Adj. R-squared:,0.977
Method:,Least Squares,F-statistic:,199.8
Date:,"Thu, 09 Oct 2025",Prob (F-statistic):,1.64e-309
Time:,16:44:35,Log-Likelihood:,-4861.3
No. Observations:,537,AIC:,9959.0
Df Residuals:,419,BIC:,10460.0
Df Model:,117,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.225e+04,4726.586,6.823,0.000,2.3e+04,4.15e+04
x1,-177.8087,249.139,-0.714,0.476,-667.526,311.909
x2,1.072e+04,1.17e+04,0.919,0.359,-1.22e+04,3.36e+04
x3,-3539.8696,1616.995,-2.189,0.029,-6718.302,-361.437
x4,4596.5505,1.26e+04,0.365,0.715,-2.02e+04,2.94e+04
x5,-1.303e+04,1.05e+04,-1.236,0.217,-3.38e+04,7696.737
x6,5585.9104,6619.354,0.844,0.399,-7425.368,1.86e+04
x7,-1952.7729,1980.619,-0.986,0.325,-5845.960,1940.415
x8,-3228.4779,3158.757,-1.022,0.307,-9437.464,2980.508

0,1,2,3
Omnibus:,61.063,Durbin-Watson:,2.116
Prob(Omnibus):,0.0,Jarque-Bera (JB):,235.358
Skew:,0.441,Prob(JB):,7.81e-52
Kurtosis:,6.121,Cond. No.,5.72e+16
