##### Importation des librairies

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
from config import Config, Constant

In [2]:
df=pd.read_csv(str(Config.DATASET_DIR) + '/' + str(Config.DATA_1DEK_EXTR))

In [3]:
df.head()

Unnamed: 0,Station,Year,Month,Decade,v_wind_975,u_wind_700,u_wind_100,eau_precipitable,t_point_rosee,h_vol_sol_wat,anom_lef_dek,anom_nino_dek,Date,Label Secheresse,Saison_Pluie
0,Bobo_Dioulasso,1961.0,1.0,1.0,-2.477929,-6.607575,16.585018,7.668101e-08,274.869892,0.177988,0.366058,-0.024109,1961-01-01,0.0,False
1,Bogande,1961.0,1.0,1.0,-2.902994,-3.065768,17.778388,6.194008e-08,269.371269,0.080786,0.366058,-0.024109,,,
2,Boromo,1961.0,1.0,1.0,-3.516664,-5.668901,16.643275,3.728255e-08,274.435394,0.168747,0.366058,-0.024109,1961-01-01,0.0,False
3,Dedougou,1961.0,1.0,1.0,-3.342931,-4.970608,18.23231,8.872772e-09,274.004411,0.124276,0.366058,-0.024109,1961-01-01,0.0,False
4,Dori,1961.0,1.0,1.0,-3.13265,-1.264646,19.343432,-5.116298e-08,267.627423,0.18111,0.366058,-0.024109,1961-01-01,0.0,False


#### Creation des fonctions utiles

In [4]:
# Définition de la fonction to_numeric_with_nan
def to_numeric_with_nan(value):
    try:
        return int(value)
    except (ValueError, TypeError):
        return pd.NA  # Retourne une valeur manquante

In [5]:
def label_change(value):
    if pd.isna(value):
        return pd.NA
    elif value == 0.0:
        return '0'
    elif value == 1.0:
        return '1'
    else:
        return value  # Si la valeur est différente de NaN, 0.0 et 1.0, la renvoyer telle quelle

In [6]:
def load_data_dek(file_path):
    df = pd.read_csv(file_path)    
    # Supprimer les lignes contenant des valeurs NaN
    df.dropna(axis=0, inplace=True)   
    # Convertir les colonnes Year, Month et Decade en numérique (si nécessaire)
    df[['Year', 'Month', 'Decade']] = df[['Year', 'Month', 'Decade']].applymap(to_numeric_with_nan)    
    # Appliquer la fonction label_change à la colonne 'Label Secheresse'
    df['Label Secheresse'] = df['Label Secheresse'].apply(label_change)  
    # Encodage des valeurs qualitatives
    ordinal_columns = ['Station', 'Saison_Pluie']  # Liste des colonnes catégorielles ordinales
    encoder = LabelEncoder()    
    for col in ordinal_columns:
        df[col] = encoder.fit_transform(df[col])   
    # Décaler la sécheresse de deux mois en avant pour la prédiction
    df['Secheresse_future'] = df['Label Secheresse'].shift(2)
    df.dropna(axis=0, inplace=True)  
    return df


In [7]:
# Prétraitement des données et entraînement du modèle
def train_and_evaluate_model(X_train, X_test, y_train, y_test, model, param_grid):
    # Prétraitement des données
    imputer = SimpleImputer(strategy='mean')
    scaler = StandardScaler()

    # Créer le sélecteur de caractéristiques basé sur l'importance des fonctionnalités
    feature_selector = SelectFromModel(model, threshold='median')

    # Créer le pipeline de prétraitement, de sélection de caractéristiques et de modèle
    pipeline = Pipeline([
        ('imputer', imputer),
        ('scaler', scaler),
        ('feature_selector', feature_selector),
        ('model', model)
    ])

    # Créer le modèle GridSearchCV
    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

    # Entraîner le modèle GridSearchCV
    grid_search.fit(X_train, y_train)

    # Obtenir les meilleures valeurs d'hyperparamètres
    best_params = grid_search.best_params_

    # Obtenir le modèle avec les meilleurs hyperparamètres
    best_model = grid_search.best_estimator_

    # Prédire la sécheresse deux mois à l'avance sur l'ensemble de test
    y_pred = best_model.predict(X_test)

    # Évaluer le modèle
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return best_params, best_model, accuracy, report

In [8]:
features=['Station','v_wind_975','u_wind_700','u_wind_100','eau_precipitable','t_point_rosee','h_vol_sol_wat','anom_lef_dek','anom_nino_dek','Saison_Pluie']

In [9]:
# Définir les modèles et leurs grilles d'hyperparamètres respectives
models = [
    {
        'name': 'RandomForest',
        'model': RandomForestClassifier(random_state=42),
        'param_grid': {
            'model__n_estimators': [50, 100, 150],
            'model__max_depth': [None, 10, 20, 30]
        }
    },
    {
        'name': 'SVM',
        'model': SVC(random_state=42),
        'param_grid': {
            'model__C': [0.1, 1, 10],
            'model__kernel': ['linear', 'rbf','poly'],
            'model__gamma': [0.1, 1, 'scale','auto']
        }
    }
    # Ajoutez d'autres modèles avec leurs paramètres ici
]

### ENTRAINEMENT SUR LES DONNNEES DECADAIRES

In [10]:

data_ext_1dek=load_data_dek(str(Config.DATASET_DIR) + '/' + str(Config.DATA_1DEK_EXTR))

In [11]:
data_ext_1dek.head()

Unnamed: 0,Station,Year,Month,Decade,v_wind_975,u_wind_700,u_wind_100,eau_precipitable,t_point_rosee,h_vol_sol_wat,anom_lef_dek,anom_nino_dek,Date,Label Secheresse,Saison_Pluie,Secheresse_future
3,3,1961,1,1,-3.342931,-4.970608,18.23231,8.872772e-09,274.004411,0.124276,0.366058,-0.024109,1961-01-01,0,0,0
4,4,1961,1,1,-3.13265,-1.264646,19.343432,-5.116298e-08,267.627423,0.18111,0.366058,-0.024109,1961-01-01,0,0,0
5,5,1961,1,1,-2.70915,-4.116544,16.559061,-2.382527e-08,270.621893,0.117374,0.366058,-0.024109,1961-01-01,0,0,0
6,6,1961,1,1,-2.203496,-7.391964,14.800494,4.13028e-08,275.489078,0.138957,0.366058,-0.024109,1961-01-01,0,0,0
7,7,1961,1,1,-2.171745,-4.48842,17.522954,-1.847897e-09,273.942325,0.241372,0.366058,-0.024109,1961-01-01,0,0,0


In [12]:
X_train, X_test, y_train, y_test = train_test_split(data_ext_1dek[features], data_ext_1dek['Secheresse_future'], test_size=0.2, random_state=42)


In [13]:
X_train.head()

Unnamed: 0,Station,v_wind_975,u_wind_700,u_wind_100,eau_precipitable,t_point_rosee,h_vol_sol_wat,anom_lef_dek,anom_nino_dek,Saison_Pluie
3797,7,1.503845,-9.608794,-17.582774,-1.109025e-06,294.475428,0.423791,-1.445404,-0.477783,1
4958,8,1.499332,-10.466419,-0.824979,9.354066e-07,293.313949,0.086705,-1.062836,-0.805603,1
642,2,1.111487,-10.970848,-2.461172,-2.91842e-07,295.336061,0.274671,-0.487091,-0.340759,1
12468,8,1.077647,-10.92085,-10.579602,7.989721e-08,295.705655,0.184121,-1.382111,-0.6828,1
5292,2,1.801318,-8.608666,-11.800067,8.260558e-07,295.35209,0.377239,-1.952087,-1.254944,1


In [14]:
for model_info in models:
    print(f'--- {model_info["name"]} ---')
    best_params, best_model, accuracy, report = train_and_evaluate_model(X_train, X_test, y_train, y_test, model_info['model'], model_info['param_grid'])
    print(f'Best Hyperparameters: {best_params}')
    print(f'Accuracy: {accuracy}')
    print(report)

--- RandomForest ---
Best Hyperparameters: {'model__max_depth': None, 'model__n_estimators': 150}
Accuracy: 0.9727414330218068
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      3748
           1       0.00      0.00      0.00       104

    accuracy                           0.97      3852
   macro avg       0.49      0.50      0.49      3852
weighted avg       0.95      0.97      0.96      3852

--- SVM ---


ValueError: Invalid parameter 'C' for estimator Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('feature_selector',
                 SelectFromModel(estimator=SVC(random_state=42),
                                 threshold='median')),
                ('model', SVC(random_state=42))]). Valid parameters are: ['memory', 'steps', 'verbose'].