Ce notebook vise à mettre en commun tous les dataframes créés précédemment afin de constituer le jeu de données qui servira à entraîner nos modèles.

In [1]:
import pandas as pd
from config import Config
import os 

In [2]:
def to_numeric_with_nan(value):
    """Convert *value* to ``int``, falling back to ``pd.NA`` when impossible.

    Floats are truncated toward zero by ``int()`` (``1961.0`` -> ``1961``).

    Parameters
    ----------
    value : object
        Scalar to convert (number, numeric string, ``None``, NaN, ...).

    Returns
    -------
    int or pandas.NA
        The integer value, or ``pd.NA`` when ``int(value)`` fails.
    """
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: NaN floats and non-numeric strings such as '--'.
        # TypeError: None and other objects int() does not accept.
        # OverflowError: +/- infinity.
        return pd.NA

In [3]:
def label_change(value):
    """Normalise a drought label.

    Missing values become ``pd.NA``, values equal to 0 or 1 become the
    strings ``'0'`` and ``'1'``, and anything else is returned unchanged.
    """
    if pd.isna(value):
        return pd.NA
    for number, text in ((0.0, '0'), (1.0, '1')):
        if value == number:
            return text
    # Neither missing nor equal to 0/1: hand the value back untouched.
    return value

#### Fonction de calcul de la moyenne glissante

In [4]:
def calculate_rolling_average(df_data, num_months, columns_to_average=None):
    """Return a sorted copy of *df_data* with per-station rolling means.

    The copy is sorted by ``Station``, ``Year`` and ``Month``. Each selected
    column is then replaced by the mean of the current row and the
    ``num_months - 1`` preceding rows of the same station, so the first
    ``num_months - 1`` rows of every station become NaN.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Must contain 'Station', 'Year', 'Month' and the columns to average.
        It is not modified.
    num_months : int
        Rolling window size, in rows. The window counts rows, not calendar
        months: gaps in the monthly series are not detected.
    columns_to_average : list of str, optional
        Columns to smooth. Defaults to the monthly predictor columns used in
        this notebook.

    Returns
    -------
    pandas.DataFrame
        The sorted copy with the selected columns replaced by rolling means.
    """
    if columns_to_average is None:
        columns_to_average = [
            'v_wind_925', 'u_wind_850', 'u_wind_700', 'u_wind_200',
            'eau_precipitable', 't_point_rosee', 'h_vol_sol_wat',
            'anom_lef_mois', 'anom_nino_mois',
        ]
    else:
        columns_to_average = list(columns_to_average)

    df = df_data.copy()
    df = df.sort_values(['Station', 'Year', 'Month'])
    if not columns_to_average:
        return df

    rolling_average = (
        df.groupby('Station')[columns_to_average]
        .rolling(window=num_months)
        .mean()
        # groupby().rolling() prepends the group key as the first index level.
        # Dropping that level restores the original row labels whatever the
        # index is called, so the assignment below aligns row by row.
        .reset_index(level=0, drop=True)
    )
    for col in columns_to_average:
        df[col] = rolling_average[col]
    return df

#### Création du dossier de sauvegarde des dataframes

In [5]:
# Make sure the output directory for the assembled datasets exists.
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of os.path.exists() followed by os.makedirs(). If the path exists but
# is not a directory, this now fails here instead of at the first to_csv().
dataset_dir = str(Config.DATASET_DIR)
os.makedirs(dataset_dir, exist_ok=True)

##### Lecture de tous les fichiers

#### VARIABLES

##### DECADAIRES

In [6]:
# Read the dekadal predictor variables from the processed-files folder.
variables_dek_csv = os.path.join(Config.FILES_TRAITED_PATH, Config.ERA_FUSION_DEK)
variables_dek = pd.read_csv(variables_dek_csv)

In [7]:
# Preview the first five rows of the dekadal predictors.
variables_dek.head(n=5)

Unnamed: 0,Station,Year,Month,Decade,v_wind_925,u_wind_850,u_wind_700,u_wind_200,eau_precipitable,t_point_rosee,h_vol_sol_wat,anom_lef_dek,anom_nino_dek
0,Bobo_Dioulasso,1961,1,1,-5.685185,-4.901044,-6.50731,24.359285,5.872143e-09,274.505384,0.177588,0.365944,-0.024139
1,Bogande,1961,1,1,-4.457056,-5.318282,-3.077982,26.031411,7.469295e-08,268.932499,0.080787,0.365944,-0.024139
2,Boromo,1961,1,1,-5.117354,-4.519135,-5.647762,24.738232,2.742668e-08,274.025898,0.168335,0.365944,-0.024139
3,Dedougou,1961,1,1,-4.927709,-4.962412,-4.930288,26.618781,4.440598e-08,273.528833,0.123838,0.365944,-0.024139
4,Dori,1961,1,1,-3.655243,-6.004024,-1.298756,28.527198,-7.225495e-08,267.173088,0.18111,0.365944,-0.024139


##### MENSUELLE

In [8]:
# Read the monthly predictor variables from the processed-files folder.
variables_mon_csv = os.path.join(Config.FILES_TRAITED_PATH, Config.ERA_FUSION_MON)
variables_mon = pd.read_csv(variables_mon_csv)

In [9]:
# Preview the first five rows of the monthly predictors.
variables_mon.head(n=5)

Unnamed: 0,Station,Year,Month,v_wind_925,u_wind_850,u_wind_700,u_wind_200,eau_precipitable,t_point_rosee,h_vol_sol_wat,anom_lef_mois,anom_nino_mois
0,Bobo_Dioulasso,1961,1,-6.344612,-2.220861,-1.699389,28.423512,-2.447008e-07,274.258404,0.178051,0.432493,0.100473
1,Bogande,1961,1,-5.767704,-2.073078,1.304748,30.424754,-6.035625e-08,269.08543,0.081025,0.432493,0.100473
2,Boromo,1961,1,-6.010698,-1.756913,-0.93406,29.054192,-9.627294e-08,273.843337,0.166356,0.432493,0.100473
3,Dedougou,1961,1,-5.853977,-2.094841,-0.325521,30.879742,-2.844164e-07,273.860768,0.122703,0.432493,0.100473
4,Dori,1961,1,-5.160844,-2.311904,2.618656,32.211017,3.34537e-07,268.276131,0.183776,0.432493,0.100473


# Alias, not a copy: variables_3mon is bound to the same DataFrame object as
# variables_mon, so an in-place change through one name is visible through the other.
# NOTE(review): variables_3mon is not used in the cells visible here - confirm it is still needed.
variables_3mon=variables_mon

### B- DATAFRAMES DES SECHERESSES

#### B-I SPEI DÉCADE

#### Sévère à extrême

In [10]:
# Read the dekadal drought-label table from the processed-files folder.
sech_ext_sev_1dek_csv = os.path.join(Config.FILES_TRAITED_PATH, Config.DATA_1DEK)
sech_ext_sev_1dek = pd.read_csv(sech_ext_sev_1dek_csv)

In [11]:
# Preview the first five rows of the dekadal drought labels.
sech_ext_sev_1dek.head(n=5)

Unnamed: 0,Date,Station,Label Secheresse,Year,Month,Decade,Saison_Pluie
0,1961-01-01,Bobo_Dioulasso,0.0,1961.0,1.0,1.0,False
1,1961-01-01,Bogande,,1961.0,1.0,1.0,False
2,1961-01-01,Boromo,1.0,1961.0,1.0,1.0,False
3,1961-01-01,Dedougou,0.0,1961.0,1.0,1.0,False
4,1961-01-01,Dori,0.0,1961.0,1.0,1.0,False


In [12]:
# Look for rows in which any column holds the '--' placeholder string.
invalid_rows = sech_ext_sev_1dek.isin(['--']).any(axis='columns')
rows_with_invalid_data = sech_ext_sev_1dek.loc[invalid_rows]
rows_with_invalid_data

Unnamed: 0,Date,Station,Label Secheresse,Year,Month,Decade,Saison_Pluie


In [13]:
# Cast the calendar keys to integers (pd.NA where conversion fails) so they
# match the integer keys of the predictor table at merge time.
# Series.map is applied column by column because DataFrame.applymap is
# deprecated in recent pandas releases; the element-wise result is the same.
sech_ext_sev_1dek[['Year', 'Month', 'Decade']] = sech_ext_sev_1dek[['Year', 'Month', 'Decade']].apply(
    lambda column: column.map(to_numeric_with_nan)
)
# Normalise the drought labels (0.0/1.0 -> '0'/'1', missing -> pd.NA).
sech_ext_sev_1dek['Label Secheresse'] = sech_ext_sev_1dek['Label Secheresse'].apply(label_change)
sech_ext_sev_1dek.head()



Unnamed: 0,Date,Station,Label Secheresse,Year,Month,Decade,Saison_Pluie
0,1961-01-01,Bobo_Dioulasso,0.0,1961,1,1,False
1,1961-01-01,Bogande,,1961,1,1,False
2,1961-01-01,Boromo,1.0,1961,1,1,False
3,1961-01-01,Dedougou,0.0,1961,1,1,False
4,1961-01-01,Dori,0.0,1961,1,1,False


##### Fusionner les dataframes des variables décadaires et des labels de sécheresse décadaires

In [14]:
# Outer-join the dekadal predictors with the dekadal drought labels so that
# rows present in only one of the two tables are kept (NaN on the other side).
data_ext_sev_1dek = variables_dek.merge(
    sech_ext_sev_1dek,
    on=['Station', 'Year', 'Month', 'Decade'],
    how='outer',
)
# No preview here: a notebook cell only displays its last expression, so a
# .head() placed before the save would be computed and discarded.
# Save the assembled dataset.
data_ext_sev_1dek.to_csv(os.path.join(Config.DATASET_DIR, Config.DATA_1DEK), index=False)

##### B- II SPEI 1MON

##### Sévère à extrême

In [15]:
# Read the 1-month drought-label table. The path is built with os.path.join,
# like the other reads in this notebook, instead of manual '/' concatenation.
sech_ext_sev_1mon = pd.read_csv(os.path.join(Config.FILES_TRAITED_PATH, Config.DATA_1MON))

In [16]:
# Cast the calendar keys to integers (pd.NA where conversion fails) so they
# match the integer keys of the predictor table at merge time.
# Series.map is applied column by column because DataFrame.applymap is
# deprecated in recent pandas releases; the element-wise result is the same.
sech_ext_sev_1mon[['Year', 'Month']] = sech_ext_sev_1mon[['Year', 'Month']].apply(
    lambda column: column.map(to_numeric_with_nan)
)
# Normalise the drought labels (0.0/1.0 -> '0'/'1', missing -> pd.NA).
sech_ext_sev_1mon['Label Secheresse'] = sech_ext_sev_1mon['Label Secheresse'].apply(label_change)
sech_ext_sev_1mon.head()



Unnamed: 0,Station,Date,Label Secheresse,Saison_Pluie,Year,Month
0,Bobo_Dioulasso,1961-01-01,0.0,False,1961,1
1,Bogande,1961-01-01,,False,1961,1
2,Boromo,1961-01-01,1.0,False,1961,1
3,Dedougou,1961-01-01,0.0,False,1961,1
4,Dori,1961-01-01,1.0,False,1961,1


##### Fusionner les dataframes des variables mensuelles et des labels de sécheresse mensuels

In [17]:
# Outer-join the monthly predictors with the 1-month drought labels so that
# rows present in only one of the two tables are kept (NaN on the other side).
data_ext_sev_1mon = variables_mon.merge(
    sech_ext_sev_1mon,
    on=['Station', 'Year', 'Month'],
    how='outer',
)
# No preview here: a notebook cell only displays its last expression, so a
# .head() placed before the save would be computed and discarded.
# Save the assembled dataset.
data_ext_sev_1mon.to_csv(os.path.join(Config.DATASET_DIR, Config.DATA_1MON), index=False)

##### B-III SPEI 3MON

##### Sévère à extrême

In [18]:
# Read the 3-month drought-label table from the processed-files folder.
sech_ext_sev_3mon_csv = os.path.join(Config.FILES_TRAITED_PATH, Config.DATA_3MON)
sech_ext_sev_3mon = pd.read_csv(sech_ext_sev_3mon_csv)


In [19]:
# Cast the calendar keys to integers (pd.NA where conversion fails) so they
# match the integer keys of the predictor table at merge time.
# Series.map is applied column by column because DataFrame.applymap is
# deprecated in recent pandas releases; the element-wise result is the same.
sech_ext_sev_3mon[['Year', 'Month']] = sech_ext_sev_3mon[['Year', 'Month']].apply(
    lambda column: column.map(to_numeric_with_nan)
)
# Normalise the drought labels (0.0/1.0 -> '0'/'1', missing -> pd.NA).
sech_ext_sev_3mon['Label Secheresse'] = sech_ext_sev_3mon['Label Secheresse'].apply(label_change)
sech_ext_sev_3mon.head()



Unnamed: 0,Station,Date,Label Secheresse,Saison_Pluie,Year,Month
0,Bobo_Dioulasso,1961-01-01,,False,1961,1
1,Bogande,1961-01-01,,False,1961,1
2,Boromo,1961-01-01,,False,1961,1
3,Dedougou,1961-01-01,,False,1961,1
4,Dori,1961-01-01,,False,1961,1


##### Fusionner les dataframes des variables mensuelles et des labels de sécheresse SPEI 3 mois (variables lissées par moyenne glissante sur 3 mois)

In [20]:
# Display the 3-month drought-label table after the key and label conversion.
sech_ext_sev_3mon

Unnamed: 0,Station,Date,Label Secheresse,Saison_Pluie,Year,Month
0,Bobo_Dioulasso,1961-01-01,,False,1961,1
1,Bogande,1961-01-01,,False,1961,1
2,Boromo,1961-01-01,,False,1961,1
3,Dedougou,1961-01-01,,False,1961,1
4,Dori,1961-01-01,,False,1961,1
...,...,...,...,...,...,...
7555,Fada_Ngourma,2023-12-01,1,False,2023,12
7556,Gaoua,2023-12-01,0,False,2023,12
7557,Ouagadougou_aero,2023-12-01,0,False,2023,12
7558,Ouahigouya,2023-12-01,1,False,2023,12


In [21]:
# Outer-join the monthly predictors with the 3-month drought labels; rows
# found in only one table are kept.
data_ext_sev_3mon = pd.merge(
    variables_mon,
    sech_ext_sev_3mon,
    how='outer',
    on=['Station', 'Year', 'Month'],
)
# Replace every predictor by its 3-row rolling mean within each station.
data_ext_sev_3mon = calculate_rolling_average(data_ext_sev_3mon, 3)
# Save the assembled dataset.
data_ext_sev_3mon.to_csv(os.path.join(Config.DATASET_DIR, Config.DATA_3MON), index=False)

In [22]:
# Look for rows in which any column holds the '--' placeholder string.
invalid_rows = data_ext_sev_3mon.isin(['--']).any(axis='columns')
rows_with_invalid_data = data_ext_sev_3mon.loc[invalid_rows]
rows_with_invalid_data

Unnamed: 0,Station,Year,Month,v_wind_925,u_wind_850,u_wind_700,u_wind_200,eau_precipitable,t_point_rosee,h_vol_sol_wat,anom_lef_mois,anom_nino_mois,Date,Label Secheresse,Saison_Pluie


In [23]:
# Spot-check one station: with a 3-row window, the first two rows of each
# station have no complete window, so their predictors are NaN.
is_dori = data_ext_sev_3mon['Station'] == 'Dori'
data_ext_sev_3mon.loc[is_dori].head()

Unnamed: 0,Station,Year,Month,v_wind_925,u_wind_850,u_wind_700,u_wind_200,eau_precipitable,t_point_rosee,h_vol_sol_wat,anom_lef_mois,anom_nino_mois,Date,Label Secheresse,Saison_Pluie
3052,Dori,1961,1,,,,,,,,,,1961-01-01,,False
3053,Dori,1961,2,,,,,,,,,,1961-02-01,,False
3054,Dori,1961,3,-3.836905,-4.590344,2.199147,27.445839,1.278016e-07,268.216494,0.178086,1.019275,0.38921,1961-03-01,0.0,True
3055,Dori,1961,4,-2.449641,-4.389103,0.71931,23.365487,-2.621768e-08,271.213495,0.180016,1.437306,0.698621,1961-04-01,0.0,True
3056,Dori,1961,5,-0.436866,-2.219786,-1.743214,18.8063,1.666946e-07,278.273636,0.187888,1.48995,0.797182,1961-05-01,0.0,True


##### B-IV SPEI 6MON

##### B-IV-1 Sévère à extrême

In [24]:
# Read the 6-month drought-label table from the processed-files folder.
sech_ext_sev_6mon_csv = os.path.join(Config.FILES_TRAITED_PATH, Config.DATA_6MON)
sech_ext_sev_6mon = pd.read_csv(sech_ext_sev_6mon_csv)

In [25]:
# Cast the calendar keys to integers (pd.NA where conversion fails) so they
# match the integer keys of the predictor table at merge time.
# Series.map is applied column by column because DataFrame.applymap is
# deprecated in recent pandas releases; the element-wise result is the same.
sech_ext_sev_6mon[['Year', 'Month']] = sech_ext_sev_6mon[['Year', 'Month']].apply(
    lambda column: column.map(to_numeric_with_nan)
)
# Normalise the drought labels (0.0/1.0 -> '0'/'1', missing -> pd.NA).
sech_ext_sev_6mon['Label Secheresse'] = sech_ext_sev_6mon['Label Secheresse'].apply(label_change)
sech_ext_sev_6mon.head()



Unnamed: 0,Station,Date,Label Secheresse,Saison_Pluie,Year,Month
0,Bobo_Dioulasso,1961-01-01,,False,1961,1
1,Bogande,1961-01-01,,False,1961,1
2,Boromo,1961-01-01,,False,1961,1
3,Dedougou,1961-01-01,,False,1961,1
4,Dori,1961-01-01,,False,1961,1


##### Fusionner les dataframes des variables mensuelles et des labels de sécheresse SPEI 6 mois (variables lissées par moyenne glissante sur 6 mois)

In [26]:
# Outer-join the monthly predictors with the 6-month drought labels; rows
# found in only one table are kept.
data_ext_sev_6mon = pd.merge(
    variables_mon,
    sech_ext_sev_6mon,
    how='outer',
    on=['Station', 'Year', 'Month'],
)
# Replace every predictor by its 6-row rolling mean within each station.
data_ext_sev_6mon = calculate_rolling_average(data_ext_sev_6mon, 6)
# Save the assembled dataset.
data_ext_sev_6mon.to_csv(os.path.join(Config.DATASET_DIR, Config.DATA_6MON), index=False)