In [1]:
import pandas as pd

In [2]:
def aggregate_data_by_dtype(agg_dict, data: pd.DataFrame = None, freq='1D', flatten=True) -> pd.DataFrame:
    """
    Aggregate a time-indexed DataFrame at a given frequency, choosing the
    aggregation functions applied to each column from the column's dtype.

    Parameters:
    - agg_dict (dict): Maps a dtype selector understood by
      ``DataFrame.select_dtypes`` (e.g. 'number', 'category') to a list of
      aggregations. Each aggregation is either the name of a Series method
      called without arguments (e.g. 'mean') or a callable that takes a Series
      and returns a scalar or a Series. When a column matches several
      selectors, the entry that comes last in ``agg_dict`` wins.
    - data (pd.DataFrame): The frame to aggregate. It must have a
      ``pd.DatetimeIndex``. Required, even though the signature defaults to None.
    - freq (str): Resampling frequency forwarded to ``pd.Grouper``
      (e.g. '1D', '2YE').
    - flatten (bool): When True, column labels are joined with '__' into plain
      strings such as 'col__mean'; when False the column MultiIndex is kept.

    Returns:
    - pd.DataFrame: One row per period. Each aggregation contributes a column
      labelled (column, aggregation); a callable returning a Series contributes
      one column per returned label, named '<callable name>__<label>'.

    Raises:
    - ValueError: If ``data`` is None, if its index is not a DatetimeIndex, or
      if no column matches any dtype selector of ``agg_dict``.
    """

    # The previous fallback read `self.data`, which cannot exist in a
    # module-level function; fail with an explicit message instead.
    if data is None:
        raise ValueError("Le paramètre 'data' doit être fourni.")

    # Grouping by frequency only makes sense on a datetime index.
    if not isinstance(data.index, pd.DatetimeIndex):
        raise ValueError("Le DataFrame doit avoir un index de type DateTime.")

    # Resolve the dtype-level specification into a per-column one.
    column_agg_dict = {}
    for dtype, funcs in agg_dict.items():
        for col in data.select_dtypes(include=[dtype]).columns:
            column_agg_dict[col] = funcs

    # Without any matching column there is nothing to concatenate later on.
    if not column_agg_dict:
        raise ValueError("Aucune colonne ne correspond aux types de agg_dict.")

    def unified_apply_func(series, funcs):
        """Apply every aggregation of `funcs` to `series`; return a labelled Series."""
        results = {}
        for func in funcs:
            if callable(func):
                # Callables such as functools.partial objects carry no __name__.
                func_name = getattr(func, "__name__", type(func).__name__)
                result = func(series)
                if isinstance(result, pd.Series):
                    # Prefix each returned label so it stays unique in the output.
                    for key, value in result.items():
                        results[f"{func_name}__{key}"] = value
                else:
                    results[func_name] = result
            else:
                # A string names a Series method, called without arguments.
                results[func] = getattr(series, func)()
        return pd.Series(results)

    def apply_aggregations(group):
        """Aggregate every configured column of one group into a single Series."""
        aggregated = {
            col: unified_apply_func(group[col], funcs)
            for col, funcs in column_agg_dict.items()
        }
        # Concatenate once, after all columns are processed.
        return pd.concat(aggregated, axis=0)

    grouped_data = data.groupby(pd.Grouper(freq=freq))
    aggregated_df = grouped_data.apply(apply_aggregations)

    # groupby.apply may hand back a long Series instead of a frame (presumably
    # when the groups do not all produce identical labels); pivot the two inner
    # index levels (column, aggregation) into columns in that case.
    if not isinstance(aggregated_df, pd.DataFrame):
        aggregated_df = aggregated_df.to_frame().unstack(level=[1, 2]).droplevel(0, axis=1)

    if flatten:
        # str() keeps the join working when a column label is not a string.
        aggregated_df.columns = [
            '__'.join(map(str, col)).strip() for col in aggregated_df.columns.values
        ]

    return aggregated_df

In [3]:
# Load the dataset from a Feather file and display it.
# NOTE(review): the path is an absolute, machine-specific location; the cell
# only runs on a machine where this exact file exists.
df = pd.read_feather(
    '/home/maxime/Documents/WORKSPACES/forecasting_models/data.feather'
)
df

Unnamed: 0_level_0,O3_FR26005,O3_FR26010,PM10_FR26005,PM10_FR26010,PM10_FR26014,PM10_FR26094,PM25_FR26005,PM25_FR26094,NO2_FR26005,NO2_FR26010,...,nb_accidents_J-2,nb_accidents_J-3,nb_accidents_J-4,nb_accidents_J-5,nb_accidents_J-6,nb_accidents_J-7,nb_accidents_mean_7J,nb_accidents_std_7J,nb_accidents_mean_14J,nb_accidents_std_14J
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01,36.0,37.0,12.0,15.0,12.0,19.0,6.6,6.8,16.0,6.9,...,,,,,,,,,,
2019-01-02,62.0,69.0,15.0,15.0,10.0,19.0,4.6,6.8,11.0,6.9,...,,,,,,,,,,
2019-01-03,51.0,63.0,20.0,15.0,16.0,19.0,7.7,6.8,15.0,6.9,...,0.0,,,,,,,,,
2019-01-04,42.0,56.0,19.0,15.0,16.0,19.0,12.0,6.8,20.0,6.9,...,0.0,0.0,,,,,,,,
2019-01-05,40.0,44.0,17.0,15.0,15.0,19.0,11.0,6.8,14.0,6.9,...,0.0,0.0,0.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-27,22.0,29.0,21.0,15.0,19.0,18.0,17.0,8.0,20.0,6.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-12-28,41.0,42.0,14.0,15.0,14.0,15.0,9.5,12.0,17.0,6.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-12-29,48.0,49.0,12.0,15.0,12.0,11.0,8.0,7.6,10.0,6.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-12-30,36.0,31.0,15.0,15.0,14.0,13.0,8.8,11.0,13.0,6.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
def relative_frequency(series: pd.Series) -> pd.Series:
    """
    Return the share of each distinct value of a Series, as a percentage.

    Parameters:
    - series (pd.Series): The Series whose values are counted.

    Returns:
    - pd.Series: Indexed by distinct value; each entry is that value's
      proportion of the counted values multiplied by 100.
    """
    # normalize=True yields proportions in [0, 1]; scale them to percentages.
    proportions = series.value_counts(normalize=True)
    return proportions * 100

In [5]:
# Aggregate with a '2YE' frequency: summary statistics for numeric columns and
# per-category percentages for categorical columns, with flattened column labels.
aggregate_data_by_dtype(
    agg_dict={
        'number': ['mean', 'min', 'max', 'std'],
        'category': [relative_frequency],
    },
    data=df,
    freq='2YE',
    flatten=True,
)

Unnamed: 0_level_0,O3_FR26005__mean,O3_FR26005__min,O3_FR26005__max,O3_FR26005__std,O3_FR26010__mean,O3_FR26010__min,O3_FR26010__max,O3_FR26010__std,PM10_FR26005__mean,PM10_FR26005__min,...,ramadan_J-3__relative_frequency__0,ramadan_J-3__relative_frequency__1,ramadan_J-4__relative_frequency__0,ramadan_J-4__relative_frequency__1,ramadan_J-5__relative_frequency__0,ramadan_J-5__relative_frequency__1,ramadan_J-6__relative_frequency__0,ramadan_J-6__relative_frequency__1,ramadan_J-7__relative_frequency__0,ramadan_J-7__relative_frequency__1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-31,59.767123,4.0,122.0,24.778536,63.624658,4.0,131.0,23.212917,16.290411,0.0,...,91.712707,8.287293,91.689751,8.310249,91.666667,8.333333,91.643454,8.356546,91.620112,8.379888
2021-12-31,54.220246,0.0,115.0,23.470983,60.153215,5.0,125.0,23.803443,15.742134,0.0,...,91.792066,8.207934,91.792066,8.207934,91.792066,8.207934,91.792066,8.207934,91.792066,8.207934
2023-12-31,58.317123,6.0,125.0,22.910396,64.056164,7.0,140.0,23.607226,14.935616,0.0,...,91.780822,8.219178,91.780822,8.219178,91.780822,8.219178,91.780822,8.219178,91.780822,8.219178


# Pistes

Utiliser la méthode `resample` du DataFrame quand c'est possible (je crois qu'on ne pouvait pas utiliser de fonction d'agrégation personnalisée, mais ce n'est pas sûr — à vérifier).

Appliquer les fonctions natives pandas sur toute la colonne et non pas sur chaque groupe

Modifier `relative_frequency` pour qu'elle puisse fonctionner avec des groupes.