In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Exemple de DataFrame : une seule catégorie par jour
data = {
    'date': ['2023-09-01', '2023-09-02', '2023-09-03', '2023-09-04', '2023-09-05'],
    'category': ['A', 'B', 'A', 'C', 'B']
}

df = pd.DataFrame(data)

# Conversion de la colonne 'date' en datetime
df['date'] = pd.to_datetime(df['date'])

# Mapping des catégories à des valeurs numériques (pour les afficher sur l'axe Y)
categories = df['category'].unique()
category_mapping = {category: idx for idx, category in enumerate(categories)}

# Remplacement des catégories par des valeurs numériques pour l'affichage
df['category_numeric'] = df['category'].map(category_mapping)

# Tracé du graphique avec les catégories en fonction des dates
plt.scatter(df['date'], df['category_numeric'], c=df['category_numeric'], cmap='Set1', s=100)

# Remplacer les valeurs numériques par les noms de catégories sur l'axe Y
plt.yticks(ticks=list(category_mapping.values()), labels=list(category_mapping.keys()))

plt.xlabel('Date')
plt.ylabel('Category')
plt.title('Affichage des catégories en fonction du temps')
plt.xticks(rotation=45)  # Rotation des dates pour une meilleure lisibilité
plt.grid(True)
plt.show()

In [29]:
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from src.models.loss import weighted_rmse_obj, weighted_rmse
from sklearn.model_selection import GridSearchCV


# Generate random data
np.random.seed(42)
X = pd.DataFrame(np.random.rand(100, 10))
y = pd.DataFrame(np.random.rand(100, 3))

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost regressor
xgb_model = XGBRegressor(objective=weighted_rmse_obj, n_estimators=100, max_depth=5, learning_rate=0.1, early_stopping_rounds=10, eval_metric=weighted_rmse)


multi_output_model = GridSearchCV(estimator=xgb_model, param_grid={'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.01, 0.001]}, cv=5, n_jobs=-1)

# Wrap it with MultiOutputRegressor
multi_output_model = MultiOutputRegressor(multi_output_model)

# Fit the model
multi_output_model.fit(X_train, y_train, eval_set=[(X_test, y_test[0]),(X_test, y_test[1]),(X_test, y_test[2])])

# Predict on the test data
y_pred = multi_output_model.predict(X_test)

# Calculate Mean Squared Error for each output dimension
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
print(f'MSE for each target: {mse}')

[0]	validation_0-rmse:0.34703	validation_0-weighted_rmse:0.51525	validation_1-rmse:0.28240	validation_1-weighted_rmse:0.42143	validation_2-rmse:0.31237	validation_2-weighted_rmse:0.45971[0]	validation_0-rmse:0.34564	validation_0-weighted_rmse:0.51273	validation_1-rmse:0.28748	validation_1-weighted_rmse:0.42840	validation_2-rmse:0.31246	validation_2-weighted_rmse:0.46065

[0]	validation_0-rmse:0.34780	validation_0-weighted_rmse:0.51741	validation_1-rmse:0.27678	validation_1-weighted_rmse:0.41226	validation_2-rmse:0.30328	validation_2-weighted_rmse:0.44698
[0]	validation_0-rmse:0.34736	validation_0-weighted_rmse:0.51693	validation_1-rmse:0.27739	validation_1-weighted_rmse:0.41341	validation_2-rmse:0.31171	validation_2-weighted_rmse:0.45976
[1]	validation_0-rmse:0.35257	validation_0-weighted_rmse:0.52290	validation_1-rmse:0.27621	validation_1-weighted_rmse:0.41180	validation_2-rmse:0.31439	validation_2-weighted_rmse:0.46143
[1]	validation_0-rmse:0.34813	validation_0-weighted_rmse:0.51506	

array([[0.18513293, 0.54190095, 0.87294584]])

In [26]:
y

Unnamed: 0,0,1,2
0,0.185133,0.541901,0.872946
1,0.732225,0.806561,0.658783
2,0.692277,0.849196,0.249668
3,0.489425,0.221209,0.987668
4,0.944059,0.039427,0.705575
...,...,...,...
95,0.517712,0.087866,0.350627
96,0.033203,0.078578,0.396923
97,0.132716,0.567541,0.689465
98,0.800587,0.200150,0.167483


In [28]:
y[2]

0     0.872946
1     0.658783
2     0.249668
3     0.987668
4     0.705575
        ...   
95    0.350627
96    0.396923
97    0.689465
98    0.167483
99    0.706476
Name: 2, Length: 100, dtype: float64

In [None]:
import pandas as pd

In [3]:
def aggregate_data_by_dtype(agg_dict, data: pd.DataFrame = None, freq='1D', flatten=True) -> pd.DataFrame:   
    """
    Agrège les données d'un DataFrame selon une fréquence spécifiée et un dictionnaire d'agrégations
    basé sur le type de données.

    Parameters:
    - data (pd.DataFrame): Le DataFrame contenant les données temporelles.
    - freq (str): La fréquence d'agrégation (e.g., 'D' pour journalier, 'H' pour horaire, etc.).
    - agg_dict (dict): Dictionnaire où chaque clé est un type de donnée (e.g., 'float', 'int') et la valeur est une liste de fonctions d'agrégation.

    Returns:
    - pd.DataFrame: Le DataFrame agrégé avec un MultiIndex pour les colonnes.
    """
    
    # Vérifier que le DataFrame est fourni
    if data is None:
        data = self.data.copy()

    # Vérifier que le DataFrame a un index de type DateTime
    if not isinstance(data.index, pd.DatetimeIndex):
        raise ValueError("Le DataFrame doit avoir un index de type DateTime.")

    # Créer un dictionnaire pour stocker les agrégations par colonne
    column_agg_dict = {}

    # Pour chaque type dans agg_dict, on applique les fonctions d'agrégation aux colonnes correspondantes
    for dtype, funcs in agg_dict.items():
        matching_columns = data.select_dtypes(include=[dtype]).columns
        for col in matching_columns:
            column_agg_dict[col] = funcs

    # Fonction d'application unifiée pour gérer les différentes fonctions d'agrégation
    def unified_apply_func(series, funcs):
        """
        Applique une liste de fonctions à une série et retourne une Series ou un DataFrame.
        """
        results = {}
        for func in funcs:
            # self.logger.info(f"Applying {func} to {series.name}...")
            if callable(func):
                # print("callable: ", func)
                result = func(series)
                # print(result)
                if isinstance(result, pd.Series):
                    # print("isinstance: series")
                    # Renommer les résultats de la série pour les inclure dans le MultiIndex
                    results.update(result.rename(lambda x: f"{func.__name__}__{x}"))
                else:
                    results[func.__name__] = result
            else:
                # print("not callable: ", func)
                result = getattr(series, func)()
                # print(result)
                results[func] = result
        return pd.Series(results)

    # Appliquer les agrégations à chaque groupe
    def apply_aggregations(group):
        aggregated = {}
        for col, funcs in column_agg_dict.items():
            # print(group[col])
            # self.logger.info(f"Aggregating column {col}...")
            aggregated[col] = unified_apply_func(group[col], funcs)
        
            concat = pd.concat(aggregated, axis=0)
        return concat
    
    grouper = pd.Grouper(freq=freq)
    grouped_data = data.groupby(grouper)

    # Appliquer les agrégations
    aggregated_df = grouped_data.apply(apply_aggregations)

    
    if not isinstance(aggregated_df, pd.DataFrame):
        aggregated_df = aggregated_df.to_frame().unstack(level=[1, 2]).droplevel(0, axis=1)

    if flatten:
        aggregated_df.columns = ['__'.join(col).strip() for col in aggregated_df.columns.values]

    return aggregated_df

In [4]:
df = pd.read_feather('/home/maxime/Documents/WORKSPACES/forecasting_models/data.feather')
df

In [7]:
def relative_frequency(series: pd.Series) -> pd.Series:
    """
    Calcule la fréquence relative (proportion) de chaque catégorie dans une série catégorielle.
    
    Parameters:
    - series (pd.Series): La série catégorielle ou de type objet à analyser.
    
    Returns:
    - pd.Series: Fréquences relatives de chaque catégorie.
    """
    return series.value_counts(normalize=True) * 100

In [9]:
aggregate_data_by_dtype(agg_dict={'number': ['mean', 'min', 'max', 'std'], 'category': [relative_frequency]}, data=df, freq='2YE', flatten=True)

Unnamed: 0_level_0,O3_FR26005__mean,O3_FR26005__min,O3_FR26005__max,O3_FR26005__std,O3_FR26010__mean,O3_FR26010__min,O3_FR26010__max,O3_FR26010__std,PM10_FR26005__mean,PM10_FR26005__min,...,ramadan_J-3__relative_frequency__0,ramadan_J-3__relative_frequency__1,ramadan_J-4__relative_frequency__0,ramadan_J-4__relative_frequency__1,ramadan_J-5__relative_frequency__0,ramadan_J-5__relative_frequency__1,ramadan_J-6__relative_frequency__0,ramadan_J-6__relative_frequency__1,ramadan_J-7__relative_frequency__0,ramadan_J-7__relative_frequency__1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-31,59.767123,4.0,122.0,24.778536,63.624658,4.0,131.0,23.212917,16.290411,0.0,...,91.712707,8.287293,91.689751,8.310249,91.666667,8.333333,91.643454,8.356546,91.620112,8.379888
2021-12-31,54.220246,0.0,115.0,23.470983,60.153215,5.0,125.0,23.803443,15.742134,0.0,...,91.792066,8.207934,91.792066,8.207934,91.792066,8.207934,91.792066,8.207934,91.792066,8.207934
2023-12-31,58.317123,6.0,125.0,22.910396,64.056164,7.0,140.0,23.607226,14.935616,0.0,...,91.780822,8.219178,91.780822,8.219178,91.780822,8.219178,91.780822,8.219178,91.780822,8.219178
