In [3]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
def combine_datasets(path, substring):
    """Stack every CSV in ``path`` whose file name contains ``substring``.

    Only the directory itself is scanned (no recursion into sub-directories).

    Parameters
    ----------
    path : str
        Directory holding the CSV files.
    substring : str
        Text that must occur in a file name for the file to be included.

    Returns
    -------
    pandas.DataFrame or None
        The rows of all matching files concatenated under a fresh
        0..n-1 index, or ``None`` (after printing a notice) when no file
        name matches.
    """
    # os.listdir yields names in arbitrary order; sorting makes the row order
    # of the combined frame reproducible from one run/machine to the next.
    matching_names = sorted(
        name for name in os.listdir(path)
        if substring in name and name.endswith('.csv')
    )

    if not matching_names:
        print(f"No files containing the substring '{substring}' were found.")
        return None

    frames = [pd.read_csv(os.path.join(path, name)) for name in matching_names]
    return pd.concat(frames, ignore_index=True)

path = '../datasets/pos-process'
dataframes_by_source = {}

# Pass 1: gather the distinct source tags. The tag is taken as the third
# dash-separated piece of the file name; CSVs with fewer than three pieces
# are skipped instead of raising IndexError.
# os.walk yields (dirpath, dirnames, filenames) in that order.
source_names = set()
for root, dirs, files in os.walk(path):
    for file in files:
        name_parts = file.split('-')
        if file.endswith('.csv') and len(name_parts) > 2:
            source_names.add(name_parts[2])

# Pass 2: load each source exactly once (previously every CSV file triggered a
# full re-read of all files of its source). The dict key is the bare tag; the
# surrounding dashes are only used for the file-name match.
for source_name in sorted(source_names):
    df = combine_datasets(path, f'-{source_name}-')

    if df is not None:
        dataframes_by_source[source_name] = df

In [13]:

rmse_bbr = pd.read_csv('../results/nrmse/regression_Vazao_bbr_nrmse.csv')

protocol = 'Vazao_bbr'
rmse = rmse_bbr

correlations_by_model = {}

# Coefficient of variation (std / mean, in percent) of the `protocol` column
# for every loaded source. The name `variances` is kept as-is because it is a
# notebook-level variable.
variances = {
    key: (dataset[protocol].std() / dataset[protocol].mean()) * 100
    for key, dataset in dataframes_by_source.items()
}

# Fail with a readable message if the results file mentions a source for
# which no dataset was loaded (this used to be a bare KeyError from a lambda).
missing_sources = sorted(set(rmse['source']) - set(variances), key=str)
if missing_sources:
    raise KeyError(f"No dataset loaded for source(s): {missing_sources}")

# Loop-invariant: the coefficient of variation aligned to the rows of `rmse`.
variance_coef_by_row = rmse['source'].map(variances)

# Every column except 'source' holds one model's error values.
models = rmse.columns.drop('source')

for model in models:
    df_model = pd.DataFrame({
        'Source': rmse['source'],
        'RMSE': rmse[model],
        'Variance_coef': variance_coef_by_row,
    })
    # Off-diagonal entry of the 2x2 correlation matrix.
    correlations_by_model[model] = df_model[['RMSE', 'Variance_coef']].corr().iloc[0, 1]

# Show the correlations
print("Correlation between high RMSE and dataset variance for each model:")
for model, correlation in correlations_by_model.items():
    print(f"{model}: {correlation:.3f}")

# Average over however many models are present, without rebinding the builtin
# `sum` or hard-coding the model count. skipna=False keeps a NaN correlation
# visible in the average, as the manual accumulation did.
mean_correlation = pd.Series(correlations_by_model, dtype=float).mean(skipna=False)
print(mean_correlation)

Correlation between high RMSE and dataset variance for each model:
RandomForestRegressor: 0.990
LinearRegression: 0.995
PolynomialRegression: 0.991
GradientBoostingRegressor: 0.992
AdaBoostRegressor: 0.990
XGBRegressor: 0.992
MLPRegressor: 0.864
ElasticNet: 0.995
KNeighborsRegressor: 0.991
CatBoostRegressor: 0.992
LGBMRegressor: 0.990
SVR: 0.997
0.9815654200293608


In [14]:
# Descriptive statistics of the NRMSE, one row per model

pd.set_option('display.float_format', lambda value: '%.3f' % value)

# Reshape to long format: one row per (source, model) pair.
rmse_melted = pd.melt(
    rmse,
    id_vars=['source'],
    var_name='Model',
    value_name='RMSE',
)

# Output column label -> aggregation applied to each model's values.
summary_spec = {
    'Média': 'mean',
    'Mediana': 'median',
    'Desvio Padrão': 'std',
    'Mínimo': 'min',
    'Máximo': 'max',
}
per_model_values = rmse_melted.groupby('Model')['RMSE']
rmse_stats = per_model_values.agg(**summary_spec).round(4)

print("\nEstatísticas Descritivas do NRMSE por Modelo:")
print(rmse_stats.to_string())




Estatísticas Descritivas do NRMSE por Modelo:
                           Média  Mediana  Desvio Padrão  Mínimo  Máximo
Model                                                                   
AdaBoostRegressor          0.268    0.059          0.323   0.036   0.936
CatBoostRegressor          0.222    0.051          0.262   0.031   0.705
ElasticNet                 0.293    0.061          0.361   0.038   1.019
GradientBoostingRegressor  0.231    0.051          0.274   0.032   0.738
KNeighborsRegressor        0.247    0.057          0.287   0.036   0.775
LGBMRegressor              0.224    0.051          0.263   0.032   0.720
LinearRegression           0.293    0.061          0.361   0.038   1.019
MLPRegressor               0.344    0.092          0.361   0.038   1.038
PolynomialRegression       0.271    0.056          0.326   0.036   0.931
RandomForestRegressor      0.228    0.052          0.267   0.032   0.722
SVR                        0.355    0.066          0.444   0.040   1.258
XGBR