In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from scipy import stats
import matplotlib.pyplot as plt

In [9]:
def predict_and_evaluate_models(dataframe, target, features, source, nrmse_results_dict, prediction_results_folder):
    df_copy = dataframe.copy()
    X = df_copy[features]
    y = df_copy[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

    models = {
        "RandomForestRegressor": RandomForestRegressor(max_depth=10, n_estimators=200, random_state=42),
        "LinearRegression": LinearRegression(),
        "PolynomialRegression": make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
        "GradientBoostingRegressor": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
        "AdaBoostRegressor": AdaBoostRegressor(n_estimators=100, random_state=42),
        "XGBRegressor": XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
        "MLPRegressor": MLPRegressor(hidden_layer_sizes=(128, 64, 32), activation='relu', solver='adam', max_iter=1000, random_state=42),
        "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42),
        "KNeighborsRegressor": KNeighborsRegressor(n_neighbors=3),
        "CatBoostRegressor": CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6, random_state=42, verbose=False),
        "LGBMRegressor": LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=-1, random_state=42),
        "SVR": SVR(kernel='rbf', C=1.0, epsilon=0.1),
    }

    predictions = {
        "y_test": y_test.values
    }

    if target not in nrmse_results_dict:
        nrmse_results_dict[target] = []

    nrmse_results = {"source": source}  

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)


        predictions[f"y_predict_{model_name}"] = y_pred


        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        rmse_normalized = rmse / np.mean(y_test)

        nrmse_results[model_name] = rmse_normalized

    nrmse_results_dict[target].append(nrmse_results)


    predictions_df = pd.DataFrame(predictions)
    prediction_file_path = os.path.join(prediction_results_folder, f"{target}_{source}.csv")
    os.makedirs(prediction_results_folder, exist_ok=True)
    predictions_df.to_csv(prediction_file_path, index=False)
    return nrmse_results_dict

def save_all_nrmse_results(nrmse_results_dict, output_folder_base):
    os.makedirs(output_folder_base, exist_ok=True)
    for target, results in nrmse_results_dict.items():
        nrmse_results_df = pd.DataFrame(results)
        output_file_nrmse = os.path.join(output_folder_base, f"regression_{target}_nrmse.csv")
        nrmse_results_df.to_csv(output_file_nrmse, index=False)


def combine_datasets(path, substring):
    files = [f for f in os.listdir(path) if substring in f and f.endswith('.csv')]
    if not files:
        print(f"No files containing the substring '{substring}' were found.")
        return None
    
    dataframe_list = []
    for file in files:
        file_path = os.path.join(path, file)
        df = pd.read_csv(file_path)
        dataframe_list.append(df)
    
    combined_df = pd.concat(dataframe_list, ignore_index=True)
    return combined_df

def remove_outliers(dataframe, columns, n_std=3):
    df_clean = dataframe.copy()
    
    original_size = len(df_clean)
    mask = np.ones(len(df_clean), dtype=bool)
    
    for column in columns:
        if df_clean[column].dtype in ['int64', 'float64']:
            z_scores = np.abs(stats.zscore(df_clean[column], nan_policy='omit'))
            mask = mask & (z_scores < n_std)
    
    df_clean = df_clean[mask]
    
    removed_rows = original_size - len(df_clean)
    removal_percentage = (removed_rows / original_size) * 100
    #print(f"Removed {removed_rows} rows ({removal_percentage:.2f}%) as outliers from {dataframe.shape}")
    
    return df_clean


path = '../datasets/pos-process'
dataframes_by_source = {}

columns_to_check = ['Vazao_bbr', 'Vazao_cubic', 'Atraso(ms)', 'Hop_count']

for dirs, root, files in os.walk(path):
    for file in files:
        if file.endswith('.csv'):
            source = '-' + file.split('-')[2] + '-'
            df = combine_datasets(path, source)
            if df is not None and len(df)>300:
                df_clean = remove_outliers(df, columns_to_check)
                key_name = f"{source.strip('-')}"
                dataframes_by_source[key_name] = df_clean


target_columns = ['Vazao_bbr']
features = ['Atraso(ms)', 'Hop_count', 'Bottleneck']
folder_prediction = '../results/predictions-by-source'
folder_nrmse = '../results/nrmse'
prediction_results_folder = os.path.join(os.getcwd(), folder_prediction)
output_folder_base = os.path.join(os.getcwd(), folder_nrmse)

nrmse_results_dict = {}


for target in target_columns:
    for source, dataframe in dataframes_by_source.items():
        nrmse_results_dict = predict_and_evaluate_models(
            dataframe,
            target,
            features,
            source,
            nrmse_results_dict,
            prediction_results_folder
        )
save_all_nrmse_results(nrmse_results_dict, output_folder_base)

Removed 371 rows (4.47%) as outliers from (8297, 7)
Removed 495 rows (5.24%) as outliers from (9442, 7)
Removed 520 rows (5.59%) as outliers from (9297, 7)
Removed 150 rows (2.04%) as outliers from (7350, 7)
Removed 93 rows (6.32%) as outliers from (1471, 7)
Removed 414 rows (5.11%) as outliers from (8105, 7)
Removed 700 rows (7.06%) as outliers from (9908, 7)
Removed 321 rows (4.20%) as outliers from (7648, 7)
Removed 491 rows (5.57%) as outliers from (8817, 7)
Removed 529 rows (5.49%) as outliers from (9644, 7)
Removed 491 rows (5.57%) as outliers from (8817, 7)
Removed 700 rows (7.06%) as outliers from (9908, 7)
Removed 658 rows (7.29%) as outliers from (9030, 7)
Removed 95 rows (1.51%) as outliers from (6303, 7)
Removed 95 rows (1.51%) as outliers from (6303, 7)
Removed 167 rows (4.13%) as outliers from (4041, 7)
Removed 495 rows (5.24%) as outliers from (9442, 7)
Removed 478 rows (6.76%) as outliers from (7074, 7)
Removed 532 rows (5.49%) as outliers from (9697, 7)
Removed 272 row



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000092 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 274
[LightGBM] [Info] Number of data points in the train set: 7157, number of used features: 3
[LightGBM] [Info] Start training from score 1283319103.111499
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000118 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 273
[LightGBM] [Info] Number of data points in the train set: 7021, number of used features: 3
[LightGBM] [Info] Start training from score 925209473.713716




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000248 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 278
[LightGBM] [Info] Number of data points in the train set: 5760, number of used features: 3
[LightGBM] [Info] Start training from score 753303584.450000




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000083 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 273
[LightGBM] [Info] Number of data points in the train set: 1102, number of used features: 3
[LightGBM] [Info] Start training from score 1386965375.013612




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000208 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 273
[LightGBM] [Info] Number of data points in the train set: 6152, number of used features: 3
[LightGBM] [Info] Start training from score 899773099.412224




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000177 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 277
[LightGBM] [Info] Number of data points in the train set: 7366, number of used features: 3
[LightGBM] [Info] Start training from score 1700035829.615938




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000142 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 275
[LightGBM] [Info] Number of data points in the train set: 5861, number of used features: 3
[LightGBM] [Info] Start training from score 899515289.551954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000110 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 272
[LightGBM] [Info] Number of data points in the train set: 6660, number of used features: 3
[LightGBM] [Info] Start training from score 907561285.304505
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000114 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 275
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000070 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 274
[LightGBM] [Info] Number of data points in the train set: 6697, number of used features: 3
[LightGBM] [Info] Start training from score 1703599430.389876




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000114 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 275
[LightGBM] [Info] Number of data points in the train set: 4966, number of used features: 3
[LightGBM] [Info] Start training from score 2304445780.106122




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 274
[LightGBM] [Info] Number of data points in the train set: 3099, number of used features: 3
[LightGBM] [Info] Start training from score 903579885.991610




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 271
[LightGBM] [Info] Number of data points in the train set: 5276, number of used features: 3
[LightGBM] [Info] Start training from score 929666259.105383




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 273
[LightGBM] [Info] Number of data points in the train set: 7332, number of used features: 3
[LightGBM] [Info] Start training from score 1645226594.794463




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000103 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 274
[LightGBM] [Info] Number of data points in the train set: 6521, number of used features: 3
[LightGBM] [Info] Start training from score 920908065.486889




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000161 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 272
[LightGBM] [Info] Number of data points in the train set: 5941, number of used features: 3
[LightGBM] [Info] Start training from score 924676881.333109




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000124 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 273
[LightGBM] [Info] Number of data points in the train set: 6812, number of used features: 3
[LightGBM] [Info] Start training from score 905990753.700529




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000111 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 269
[LightGBM] [Info] Number of data points in the train set: 748, number of used features: 3
[LightGBM] [Info] Start training from score 900159028.534759




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 277
[LightGBM] [Info] Number of data points in the train set: 6820, number of used features: 3
[LightGBM] [Info] Start training from score 896773285.245748




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 276
[LightGBM] [Info] Number of data points in the train set: 6864, number of used features: 3
[LightGBM] [Info] Start training from score 1329494745.037442




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 272
[LightGBM] [Info] Number of data points in the train set: 5731, number of used features: 3
[LightGBM] [Info] Start training from score 927102669.378468




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 276
[LightGBM] [Info] Number of data points in the train set: 5176, number of used features: 3
[LightGBM] [Info] Start training from score 1538208284.525696
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000069 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 273
[LightGBM] [Info] Number of data points in the train set: 7522, number of used features: 3
[LightGBM] [Info] Start training from score 908615753.486839
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 4, number of used features: 0
[LightGBM] [Info] Start training from score 847258080.000000


