## Single ID Case

### load scripts

In [1]:
import os

# Cap every native thread pool at a single thread. These variables are read when
# the numerical libraries initialise, so they are set BEFORE the imports below
# load that stack (the cell output shows TensorFlow starting up during import).
# Setting them after the imports, as this cell did before, is not reliable.
os.environ['OMP_NUM_THREADS'] = '1'         # OpenMP
os.environ['OPENBLAS_NUM_THREADS'] = '1'    # OpenBLAS
os.environ['MKL_NUM_THREADS'] = '1'         # Intel MKL (if used)
os.environ['NUMEXPR_NUM_THREADS'] = '1'     # NumExpr
os.environ['VECLIB_MAXIMUM_THREADS'] = '1'  # macOS Accelerate

from scripts.shared_imports import *

# Print the working directory so it can be checked before any relative path is used
print("Current working directory:", os.getcwd())

# Import all project modules used by the single-ID cells
from scripts.get_data import get_dataset_settings_singleID, preprocess_data_singleID
from Wrapper.wrapper import DRFWrapper, MLPRegressorWrapper
from scripts.utils import *
from scripts.train_and_evaluate_singleID import (
    append_result, evaluate_and_append_models_singleID, create_cv_folds_singleID, preprocess_per_instance_singleID
)
from scripts.process_target import process_target_singleID

import scripts.config as config
from scripts.globals import global_fold_scores, global_cv_results, drf_cv_results

# Reload the config module so edits to it are picked up without a kernel restart
# (only `config` is reloaded here, not the other project modules).
importlib.reload(config)


2025-01-22 18:32:14.631297: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-22 18:32:14.633510: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-22 18:32:14.671610: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-22 18:32:14.672693: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Current working directory: /workspaces/Masterthesis-DRF


### get data

-> Choose the dataset (`dataset_name`) in the config file `scripts/config.py` before running this cell.

In [2]:
# Pick up the dataset currently selected in the config module.
importlib.reload(config)
dataset_name = config.dataset_name

# Google Drive file IDs of the datasets available for the single-ID case.
drive_file_ids = {
    'bakery': '1r_bDn9Z3Q_XgeTTkJL7352nUG3jkUM0z',
    'yaz': '1xrY3Uv5F9F9ofgSM7dVoSK4bE0gPMg36',
    'm5': '1tCBaxOgE5HHllvLVeRC18zvALBz6B-6w',
    'air': '1SKPpNxulcusNTjRwCC0p3C_XW7aNBNJZ',
    'wage': '1bn7E7NOoRzE4NwXXs1MYhRSKZHC13qYU',
}
file_id = drive_file_ids[config.dataset_name]

# Download the CSV into the working directory, then read it back.
output = f"{dataset_name}.csv"
url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(url, output, quiet=False)
data = pd.read_csv(output)

# Dataset-specific column settings consumed by the preprocessing step.
settings = get_dataset_settings_singleID(data)[dataset_name]

(
    y,
    train_data,
    test_data,
    X_train_features,
    X_test_features,
    y_train,
    y_test,
) = preprocess_data_singleID(
    data,
    settings['backscaling_columns'],
    settings['bool_columns'],
    settings['drop_columns'],
)

# Short size summary of the resulting split.
print("Data count Overview for Dataset",dataset_name )
print(f"total training data: {len(train_data)}")
print(f"total test data: {len(test_data)}")
print("targets:", len(y_train.columns))
print("N per Instance", len(y))


Downloading...
From: https://drive.google.com/uc?id=1xrY3Uv5F9F9ofgSM7dVoSK4bE0gPMg36
To: /workspaces/Masterthesis-DRF/yaz.csv
100%|██████████| 3.13M/3.13M [00:00<00:00, 69.3MB/s]

Data count Overview for Dataset yaz
total training data: 4627
total test data: 532
targets: 7
N per Instance 737





### process - DDOP Models + Levelset Estimator Models

Calculates the results for the DDOP models and the level-set estimator models.

In [None]:

importlib.reload(config)

# Imported under an alias: the previous `import scripts.globals as globals`
# replaced the built-in globals() for the rest of the notebook session.
import scripts.globals as run_globals

# Column layout shared by every result table built in this cell.
result_columns = ['Variable', 'cu', 'co', 'Model', 'Pinball Loss', 'Best Params', 'delta C', 'sl']

# (cu, co) = underage / overage cost pairs; tau = cu / (cu + co) is passed on as the quantile level.
combinations = [(9, 1), (7.5, 2.5), (5, 5), (2.5, 7.5), (1, 9)]
table_rows = []
random_state = 42
cvFolds = None  # not read in this cell; kept because the cell defined it before

for cu, co in combinations:
    print(f"Processing cu, co combination: cu={cu}, co={co}")
    tau = cu / (cu + co)

    # One task per target column; with n_jobs=1 the columns run one after another.
    column_results = Parallel(n_jobs=1)(
        delayed(process_target_singleID)(column, cu, co, tau, y_train, X_train_features, X_test_features, y_test, random_state)
        for column in y_train.columns
    )

    # Collect the rows of every column. Only the rows that were just added are
    # shown; dumping the full accumulated list and table after each column
    # flooded the cell output.
    for result in column_results:
        table_rows.extend(result)
        print(pd.DataFrame(result, columns=result_columns))

# Final result table after processing all combinations
result_table = pd.DataFrame(table_rows, columns=result_columns)

# Folder where results are saved; created if missing (no error if it already exists)
results_folder = "results"
os.makedirs(results_folder, exist_ok=True)

# Save the result table
filename = os.path.join(results_folder, f"results_basic_Models_{dataset_name}.csv")
result_table.to_csv(filename, index=False)

# Aggregate and save cross-validation results at the end of the entire workflow.
# NOTE(review): the list in scripts.globals is never reset in this cell, so rerunning
# the cell in the same kernel presumably accumulates entries from earlier runs - confirm.
if run_globals.global_cv_results:
    aggregated_cv_results_df = pd.concat(run_globals.global_cv_results, ignore_index=True)
    aggregated_cv_filename = os.path.join(results_folder, f"cv_scores_basic_models_{dataset_name}.csv")
    aggregated_cv_results_df.to_csv(aggregated_cv_filename, index=False)

Processing cu, co combination: cu=9, co=1
Test length for column: 39 6 % of the actual ID: 661
Running model MLP for column calamari, cu=9, co=1
Evaluating model: MLP, cu: 9, co: 1
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


### process - DRF + Levelset Models

-> Calculates the results for the DRF and uses the tuned hyperparameters of the Levelset estimators to create the LSX results.

In [3]:
importlib.reload(config)
import scripts.globals as run_globals

timeseries = True

# Column layout shared by every result table built in this cell.
result_columns = ['Variable', 'cu', 'co', 'Model', 'Pinball Loss', 'Best Params', 'delta C', 'sl']

# Execution starts here
combinations = [(1, 9)]
table_rows = []
random_state = 1

# Reset the cross-validation accumulators IN PLACE. They live in scripts.globals
# and are presumably appended to by the evaluation helpers (confirm in
# scripts/train_and_evaluate_singleID.py). Rebinding the notebook names to new
# lists (`global_fold_scores = []`) left the notebook checking lists nobody
# writes to, so the CV files below were never produced.
run_globals.drf_cv_results.clear()
run_globals.global_fold_scores.clear()

# Iterate over combinations and process them directly
for cu, co in combinations:
    print(f"Processing cu, co combination: cu={cu}, co={co}")
    tau = cu / (cu + co)
    with threadpool_limits(limits=1):
        for column in y_train.columns:
            print(f"Processing column: {column}")

            # Preprocess data for this target column
            X_train_scaled, X_test_scaled, y_train_col, y_test_col, X_train_scaled_withID = preprocess_per_instance_singleID(
                column, X_train_features, X_test_features, y_train, y_test
            )
            create_cv_folds_singleID(X_train_scaled_withID)

            print(X_test_scaled.shape)
            print(y_test_col.shape)

            # SAA baseline; its pinball loss is passed to every model evaluation below
            saa_model = SampleAverageApproximationNewsvendor(cu, co)
            saa_pred = saa_model.fit(y_train_col).predict(X_test_scaled.shape[0])
            saa_pinball_loss = pinball_loss(y_test_col.values.flatten(), saa_pred, tau)
            append_result(table_rows, column, cu, co, 'SAA', saa_pinball_loss, 'N/A', np.nan, tau)

            if timeseries:
                # Base estimators wrapped by the level-set models
                lgbm_model = LGBMRegressor(random_state=random_state, n_jobs=1, verbosity=-1)
                mlp_model = MLPRegressorWrapper(random_state=random_state, early_stopping=True)

                # Evaluate the LGBM-based level-set model (group-split grid)
                lgbm_model_params = get_grid('LevelSetKDEx_groupsplit', X_train_scaled.shape[1])
                lgbm_model_evaluation = [
                    ('LS_KDEx_LGBM', LevelSetKDEx(estimator=lgbm_model, binSize=100, weightsByDistance=False), lgbm_model_params)
                ]
                evaluate_and_append_models_singleID(lgbm_model_evaluation, X_train_scaled, X_test_scaled, y_train_col, y_test_col, saa_pinball_loss, tau, cu, co, column, table_rows, timeseries)

                # Evaluate the MLP-based level-set model (group-split grid)
                mlp_model_params = get_grid('LevelSetKDEx_groupsplit', X_train_scaled.shape[1])
                mlp_model_evaluation = [
                    ('LS_KDEx_MLP', LevelSetKDEx(estimator=mlp_model, binSize=100, weightsByDistance=False), mlp_model_params)
                ]
                evaluate_and_append_models_singleID(mlp_model_evaluation, X_train_scaled, X_test_scaled, y_train_col, y_test_col, saa_pinball_loss, tau, cu, co, column, table_rows, timeseries)
            else:
                # Placeholders resulting from previous changes - can be ignored.
                # Separate CV for non time series (Wage Dataset) moved to the create_cv_folds function.
                print("set timeseries varible to True")

            # The DRF model is always evaluated
            drf_model = DRFWrapper(min_node_size=10, num_trees=100, num_threads=1)
            drf_grid = get_grid('DRF', X_train_scaled.shape[1])
            evaluate_and_append_models_singleID([('DRF', drf_model, drf_grid)], X_train_scaled, X_test_scaled, y_train_col, y_test_col, saa_pinball_loss, tau, cu, co, column, table_rows, timeseries)

            # Show the last 5 rows of the table after each column is processed
            second_result_table = pd.DataFrame(table_rows, columns=result_columns)
            print(second_result_table.tail(5))


# Folder where results are saved; created if missing (no error if it already exists)
results_folder = "results"
os.makedirs(results_folder, exist_ok=True)

# Final result table after processing all combinations
second_result_table = pd.DataFrame(table_rows, columns=result_columns)

# Construct the filename and save it in the "results" folder
filename = os.path.join(results_folder, f"results_LevelsetModels_{dataset_name}.csv")
second_result_table.to_csv(filename, index=False)

print(f"Results saved as {filename}")

# Re-read the accumulators from the module so the notebook names refer to the
# lists that were filled during the loop above.
global_fold_scores = run_globals.global_fold_scores
drf_cv_results = run_globals.drf_cv_results

# Aggregate fold-wise cross-validation results of the level-set models
if global_fold_scores:
    # Reset the multi-index so that binSize and weightsByDistance become normal columns
    global_fold_scores_flat = [fold_scores_df.reset_index() for fold_scores_df in global_fold_scores]

    # Concatenate all fold-wise cross-validation results into a single DataFrame
    aggregated_fold_scores_df = pd.concat(global_fold_scores_flat, ignore_index=True)

    # Save the aggregated results to a CSV file in the "results" folder
    aggregated_fold_scores_filename = os.path.join(results_folder, f"cv_scores_levelset_models_{dataset_name}.csv")
    aggregated_fold_scores_df.to_csv(aggregated_fold_scores_filename, index=False)

# DRF cross-validation results are saved independently of the level-set results.
# The guard avoids pd.concat raising ValueError on an empty list.
if drf_cv_results:
    aggregated_drf_cv_results_df = pd.concat(drf_cv_results, ignore_index=True)

    # Save the aggregated DRF results to a CSV file in the "results" folder
    aggregated_cv_filename = os.path.join(results_folder, f"cv_drf_scores_{dataset_name}.csv")
    aggregated_drf_cv_results_df.to_csv(aggregated_cv_filename, index=False)


Processing cu, co combination: cu=1, co=9
Processing column: calamari
Test length for column: 39 6 % of the actual ID: 661
(76, 57)
(76,)
Evaluating model: LS_KDEx_LGBM, cu: 1, co: 9
Performing LevelSetKDEx TimeSeries cross-validation...
Evaluating model: LS_KDEx_MLP, cu: 1, co: 9
Performing LevelSetKDEx TimeSeries cross-validation...
Evaluating model: DRF, cu: 1, co: 9
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[0.9]
[0.9]
[0.9]
[0.9]
[0.9]
[0.9]
[0.9]
[0.9]
[0.9]
[0.9]
[0.9]


RRuntimeError: Error: Prediction for sample 270 did not have the expected length.


## FULL Dataset Cases


In [4]:
import os

# Cap every native thread pool at a single thread. These variables are read when
# the numerical libraries initialise, so they are set BEFORE the imports below.
# In a kernel where those libraries are already loaded, changing them has no
# reliable effect; the processing cells also use threadpool_limits at run time.
os.environ['OMP_NUM_THREADS'] = '1'         # limit OpenMP to 1 thread
os.environ['OPENBLAS_NUM_THREADS'] = '1'    # for OpenBLAS
os.environ['MKL_NUM_THREADS'] = '1'         # for Intel MKL (if used)
os.environ['NUMEXPR_NUM_THREADS'] = '1'     # for NumExpr
os.environ['VECLIB_MAXIMUM_THREADS'] = '1'  # for macOS Accelerate

from scripts.shared_imports import *

# Print the working directory so it can be checked before any relative path is used
print("Current working directory:", os.getcwd())

# Import all project modules used by the full-dataset cells
from scripts.get_data import get_dataset_settings_alldata, preprocess_data_alldata
from Wrapper.wrapper import DRFWrapper, MLPRegressorWrapper
from scripts.utils import *
from scripts.train_and_evaluate_alldata import (
    evaluate_and_append_models_alldata, create_cv_folds_alldata, preprocess_per_instance_alldata
)
from scripts.process_target import process_target_alldata

import scripts.config as config
from scripts.globals import global_fold_scores, global_cv_results, drf_cv_results

# Reload the config module so edits to it are picked up without a kernel restart
# (only `config` is reloaded here, not the other project modules).
importlib.reload(config)



Current working directory: /workspaces/Masterthesis-DRF


In [7]:
importlib.reload(config)
dataset_name = config.dataset_name

# Google Drive file ID of the selected dataset
file_id = {
    'subset_bakery': '1r_bDn9Z3Q_XgeTTkJL7352nUG3jkUM0z',
    'yaz': '1xrY3Uv5F9F9ofgSM7dVoSK4bE0gPMg36',
    'subset_m5': '1tCBaxOgE5HHllvLVeRC18zvALBz6B-6w',
    'subset_air': '1DMOaV92n3BFEGeCubaxEys2eLzg2Cic3',
    'wage': '1bn7E7NOoRzE4NwXXs1MYhRSKZHC13qYU',
}[config.dataset_name]


url = f"https://drive.google.com/uc?id={file_id}"


# Download the file into the working directory and load it
output = f"{dataset_name}.csv"
gdown.download(url, output, quiet=False)
data = pd.read_csv(output)
settings = get_dataset_settings_alldata(data)[dataset_name]

# Note: `data` and `dataset_name` are overwritten with the values returned by the
# preprocessing step.
y, train_data, test_data, X_train_features, X_test_features, y_train, y_test, data, dataset_name  = preprocess_data_alldata(
    data=data,
    dataset_name=dataset_name,
    bool_columns=settings['bool_columns'],
    drop_columns=settings['drop_columns'],
    drop_keywords=settings['drop_keywords'],
)

display(y_train.head(3))
print(f"Anzahl der Zeilen: {len(y)}")
# len() of y_train / y_test is a ROW count, not the number of target columns;
# the previous "Anzahl der targets" labels on both lines were misleading.
print("Anzahl der Trainingszeilen:", len(y_train))
print("Anzahl der Testzeilen:", len(y_test))

Downloading...
From: https://drive.google.com/uc?id=1xrY3Uv5F9F9ofgSM7dVoSK4bE0gPMg36
To: /workspaces/Masterthesis-DRF/yaz.csv
100%|██████████| 3.13M/3.13M [00:00<00:00, 107MB/s]


Unnamed: 0,dayIndex,is_holiday,is_closed,weekend,wind,clouds,rain,sunshine,label,scalingValue,demand__sum_values_7,demand__median_7,demand__mean_7,demand__standard_deviation_7,demand__variance_7,demand__root_mean_square_7,demand__maximum_7,demand__absolute_maximum_7,demand__minimum_7,demand__sum_values_14,demand__median_14,demand__mean_14,demand__standard_deviation_14,demand__variance_14,demand__root_mean_square_14,demand__maximum_14,demand__absolute_maximum_14,demand__minimum_14,demand__sum_values_28,demand__median_28,demand__mean_28,demand__standard_deviation_28,demand__variance_28,demand__root_mean_square_28,demand__maximum_28,demand__absolute_maximum_28,demand__minimum_28,demand,weekday_FRI,weekday_MON,weekday_SAT,weekday_SUN,weekday_THU,weekday_TUE,weekday_WED,month_APR,month_AUG,month_DEC,month_FEB,month_JAN,month_JUL,month_JUN,month_MAR,month_MAY,month_NOV,month_OCT,month_SEP,year_2013,year_2014,year_2015,item_calamari,item_chicken,item_fish,item_koefte,item_lamb,item_shrimp,item_steak,id_for_CV,dummyID
0,29.0,1.0,0.0,0.0,2.6,7.7,0.0,0.0,train,25.0,1.400000,0.200000,0.200000,0.052372,0.002743,0.206743,0.320000,0.320000,0.160000,3.080000,0.200000,0.220000,0.070508,0.004971,0.231023,0.400000,0.400000,0.120000,5.880000,0.200000,0.210000,0.080445,0.006471,0.224881,0.400000,0.400000,0.040000,0.000000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,calamari,dummyID
1,30.0,0.0,0.0,1.0,4.4,7.1,0.0,60.0,train,25.0,1.200000,0.160000,0.171429,0.087412,0.007641,0.192428,0.320000,0.320000,0.000000,2.800000,0.200000,0.200000,0.088156,0.007771,0.218567,0.400000,0.400000,0.000000,5.640000,0.200000,0.201429,0.089111,0.007941,0.220260,0.400000,0.400000,0.000000,1.000000,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,calamari,dummyID
2,31.0,0.0,0.0,1.0,4.8,3.0,0.0,258.0,train,25.0,2.040000,0.200000,0.291429,0.302156,0.091298,0.419796,1.000000,1.000000,0.000000,3.400000,0.200000,0.242857,0.220889,0.048792,0.328286,1.000000,1.000000,0.000000,6.320000,0.200000,0.225714,0.172118,0.029624,0.283851,1.000000,1.000000,0.000000,0.120000,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,calamari,dummyID
3,32.0,0.0,0.0,0.0,5.1,7.6,0.1,19.0,train,25.0,1.840000,0.160000,0.262857,0.307511,0.094563,0.404546,1.000000,1.000000,0.000000,3.280000,0.200000,0.234286,0.223150,0.049796,0.323552,1.000000,1.000000,0.000000,6.200000,0.200000,0.221429,0.173199,0.029998,0.281120,1.000000,1.000000,0.000000,0.000000,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,calamari,dummyID
4,33.0,0.0,0.0,0.0,3.5,4.4,0.3,314.0,train,25.0,1.640000,0.160000,0.234286,0.321019,0.103053,0.397420,1.000000,1.000000,0.000000,3.080000,0.180000,0.220000,0.231146,0.053429,0.319106,1.000000,1.000000,0.000000,6.040000,0.200000,0.215714,0.177712,0.031582,0.279489,1.000000,1.000000,0.000000,0.080000,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,calamari,dummyID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5078,685.0,0.0,0.0,0.0,2.5,7.8,1.0,74.0,train,82.0,1.731707,0.256098,0.247387,0.081342,0.006617,0.260417,0.378049,0.378049,0.109756,3.073171,0.207317,0.219512,0.074180,0.005503,0.231707,0.378049,0.378049,0.097561,5.853659,0.213415,0.209059,0.096560,0.009324,0.230282,0.402439,0.402439,0.012195,0.231707,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,steak,dummyID
5079,686.0,0.0,0.0,0.0,1.6,6.2,0.0,211.0,train,82.0,1.646341,0.231707,0.235192,0.076218,0.005809,0.247233,0.378049,0.378049,0.109756,3.036585,0.207317,0.216899,0.073051,0.005337,0.228870,0.378049,0.378049,0.097561,5.804878,0.213415,0.207317,0.095692,0.009157,0.228336,0.402439,0.402439,0.012195,0.280488,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,steak,dummyID
5080,687.0,0.0,0.0,0.0,1.9,1.8,0.0,566.0,train,82.0,1.817073,0.256098,0.259582,0.057094,0.003260,0.265787,0.378049,0.378049,0.182927,3.146341,0.225610,0.224739,0.073564,0.005412,0.236472,0.378049,0.378049,0.097561,5.975610,0.225610,0.213415,0.094716,0.008971,0.233488,0.402439,0.402439,0.012195,0.231707,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,steak,dummyID
5081,688.0,0.0,0.0,1.0,2.9,2.8,0.0,543.0,train,82.0,1.792683,0.231707,0.256098,0.057938,0.003357,0.262570,0.378049,0.378049,0.182927,3.121951,0.225610,0.222997,0.073088,0.005342,0.234668,0.378049,0.378049,0.097561,5.926829,0.225610,0.211672,0.093911,0.008819,0.231570,0.402439,0.402439,0.012195,0.256098,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,steak,dummyID


Unnamed: 0_level_0,dummyID
id_for_CV,Unnamed: 1_level_1
calamari,0.0
calamari,1.0
calamari,0.12


Anzahl der Zeilen: 5159
Anzahl der targets: 4627
Anzahl der targets: 532


### DDOP Models

In [None]:

# Execution starts here
import scripts.globals as run_globals

# Column layout shared by every result table built in this cell.
result_columns = ['Variable', 'cu', 'co', 'Model', 'Pinball Loss', 'Best Params', 'delta C', 'sl']

combinations = [(9, 1), (7.5, 2.5), (5, 5), (2.5, 7.5), (1, 9)]
table_rows = []
random_state = 1

# Reset the cross-validation accumulator IN PLACE. It lives in scripts.globals and
# is presumably appended to by process_target_alldata (confirm in
# scripts/process_target.py). Rebinding the notebook name (`global_cv_results = []`)
# left the notebook checking a list nobody writes to, so the CV scores below were
# never saved.
run_globals.global_cv_results.clear()

# Initialize cvFolds
cvFolds = None  # not read in this cell; kept because the cell defined it before

for cu, co in combinations:
    print(f"Processing cu, co combination: cu={cu}, co={co}")
    tau = cu / (cu + co)

    # One task per target column; with n_jobs=1 the columns run one after another.
    column_results = Parallel(n_jobs=1)(
        delayed(process_target_alldata)(column, cu, co, tau, y_train, X_train_features, X_test_features, y_test, random_state)
        for column in y_train.columns
    )

    # Collect the rows of every column. Only the rows that were just added are
    # shown; dumping the full accumulated list and table after each column
    # flooded the cell output.
    for result in column_results:
        table_rows.extend(result)
        print(pd.DataFrame(result, columns=result_columns))

# Final result table after processing all combinations
result_table = pd.DataFrame(table_rows, columns=result_columns)


# Save the result table (written to the working directory, as before)
filename = f"FULLDATASET_results_basic_Models_{dataset_name}.csv"
result_table.to_csv(filename, index=False)

print(f"Results saved as {filename}")

# Aggregate and save cross-validation results at the end of the entire workflow.
# Read from the module so the list filled during the loop is the one inspected.
global_cv_results = run_globals.global_cv_results
if global_cv_results:

    aggregated_cv_results_df = pd.concat(global_cv_results, ignore_index=True)

    print("Aggregated cross-validation results sample:")
    print(aggregated_cv_results_df.head(5))  # Print the first 5 rows as a sample

    aggregated_cv_filename = f"FULLDATASET_cv_scores_basic_models_{dataset_name}.csv"
    aggregated_cv_results_df.to_csv(aggregated_cv_filename, index=False)
    print(f"Aggregated cross-validation results saved as {aggregated_cv_filename}")


### DRF + Levelset Models

In [8]:
timeseries = True
import scripts.config as config
import scripts.globals as run_globals

# Reload FIRST, then star-import: the star import copies the module's current
# values into the notebook namespace, so doing it before the reload (as this cell
# did previously) left the copied names stale after a config edit.
# NOTE(review): the star import may overwrite notebook variables that share a name
# with a config attribute (e.g. dataset_name) - confirm this is intended.
importlib.reload(config)
from scripts.config import *

# Column layout shared by every result table built in this cell.
result_columns = ['Variable', 'cu', 'co', 'Model', 'Pinball Loss', 'Best Params', 'delta C', 'sl']

combinations = [(9, 1), (7.5, 2.5), (5, 5), (2.5, 7.5), (1, 9)]
table_rows = []
random_state = 1

# Reset the cross-validation accumulators IN PLACE. They live in scripts.globals
# and are presumably appended to by the evaluation helpers (confirm in
# scripts/train_and_evaluate_alldata.py). Rebinding notebook names to new lists did
# not empty them, so entries from earlier runs in the same kernel ended up in the
# CV files written at the end of this cell.
run_globals.drf_cv_results.clear()
run_globals.global_fold_scores.clear()

# Iterate over combinations and process them directly
for cu, co in combinations:
    print(f"Processing cu, co combination: cu={cu}, co={co}")
    tau = cu / (cu + co)
    with threadpool_limits(limits=1):
        for column in y_train.columns:
            print(f"Processing column: {column}")

            X_train_scaled, X_test_scaled, y_train_col, y_test_col, X_train_scaled_withID, X_test_scaled_withID = preprocess_per_instance_alldata(
                column, X_train_features, X_test_features, y_train, y_test
            )

            create_cv_folds_alldata(X_train_scaled_withID)

            # SAA baseline
            saa_model = SampleAverageApproximationNewsvendor(cu, co)
            saa_pred = saa_model.fit(y_train_col).predict(X_test_scaled.shape[0])

            # Ensure id_for_CV, y_true, and y_pred are 1-D arrays
            id_for_CV = X_test_scaled_withID['id_for_CV'].values.flatten()
            y_true = y_test_col.values.flatten()
            y_pred = saa_pred.flatten()

            # One row per test sample so the SAA loss can be computed per ID
            saa_predictions_df = pd.DataFrame({
                'id_for_CV': id_for_CV,
                'y_true': y_true,
                'y_pred': y_pred
            })

            # SAA pinball loss per id_for_CV group; the dict is handed to every
            # model evaluation below and each loss is also added to the result rows.
            saa_pinball_losses_per_id = {}
            for id_val, group in saa_predictions_df.groupby('id_for_CV'):
                pinball_loss_id = pinball_loss(group['y_true'].values, group['y_pred'].values, tau)
                saa_pinball_losses_per_id[id_val] = pinball_loss_id
                append_result(table_rows, id_val, cu, co, 'SAA', pinball_loss_id, 'N/A', np.nan, tau)

            n_features = X_train_scaled.shape[1]

            # DRF model
            drf_model = DRFWrapper(min_node_size=10, num_trees=100, num_threads=2)
            drf_grid = get_grid('DRF', n_features)

            print(f"Length of X_train_scaled: {len(X_train_scaled)}")
            print(f"Length of X_test_scaled: {len(X_test_scaled)}")
            evaluate_and_append_models_alldata([('DRF', drf_model, drf_grid)], X_train_scaled, X_test_scaled, y_train_col, y_test_col, saa_pinball_losses_per_id, tau, cu, co, column, table_rows, timeseries, X_test_scaled_withID)

            if timeseries:
                # Base estimators wrapped by the level-set models.
                # n_jobs is not defined in this notebook; presumably it comes from
                # the star import of scripts.config above - confirm.
                lgbm_model = LGBMRegressor(random_state=random_state, n_jobs=n_jobs, verbosity=-1)
                mlp_model = MLPRegressorWrapper(random_state=random_state, early_stopping=True)

                # Evaluate the LGBM-based level-set model (group-split grid)
                lgbm_model_params = get_grid('LevelSetKDEx_groupsplit', n_features)
                print(lgbm_model_params)

                lgbm_model_evaluation = [
                    ('LS_KDEx_LGBM', LevelSetKDEx(estimator=lgbm_model, binSize=100, weightsByDistance=False), lgbm_model_params)
                ]
                evaluate_and_append_models_alldata(lgbm_model_evaluation, X_train_scaled, X_test_scaled,
                                   y_train_col, y_test_col, saa_pinball_losses_per_id,
                                   tau, cu, co, column, table_rows, timeseries, X_test_scaled_withID)

                # Evaluate the MLP-based level-set model (group-split grid)
                mlp_model_params = get_grid('LevelSetKDEx_groupsplit', n_features)
                mlp_model_evaluation = [
                    ('LS_KDEx_MLP', LevelSetKDEx(estimator=mlp_model, binSize=100, weightsByDistance=False), mlp_model_params)
                ]
                evaluate_and_append_models_alldata(mlp_model_evaluation, X_train_scaled, X_test_scaled,
                                   y_train_col, y_test_col, saa_pinball_losses_per_id,
                                   tau, cu, co, column, table_rows, timeseries, X_test_scaled_withID)
            else:
                # Placeholders resulting from previous changes - can be ignored.
                # Separate CV for non time series (Wage Dataset) moved to the create_cv_folds function.
                print("set timeseries varible to True")

            # Show the last 5 rows of the table after each column is processed
            second_result_table = pd.DataFrame(table_rows, columns=result_columns)
            print(second_result_table.tail(5))


# Final result table after processing all combinations
second_result_table = pd.DataFrame(table_rows, columns=result_columns)


# Save the result table. The copy in the working directory is kept because the
# cell wrote it before; the copy under "results" is the one reported below.
filename = f"FULLDATASET_results_LevelsetModels_{dataset_name}.csv"
second_result_table.to_csv(filename, index=False)

# Create the results folder if needed: this cell previously relied on another
# cell having created it and failed otherwise.
results_folder = "results"
os.makedirs(results_folder, exist_ok=True)

# Re-bind the notebook names to the lists held by scripts.globals
from scripts.globals import global_fold_scores, global_cv_results, drf_cv_results

filename = os.path.join(results_folder, f"FULLDATASET_results_LevelsetModels_{dataset_name}.csv")
second_result_table.to_csv(filename, index=False)

print(f"Results saved as {filename}")

# Aggregate fold-wise cross-validation results of the level-set models
if global_fold_scores:
    # Reset the multi-index so that binSize and weightsByDistance become normal columns
    global_fold_scores_flat = [fold_scores_df.reset_index() for fold_scores_df in global_fold_scores]

    aggregated_fold_scores_df = pd.concat(global_fold_scores_flat, ignore_index=True)

    aggregated_fold_scores_filename = os.path.join(results_folder, f"FULLDATASET_cv_scores_levelset_models_{dataset_name}.csv")
    aggregated_fold_scores_df.to_csv(aggregated_fold_scores_filename, index=False)

# DRF cross-validation results are saved independently of the level-set results.
# The guard avoids pd.concat raising ValueError on an empty list.
# NOTE(review): this file name has no FULLDATASET_ prefix, so it is the same path
# the single-ID DRF cell writes to and one run overwrites the other - confirm
# whether that is intended before renaming.
if drf_cv_results:
    aggregated_drf_cv_results_df = pd.concat(drf_cv_results, ignore_index=True)

    aggregated_cv_filename = os.path.join(results_folder, f"cv_drf_scores_{dataset_name}.csv")
    aggregated_drf_cv_results_df.to_csv(aggregated_cv_filename, index=False)


Processing cu, co combination: cu=9, co=1
Processing column: dummyID
Test length for column: 39 (6% of 661 Datapoints per Group)
Length of X_train_scaled: 4627
Length of X_test_scaled: 532
Evaluating model: DRF, cu: 9, co: 1
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[0.9]
[0.9]
[0.9]
[0.9]
[0.9]
[0.9]
[0.9]
[0.9]
[0.9]
[0.9]
Bayesian search failed: Error: Prediction for sample 780 did not have the expected length.



RRuntimeError: Error: Prediction for sample 780 did not have the expected length.
