In [None]:
# Print a fixed marker string.
print("main")

## Single ID Case

### load scripts

In [None]:
from scripts.shared_imports import *

# Verify that the current working directory has changed
print("Current working directory:", os.getcwd())

# Importiere alle Module
from scripts.get_data import get_dataset_settings_singleID, preprocess_data_singleID
from Wrapper.wrapper import DRFWrapper, MLPRegressorWrapper
from scripts.utils import *
from scripts.train_and_evaluate_singleID import (  
    append_result, evaluate_and_append_models_singleID, create_cv_folds_singleID,preprocess_per_instance_singleID
)
from scripts.process_target import process_target_singleID

import scripts.config as config
from scripts.globals import global_fold_scores, global_cv_results, drf_cv_results

# Reload the config module (only `config` is reloaded here, presumably so that
# edits to config.py are picked up without restarting the kernel).
importlib.reload(config)

# Cap the numerical backends at ONE thread each:
# OpenMP, OpenBLAS, Intel MKL (if used), NumExpr and the macOS Accelerate framework.
# NOTE(review): these are set after the imports above; libraries that were already
# loaded may have read their thread settings earlier - confirm this has an effect.
os.environ.update(
    dict.fromkeys(
        (
            'OMP_NUM_THREADS',
            'OPENBLAS_NUM_THREADS',
            'MKL_NUM_THREADS',
            'NUMEXPR_NUM_THREADS',
            'VECLIB_MAXIMUM_THREADS',
        ),
        '1',
    )
)


### get data

Set the configuration in the `config.py` file before starting the process:

--> set `dataset_name` in the config file beforehand

--> set `levelset_calcuations` to False when running the basic-model calculations

In [None]:
# Download the CSV of the dataset selected in config.py, load it, and split it
# into train/test features and targets for the single-ID experiments.
dataset_name = config.dataset_name

# Google Drive file id of the CSV for each supported dataset name
drive_file_ids = {
    'bakery': '1r_bDn9Z3Q_XgeTTkJL7352nUG3jkUM0z',
    'yaz': '1xrY3Uv5F9F9ofgSM7dVoSK4bE0gPMg36',
    'm5': '1tCBaxOgE5HHllvLVeRC18zvALBz6B-6w',
    'sid': '1J9bPCfeLDH-mbSnvTHRoCva7pl6cXD3_',
    'air': '1SKPpNxulcusNTjRwCC0p3C_XW7aNBNJZ',
    "copula": '1H5wdJgmxdhbzeS17w0NkRlHRCESEAd-e',
    'wage': '1bn7E7NOoRzE4NwXXs1MYhRSKZHC13qYU',
}
if dataset_name not in drive_file_ids:
    # Still a KeyError, but one that says what went wrong and what is allowed
    raise KeyError(
        f"Unknown dataset_name {dataset_name!r} in config.py; "
        f"expected one of {sorted(drive_file_ids)}"
    )
file_id = drive_file_ids[dataset_name]

url = f"https://drive.google.com/uc?id={file_id}"

# Download the file into the working directory
output = f"{dataset_name}.csv"
downloaded_path = gdown.download(url, output, quiet=False)
if downloaded_path is None:
    # Without this check a failed download would either crash in read_csv with an
    # unrelated-looking error or silently read a stale file from an earlier run.
    raise RuntimeError(f"Download of dataset {dataset_name!r} from {url} failed")
data = pd.read_csv(output)

# Build the dataset settings from the loaded data and pick the entry for this dataset
settings = get_dataset_settings_singleID(data)[dataset_name]

y, train_data, test_data, X_train_features, X_test_features, y_train, y_test = preprocess_data_singleID(
    data, settings['backscaling_columns'], settings['bool_columns'], settings['drop_columns'])

# Quick look at the prepared training data
display(X_train_features.head(30))
display(y_train.head(3))
print(f"Anzahl der Zeilen: {len(y_train)}")
print("Anzahl der targets:", len(y_train.columns))




### process - DDOP Models + Levelset Estimator Models

In [None]:
# Reload the config module before the run (presumably to pick up edits made to config.py).
importlib.reload(config)

In [None]:

# Run process_target_singleID for every (cu, co) combination and every target
# column, collect the result rows, and write the result table plus the
# cross-validation scores gathered in scripts.globals to the "results" folder.
import scripts.globals as run_globals  # alias that does not shadow the builtin globals()

# Column layout used whenever table_rows is turned into a DataFrame
RESULT_COLUMNS = ['Variable', 'cu', 'co', 'Model', 'Pinball Loss', 'Best Params', 'delta C', 'sl']

# (cu, co) pairs to evaluate; tau is derived from them as cu / (cu + co)
combinations = [(9, 1), (7.5, 2.5), (5, 5), (2.5, 7.5), (1, 9)]
table_rows = []
random_state = 42

# Kept from the original cell; nothing in this cell reads it.
cvFolds = None

for cu, co in combinations:
    print(f"Processing cu, co combination: cu={cu}, co={co}")
    tau = cu / (cu + co)

    # n_jobs=1: the target columns are processed one after another
    column_results = Parallel(n_jobs=1)(
        delayed(process_target_singleID)(column, cu, co, tau, y_train, X_train_features, X_test_features, y_test, random_state)
        for column in y_train.columns
    )

    for result in column_results:
        table_rows.extend(result)
        # Show only the rows that were just added; re-printing the whole
        # accumulated list after every column floods the cell output.
        print(pd.DataFrame(result, columns=RESULT_COLUMNS))

# Final result table after processing all combinations
result_table = pd.DataFrame(table_rows, columns=RESULT_COLUMNS)

# Folder that receives all result files; created on demand
results_folder = "results"
os.makedirs(results_folder, exist_ok=True)

# Save the result table as "results_basic_Models_{dataset_name}.csv"
filename = os.path.join(results_folder, f"results_basic_Models_{dataset_name}.csv")
result_table.to_csv(filename, index=False)

# Aggregate and save the cross-validation results collected during the run
if run_globals.global_cv_results:
    aggregated_cv_results_df = pd.concat(run_globals.global_cv_results, ignore_index=True)

    aggregated_cv_filename = os.path.join(results_folder, f"cv_scores_basic_models_{dataset_name}.csv")
    aggregated_cv_results_df.to_csv(aggregated_cv_filename, index=False)

In [None]:
# Box plot plus mean/SD point plot of 'delta C' per model for one service level.
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Models left out of the plot, and the service level ('sl') that is plotted
excluded_models = ['SAA', 'LR', 'MLP', 'LGBM']
target_sl = 0.9

# np.isclose instead of == so that floating-point rounding in 'sl' cannot drop rows
is_plotted_model = ~result_table['Model'].isin(excluded_models)
is_target_sl = np.isclose(result_table['sl'].astype(float), target_sl)
filtered_table = result_table[is_plotted_model & is_target_sl]

if filtered_table.empty:
    # Nothing to draw; say why instead of failing inside seaborn or showing an empty figure
    print(f"No rows to plot for sl={target_sl}; available sl values: {sorted(result_table['sl'].unique())}")
else:
    plt.figure(figsize=(10, 6))

    # Distribution of delta C per model (outliers hidden)
    sns.boxplot(x='Model', y='delta C', data=filtered_table, showfliers=False, width=0.5, color='lightblue')

    # Mean with standard-deviation bars, without connecting lines between models
    sns.pointplot(x="Model", y="delta C", data=filtered_table, ci='sd', color='blue', markers="o", scale=0.7, linestyles="")

    # The title names exactly what is drawn and which models are excluded
    plt.title(
        f'Boxplot and Pointplot (CI=SD) of Delta C for Each Model (Excluding {", ".join(excluded_models)})',
        fontsize=14,
    )
    plt.xlabel('Model', fontsize=12)
    plt.ylabel('Delta C', fontsize=12)

    plt.tight_layout()
    plt.show()

### process - DRF + Levelset Models

In [None]:
# Evaluate SAA, the LevelSetKDEx models (LGBM / MLP based) and DRF for every
# target column, then save the result table and the collected cross-validation
# scores into the "results" folder.
import scripts.globals as run_globals  # alias that does not shadow the builtin globals()

importlib.reload(config)

timeseries = True

# Column layout used whenever table_rows is turned into a DataFrame
RESULT_COLUMNS = ['Variable', 'cu', 'co', 'Model', 'Pinball Loss', 'Best Params', 'delta C', 'sl']

# Execution starts here
combinations = [(1, 9)]
table_rows = []
random_state = 1

# Reset the collectors on the module itself. Rebinding notebook-level names to []
# would leave the lists in scripts.globals untouched, and those are presumably the
# ones the evaluation helpers append to (TODO confirm against scripts.globals).
run_globals.drf_cv_results.clear()
run_globals.global_fold_scores.clear()

for cu, co in combinations:
    print(f"Processing cu, co combination: cu={cu}, co={co}")
    tau = cu / (cu + co)
    with threadpool_limits(limits=1):
        for column in y_train.columns:
            print(f"Processing column: {column}")

            # Per-target preprocessing, then set up the CV folds from the frame that still carries the ID
            X_train_scaled, X_test_scaled, y_train_col, y_test_col, X_train_scaled_withID = preprocess_per_instance_singleID(
                column, X_train_features, X_test_features, y_train, y_test
            )
            create_cv_folds_singleID(X_train_scaled_withID)

            # SAA baseline; its pinball loss is handed to the model evaluations below
            saa_model = SampleAverageApproximationNewsvendor(cu, co)
            saa_pred = saa_model.fit(y_train_col).predict(X_test_scaled.shape[0])
            saa_pinball_loss = pinball_loss(y_test_col.values.flatten(), saa_pred, tau)
            append_result(table_rows, column, cu, co, 'SAA', saa_pinball_loss, 'N/A', np.nan, tau)

            if timeseries:
                # Base estimators for the two level-set models
                lgbm_model = LGBMRegressor(random_state=random_state, n_jobs=1, verbosity=-1)
                mlp_model = MLPRegressorWrapper(random_state=random_state, early_stopping=True)

                # LevelSetKDEx on top of LGBM, tuned on the group-split grid
                lgbm_model_params = get_grid('LevelSetKDEx_groupsplit', X_train_scaled.shape[1])
                lgbm_model_evaluation = [
                    ('LS_KDEx_LGBM', LevelSetKDEx(estimator=lgbm_model, binSize=100, weightsByDistance=False), lgbm_model_params)
                ]
                evaluate_and_append_models_singleID(lgbm_model_evaluation, X_train_scaled, X_test_scaled, y_train_col, y_test_col, saa_pinball_loss, tau, cu, co, column, table_rows, timeseries)

                # LevelSetKDEx on top of the MLP, tuned on the same grid
                mlp_model_params = get_grid('LevelSetKDEx_groupsplit', X_train_scaled.shape[1])
                mlp_model_evaluation = [
                    ('LS_KDEx_MLP', LevelSetKDEx(estimator=mlp_model, binSize=100, weightsByDistance=False), mlp_model_params)
                ]
                evaluate_and_append_models_singleID(mlp_model_evaluation, X_train_scaled, X_test_scaled, y_train_col, y_test_col, saa_pinball_loss, tau, cu, co, column, table_rows, timeseries)
            else:
                # Although the Wage data set is not a timeseries, it works similarly well on group timeseries splits.
                # To do this, we set shuffle = True beforehand in the preprocessing for train/test split.
                # Resulting in a mixed order of the train/test points, even if we order it by the "Dayindex" later in the CV splits.
                # This saves extra work and lets all datasets share the same split logic.
                print("Only time series Data")

            # The DRF model is always evaluated
            drf_model = DRFWrapper(min_node_size=10, num_trees=100, num_threads=12)
            drf_grid = get_grid('DRF', X_train_scaled.shape[1])
            evaluate_and_append_models_singleID([('DRF', drf_model, drf_grid)], X_train_scaled, X_test_scaled, y_train_col, y_test_col, saa_pinball_loss, tau, cu, co, column, table_rows, timeseries)

            # Progress output: last 5 rows of the table after each column
            second_result_table = pd.DataFrame(table_rows, columns=RESULT_COLUMNS)
            print(second_result_table.tail(5))

# Folder that receives all result files; created on demand
results_folder = "results"
os.makedirs(results_folder, exist_ok=True)

# Final result table after processing all combinations
second_result_table = pd.DataFrame(table_rows, columns=RESULT_COLUMNS)

filename = os.path.join(results_folder, f"results_LevelsetModels_{dataset_name}.csv")
second_result_table.to_csv(filename, index=False)

print(f"Results saved as {filename}")

# Read the collectors from the module so the notebook names refer to the lists
# that were filled during the run, not to empty local lists.
global_fold_scores = run_globals.global_fold_scores
drf_cv_results = run_globals.drf_cv_results

# Fold-wise cross-validation scores of the level-set models
if global_fold_scores:
    # reset_index turns the index levels (binSize, weightsByDistance) into normal columns
    global_fold_scores_flat = [fold_scores_df.reset_index() for fold_scores_df in global_fold_scores]
    aggregated_fold_scores_df = pd.concat(global_fold_scores_flat, ignore_index=True)

    aggregated_fold_scores_filename = os.path.join(results_folder, f"cv_scores_levelset_models_{dataset_name}.csv")
    aggregated_fold_scores_df.to_csv(aggregated_fold_scores_filename, index=False)

# DRF cross-validation scores; guarded separately because pd.concat raises on an empty list
if drf_cv_results:
    aggregated_drf_cv_results_df = pd.concat(drf_cv_results, ignore_index=True)

    aggregated_cv_filename = os.path.join(results_folder, f"cv_drf_scores_{dataset_name}.csv")
    aggregated_drf_cv_results_df.to_csv(aggregated_cv_filename, index=False)


In [None]:
# Box plot plus mean/SD point plot of 'delta C' per model for one service level,
# based on the level-set / DRF result table.
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Models left out of the plot, and the service level ('sl') that is plotted
excluded_models = ['SAA', 'LR', 'MLP', 'LGBM']
target_sl = 0.9

# np.isclose instead of == so that floating-point rounding in 'sl' cannot drop rows
is_plotted_model = ~second_result_table['Model'].isin(excluded_models)
is_target_sl = np.isclose(second_result_table['sl'].astype(float), target_sl)
filtered_table = second_result_table[is_plotted_model & is_target_sl]

if filtered_table.empty:
    # Happens e.g. when the run above did not include a (cu, co) pair with cu / (cu + co) == target_sl
    print(f"No rows to plot for sl={target_sl}; available sl values: {sorted(second_result_table['sl'].unique())}")
else:
    plt.figure(figsize=(10, 6))

    # Distribution of delta C per model (outliers hidden)
    sns.boxplot(x='Model', y='delta C', data=filtered_table, showfliers=False, width=0.5, color='lightblue')

    # Mean with standard-deviation bars, without connecting lines between models
    sns.pointplot(x="Model", y="delta C", data=filtered_table, ci='sd', color='blue', markers="o", scale=0.7, linestyles="")

    # The title names exactly which models are excluded
    plt.title(
        f'Boxplot and Pointplot (CI=SD) of Delta C for Each Model (Excluding {", ".join(excluded_models)})',
        fontsize=14,
    )
    plt.xlabel('Model', fontsize=12)
    plt.ylabel('Delta C', fontsize=12)

    plt.tight_layout()
    plt.show()


## FULL Dataset Cases


In [1]:
from scripts.shared_imports import *

# Verify that the current working directory has changed
print("Current working directory:", os.getcwd())

# Importiere alle Module
from scripts.get_data import get_dataset_settings_alldata, preprocess_data_alldata
from Wrapper.wrapper import DRFWrapper, MLPRegressorWrapper
from scripts.utils import *
from scripts.train_and_evaluate_alldata import (  
    evaluate_and_append_models_alldata, create_cv_folds_alldata, preprocess_per_instance_alldata
)
from scripts.process_target import process_target_alldata

import scripts.config as config
from scripts.globals import global_fold_scores, global_cv_results, drf_cv_results

# Reload the config module (only `config` is reloaded here, presumably so that
# edits to config.py are picked up without restarting the kernel).
importlib.reload(config)

# Cap the numerical backends at ONE thread each:
# OpenMP, OpenBLAS, Intel MKL (if used), NumExpr and the macOS Accelerate framework.
# NOTE(review): these are set after the imports above; libraries that were already
# loaded may have read their thread settings earlier - confirm this has an effect.
os.environ.update(
    dict.fromkeys(
        (
            'OMP_NUM_THREADS',
            'OPENBLAS_NUM_THREADS',
            'MKL_NUM_THREADS',
            'NUMEXPR_NUM_THREADS',
            'VECLIB_MAXIMUM_THREADS',
        ),
        '1',
    )
)



2024-11-19 18:56:57.843138: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-19 18:56:57.845985: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-19 18:56:57.890051: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-19 18:56:57.893621: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Current working directory: /root/WorkingFolder


In [4]:
# Download the CSV of the dataset selected in config.py, load it, and split it
# into train/test features and targets for the full-dataset experiments.
dataset_name = config.dataset_name

# Google Drive file id of the CSV for each supported dataset name
drive_file_ids = {
    'bakery': '1r_bDn9Z3Q_XgeTTkJL7352nUG3jkUM0z',
    'yaz': '1xrY3Uv5F9F9ofgSM7dVoSK4bE0gPMg36',
    'm5': '1tCBaxOgE5HHllvLVeRC18zvALBz6B-6w',
    'sid': '1J9bPCfeLDH-mbSnvTHRoCva7pl6cXD3_',
    'air': '1DMOaV92n3BFEGeCubaxEys2eLzg2Cic3',
    "copula": '1H5wdJgmxdhbzeS17w0NkRlHRCESEAd-e',
    'wage': '1bn7E7NOoRzE4NwXXs1MYhRSKZHC13qYU',
}
if dataset_name not in drive_file_ids:
    # Still a KeyError, but one that says what went wrong and what is allowed
    raise KeyError(
        f"Unknown dataset_name {dataset_name!r} in config.py; "
        f"expected one of {sorted(drive_file_ids)}"
    )
file_id = drive_file_ids[dataset_name]

url = f"https://drive.google.com/uc?id={file_id}"

# Download the file into the working directory
output = f"{dataset_name}.csv"
downloaded_path = gdown.download(url, output, quiet=False)
if downloaded_path is None:
    # Without this check a failed download would either crash in read_csv with an
    # unrelated-looking error or silently read a stale file from an earlier run.
    raise RuntimeError(f"Download of dataset {dataset_name!r} from {url} failed")
data = pd.read_csv(output)

# Build the dataset settings from the loaded data and pick the entry for this dataset
settings = get_dataset_settings_alldata(data)[dataset_name]
# Requested number of IDs to sample; the captured output shows it is reduced
# when fewer unique IDs are available.
sample_size = 50

# Note: `data` and `dataset_name` are rebound to the values returned by the preprocessing
y, train_data, test_data, X_train_features, X_test_features, y_train, y_test, data, dataset_name  = preprocess_data_alldata(
    data=data,
    dataset_name=dataset_name,
    bool_columns=settings['bool_columns'],
    drop_columns=settings['drop_columns'],
    sample_size=sample_size
)

# Quick look at the prepared targets
display(y_train.head(3))
print(f"Anzahl der Zeilen: {len(y_train)}")
print("Anzahl der targets:", len(y_train.columns))

Downloading...
From: https://drive.google.com/uc?id=1bn7E7NOoRzE4NwXXs1MYhRSKZHC13qYU
To: /root/WorkingFolder/wage.csv
100%|██████████| 3.25M/3.25M [00:00<00:00, 29.5MB/s]


Warnung: sample_size (50) ist größer als die Anzahl der verfügbaren eindeutigen IDs (6). Die Stichprobengröße wird auf 6 reduziert.


Unnamed: 0,demand,dayIndex,label,age,citizenship,family_size,children,english_level,race_AIAN,race_asian,race_black,race_mix,race_NHOPI,race_other,race_white,hispanic_origin_no,hispanic_origin_yes,nativity_foreign-born,nativity_native,marital_divorced,marital_married,marital_never married,marital_separated,marital_widowed,employer_for-profit company,employer_government,employer_non-profit company,employer_self-employed,economic_region_Abroad,economic_region_Far West,economic_region_Great Lakes,economic_region_Mideast,economic_region_New England,economic_region_Plains,economic_region_Rocky Mountain,economic_region_Southeast,economic_region_Southwest,occupation_11,occupation_13,occupation_15,occupation_17,occupation_19,occupation_21,occupation_23,occupation_25,occupation_27,occupation_29,occupation_31,occupation_33,occupation_35,occupation_37,occupation_39,occupation_41,occupation_43,occupation_45,occupation_47,occupation_49,occupation_51,occupation_53,occupation_55,industry_11,industry_21,industry_22,industry_23,industry_31,industry_41,industry_44,industry_48,industry_51,industry_52,industry_53,industry_54,industry_55,industry_56,industry_61,industry_62,industry_71,industry_72,industry_81,industry_91,male,id_for_CV,dummyID
0,1.406497,1,train,17,1,2,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,16,dummyID
1,2.322788,2,train,18,1,4,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,16,dummyID
2,2.035106,3,train,18,1,6,2,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,16,dummyID
3,2.436116,4,train,18,1,4,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,16,dummyID
4,2.035106,5,train,18,1,2,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,16,dummyID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17363,2.322788,2133,train,77,1,1,0,1,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,22,dummyID
17364,3.837915,2134,train,77,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,22,dummyID
17366,3.781403,2136,train,78,1,2,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,dummyID
17368,3.239079,2138,train,80,1,2,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,22,dummyID


Unnamed: 0_level_0,dummyID
id_for_CV,Unnamed: 1_level_1
16,1.406497
16,2.322788
16,2.035106


Anzahl der Zeilen: 13894
Anzahl der targets: 1


### DDOP Models

In [None]:

# Run process_target_alldata for every (cu, co) combination and every target
# column, collect the result rows, and write the result table plus the
# cross-validation scores gathered in scripts.globals to CSV files.
import scripts.globals as run_globals  # alias that does not shadow the builtin globals()

# Column layout used whenever table_rows is turned into a DataFrame
RESULT_COLUMNS = ['Variable', 'cu', 'co', 'Model', 'Pinball Loss', 'Best Params', 'delta C', 'sl']

# (cu, co) pairs to evaluate; tau is derived from them as cu / (cu + co)
combinations = [(9, 1), (7.5, 2.5), (5, 5), (2.5, 7.5), (1, 9)]
table_rows = []
random_state = 1

# Reset the collector on the module itself. Rebinding a notebook-level name to []
# would leave the list in scripts.globals untouched, and that is presumably the one
# the processing helpers append to (TODO confirm against scripts.globals).
run_globals.global_cv_results.clear()

# Kept from the original cell; nothing in this cell reads it.
cvFolds = None

for cu, co in combinations:
    print(f"Processing cu, co combination: cu={cu}, co={co}")
    tau = cu / (cu + co)

    # n_jobs=1: the target columns are processed one after another
    column_results = Parallel(n_jobs=1)(
        delayed(process_target_alldata)(column, cu, co, tau, y_train, X_train_features, X_test_features, y_test, random_state)
        for column in y_train.columns
    )

    for result in column_results:
        table_rows.extend(result)
        # Show only the rows that were just added; re-printing the whole
        # accumulated list after every column floods the cell output.
        print(pd.DataFrame(result, columns=RESULT_COLUMNS))

# Final result table after processing all combinations
result_table = pd.DataFrame(table_rows, columns=RESULT_COLUMNS)

# Save as "FULLDATASET_results_basic_Models_{dataset_name}.csv" in the working directory
filename = f"FULLDATASET_results_basic_Models_{dataset_name}.csv"
result_table.to_csv(filename, index=False)

print(f"Results saved as {filename}")

# Read the collector from the module so the export below sees what the run produced
global_cv_results = run_globals.global_cv_results

# Aggregate and save the cross-validation results collected during the run
if global_cv_results:
    aggregated_cv_results_df = pd.concat(global_cv_results, ignore_index=True)

    # Small sample to eyeball that the aggregated data looks right
    print("Aggregated cross-validation results sample:")
    print(aggregated_cv_results_df.head(5))

    aggregated_cv_filename = f"FULLDATASET_cv_scores_basic_models_{dataset_name}.csv"
    aggregated_cv_results_df.to_csv(aggregated_cv_filename, index=False)
    print(f"Aggregated cross-validation results saved as {aggregated_cv_filename}")


### DRF + Levelset Models

In [4]:
# Evaluate SAA (per id_for_CV group) and the LevelSetKDEx models (LGBM / MLP based)
# on the full dataset, then save the result table and the collected
# cross-validation scores.
import scripts.config as config
import scripts.globals as run_globals  # alias that does not shadow the builtin globals()

timeseries = True

# Reload BEFORE the star import so that n_jobs (and every other name pulled in
# from config) reflects the current config.py rather than the pre-reload values.
importlib.reload(config)
from scripts.config import *

print(n_jobs)

# Column layout used whenever table_rows is turned into a DataFrame
RESULT_COLUMNS = ['Variable', 'cu', 'co', 'Model', 'Pinball Loss', 'Best Params', 'delta C', 'sl']

# Execution starts here
combinations = [(9, 1), (7.5, 2.5), (5, 5), (2.5, 7.5), (1, 9)]

table_rows = []
random_state = 1

# Reset the collectors on the module itself. Rebinding notebook-level names to []
# would leave the lists in scripts.globals untouched, and those are presumably the
# ones the evaluation helpers append to (TODO confirm against scripts.globals).
run_globals.drf_cv_results.clear()
run_globals.global_fold_scores.clear()

for cu, co in combinations:
    print(f"Processing cu, co combination: cu={cu}, co={co}")
    tau = cu / (cu + co)
    with threadpool_limits(limits=1):
        for column in y_train.columns:
            print(f"Processing column: {column}")

            # Per-target preprocessing, then set up the CV folds from the frame that still carries the ID
            X_train_scaled, X_test_scaled, y_train_col, y_test_col, X_train_scaled_withID, X_test_scaled_withID = preprocess_per_instance_alldata(
                column, X_train_features, X_test_features, y_train, y_test
            )

            create_cv_folds_alldata(X_train_scaled_withID)

            # SAA baseline
            saa_model = SampleAverageApproximationNewsvendor(cu, co)
            saa_pred = saa_model.fit(y_train_col).predict(X_test_scaled.shape[0])

            # Flatten everything to 1-D so the three columns line up in one DataFrame
            id_for_CV = X_test_scaled_withID['id_for_CV'].values.flatten()
            y_true = y_test_col.values.flatten()
            y_pred = saa_pred.flatten()

            saa_predictions_df = pd.DataFrame({
                'id_for_CV': id_for_CV,
                'y_true': y_true,
                'y_pred': y_pred
            })

            # One SAA pinball loss per id; the dict is handed to the model evaluations below
            saa_pinball_losses_per_id = {}
            grouped_saa = saa_predictions_df.groupby('id_for_CV')
            for id_val, group in grouped_saa:
                y_true_id = group['y_true'].values
                y_pred_id = group['y_pred'].values
                pinball_loss_id = pinball_loss(y_true_id, y_pred_id, tau)
                saa_pinball_losses_per_id[id_val] = pinball_loss_id
                append_result(table_rows, id_val, cu, co, 'SAA', pinball_loss_id, 'N/A', np.nan, tau)

            # DRF model and grid are still built, but the DRF evaluation itself is disabled (see the commented call).
            drf_model = DRFWrapper(min_node_size=10, num_trees=100, num_threads=2)
            drf_grid = get_grid('DRF', X_train_scaled.shape[1])

            print(f"Length of X_train_scaled: {len(X_train_scaled)}")
            print(f"Length of X_test_scaled: {len(X_test_scaled)}")
            #evaluate_and_append_models([('DRF', drf_model, drf_grid)], X_train_scaled, X_test_scaled, y_train_col, y_test_col, saa_pinball_losses_per_id, tau, cu, co, column, table_rows, timeseries, X_test_scaled_withID)

            if timeseries:
                # Base estimators for the two level-set models
                lgbm_model = LGBMRegressor(random_state=random_state, n_jobs=n_jobs, verbosity=-1)
                mlp_model = MLPRegressorWrapper(random_state=random_state, early_stopping=True)

                # LevelSetKDEx on top of LGBM, tuned on the group-split grid
                lgbm_model_params = get_grid('LevelSetKDEx_groupsplit', X_train_scaled.shape[1])
                print(lgbm_model_params)

                lgbm_model_evaluation = [
                    ('LS_KDEx_LGBM', LevelSetKDEx(estimator=lgbm_model, binSize=100, weightsByDistance=False), lgbm_model_params)
                ]
                evaluate_and_append_models_alldata(lgbm_model_evaluation, X_train_scaled, X_test_scaled,
                                   y_train_col, y_test_col, saa_pinball_losses_per_id,
                                   tau, cu, co, column, table_rows, timeseries, X_test_scaled_withID)

                # LevelSetKDEx on top of the MLP, tuned on the same grid
                mlp_model_params = get_grid('LevelSetKDEx_groupsplit', X_train_scaled.shape[1])
                mlp_model_evaluation = [
                    ('LS_KDEx_MLP', LevelSetKDEx(estimator=mlp_model, binSize=100, weightsByDistance=False), mlp_model_params)
                ]
                evaluate_and_append_models_alldata(mlp_model_evaluation, X_train_scaled, X_test_scaled,
                                   y_train_col, y_test_col, saa_pinball_losses_per_id,
                                   tau, cu, co, column, table_rows, timeseries, X_test_scaled_withID)
            else:
                # Although the Wage data set is not a timeseries, it works similarly well on group timeseries splits.
                # To do this, we set shuffle = True once in the train/test split.
                # Resulting in a mixed order of the train/test points, even if we order it by the Index.
                print("Only time series Data")

            # Progress output: last 5 rows of the table after each column
            second_result_table = pd.DataFrame(table_rows, columns=RESULT_COLUMNS)
            print(second_result_table.tail(5))

# Final result table after processing all combinations
second_result_table = pd.DataFrame(table_rows, columns=RESULT_COLUMNS)

# Copy in the working directory, written as before
filename = f"results_LevelsetModels_{dataset_name}.csv"
second_result_table.to_csv(filename, index=False)

# Folder that receives the remaining result files; created on demand so the
# writes below cannot fail on a missing directory
results_folder = "results"
os.makedirs(results_folder, exist_ok=True)

# Read the collectors from the module so the notebook names refer to the lists
# that were filled during the run.
global_fold_scores = run_globals.global_fold_scores
global_cv_results = run_globals.global_cv_results
drf_cv_results = run_globals.drf_cv_results

filename = os.path.join(results_folder, f"results_LevelsetModels_{dataset_name}.csv")
second_result_table.to_csv(filename, index=False)

print(f"Results saved as {filename}")

# Fold-wise cross-validation scores of the level-set models
if global_fold_scores:
    # reset_index turns the index levels (binSize, weightsByDistance) into normal columns
    global_fold_scores_flat = [fold_scores_df.reset_index() for fold_scores_df in global_fold_scores]
    aggregated_fold_scores_df = pd.concat(global_fold_scores_flat, ignore_index=True)

    aggregated_fold_scores_filename = os.path.join(results_folder, f"cv_scores_levelset_models_{dataset_name}.csv")
    aggregated_fold_scores_df.to_csv(aggregated_fold_scores_filename, index=False)

# DRF cross-validation scores; guarded separately because pd.concat raises on an
# empty list, which is the case while the DRF evaluation above is disabled
if drf_cv_results:
    aggregated_drf_cv_results_df = pd.concat(drf_cv_results, ignore_index=True)

    aggregated_cv_filename = os.path.join(results_folder, f"cv_drf_scores_{dataset_name}.csv")
    aggregated_drf_cv_results_df.to_csv(aggregated_cv_filename, index=False)


1
Processing cu, co combination: cu=9, co=1
Processing column: dummyID
Wage Dataset, no time series split using basic KFold Cross Validation
[(array([ 2779,  2780,  2781, ..., 13891, 13892, 13893]), array([   0,    1,    2, ..., 2776, 2777, 2778])), (array([    0,     1,     2, ..., 13891, 13892, 13893]), array([2779, 2780, 2781, ..., 5555, 5556, 5557])), (array([    0,     1,     2, ..., 13891, 13892, 13893]), array([5558, 5559, 5560, ..., 8334, 8335, 8336])), (array([    0,     1,     2, ..., 13891, 13892, 13893]), array([ 8337,  8338,  8339, ..., 11113, 11114, 11115])), (array([    0,     1,     2, ..., 11113, 11114, 11115]), array([11116, 11117, 11118, ..., 13891, 13892, 13893]))]
Length of X_train_scaled: 13894
Length of X_test_scaled: 3476
{'binSize': [20, 100, 400], 'weightsByDistance': [True, False]}
Evaluating model: LS_KDEx_LGBM, cu: 9, co: 1
OrderedDict([('learning_rate', 0.1), ('max_depth', 5), ('min_data_in_leaf', 100), ('n_estimators', 100), ('num_leaves', 127)])
Evaluati

KeyboardInterrupt: 

In [None]:
# Keep only the rows of the result table that belong to the "SAA" model
saa_results = result_table.loc[result_table['Model'].eq('SAA')]

# Print the first 20 rows of the filtered table
print(saa_results.iloc[:20])