## Single ID Case

### load scripts

In [None]:
import os

# Cap the native thread pools (OpenMP, OpenBLAS, MKL, NumExpr, Accelerate) at
# one thread. These runtimes read the variables when they are first loaded,
# so they are exported before scripts.shared_imports runs.
# NOTE(review): assumes shared_imports is what first imports the numerical
# stack (np / pd are used below without an explicit import) - confirm.
os.environ.update({
    'OMP_NUM_THREADS': '1',
    'OPENBLAS_NUM_THREADS': '1',
    'MKL_NUM_THREADS': '1',
    'NUMEXPR_NUM_THREADS': '1',
    'VECLIB_MAXIMUM_THREADS': '1',
})

from scripts.shared_imports import *

# Show the working directory that relative paths (downloads, results/) resolve against.
print("Current working directory:", os.getcwd())

# Import all project modules used by the single-ID workflow.
from scripts.get_data import get_dataset_settings_singleID, preprocess_data_singleID
from Wrapper.wrapper import DRFWrapper, MLPRegressorWrapper
from scripts.utils import *
from scripts.train_and_evaluate_singleID import (
    append_result,
    evaluate_and_append_models_singleID,
    create_cv_folds_singleID,
    preprocess_per_instance_singleID,
)
from scripts.process_target import process_target_singleID

import scripts.config as config
from scripts.globals import global_fold_scores, global_cv_results, drf_cv_results

# Re-execute the config module so edits to it are picked up in a running kernel.
importlib.reload(config)


### get data

In [None]:

# Re-execute the config module, then read the dataset selection from it.
importlib.reload(config)
dataset_name = config.dataset_name

# Google Drive file ID for each dataset this section can download.
drive_file_ids = {
    'bakery': '1r_bDn9Z3Q_XgeTTkJL7352nUG3jkUM0z',
    'yaz': '1xrY3Uv5F9F9ofgSM7dVoSK4bE0gPMg36',
    'm5': '1tCBaxOgE5HHllvLVeRC18zvALBz6B-6w',
    'air': '1SKPpNxulcusNTjRwCC0p3C_XW7aNBNJZ',
    'wage': '1bn7E7NOoRzE4NwXXs1MYhRSKZHC13qYU',
}
# A dataset name that is not listed above raises KeyError here.
file_id = drive_file_ids[dataset_name]

# Download the CSV next to the notebook and load it.
url = f"https://drive.google.com/uc?id={file_id}"
output = f"{dataset_name}.csv"
gdown.download(url, output, quiet=False)
data = pd.read_csv(output)

# Per-dataset column settings, keyed by dataset name.
settings = get_dataset_settings_singleID(data)[dataset_name]

# The three settings are passed positionally, in this order.
preprocess_args = tuple(
    settings[key] for key in ('backscaling_columns', 'bool_columns', 'drop_columns')
)
(
    y,
    train_data,
    test_data,
    X_train_features,
    X_test_features,
    y_train,
    y_test,
) = preprocess_data_singleID(data, *preprocess_args)

# Note:
# This is not yet the final data. It is processed further by the
# preprocess_per_instance function before the models are trained.

### process - DDOP Models + Levelset Estimator Models

Calculates the results for the DDOP models and the Levelset estimator models.

In [None]:
# Re-execute the config module so edits to it are picked up in a running kernel.
importlib.reload(config)

# Bind the shared-state module under an alias that does not shadow the
# builtin globals() function.
import scripts.globals as shared_state

# (cu, co) pairs to evaluate; tau below is derived from each pair.
combinations = [(9, 1), (7.5, 2.5), (5, 5), (2.5, 7.5), (1, 9)]
table_rows = []
random_state = 42
cvFolds = None

# Column layout shared by every result table built in this cell.
result_columns = ['Variable', 'cu', 'co', 'Model', 'Pinball Loss', 'Best Params', 'delta C', 'sl']

for cu, co in combinations:
    print(f"Processing cu, co combination: cu={cu}, co={co}")
    tau = cu / (cu + co)

    # One process_target_singleID call per target column (n_jobs=1, so the
    # calls run one after another).
    column_results = Parallel(n_jobs=1)(
        delayed(process_target_singleID)(column, cu, co, tau, y_train, X_train_features, X_test_features, y_test, random_state)
        for column in y_train.columns
    )

    # Append each column's rows and print the cumulative table as progress output.
    for result in column_results:
        table_rows.extend(result)
        print(table_rows)
        result_table = pd.DataFrame(table_rows, columns=result_columns)
        print(result_table)

# Final result table after processing all combinations.
result_table = pd.DataFrame(table_rows, columns=result_columns)

# Folder where results are saved; created if missing, with no
# check-then-create race.
results_folder = "results"
os.makedirs(results_folder, exist_ok=True)

# Save the result table.
filename = os.path.join(results_folder, f"results_basic_Models_{dataset_name}.csv")
result_table.to_csv(filename, index=False)

# Aggregate and save the cross-validation results, if any were recorded in
# scripts.globals during the run.
if shared_state.global_cv_results:
    aggregated_cv_results_df = pd.concat(shared_state.global_cv_results, ignore_index=True)
    aggregated_cv_filename = os.path.join(results_folder, f"cv_scores_basic_models_{dataset_name}.csv")
    aggregated_cv_results_df.to_csv(aggregated_cv_filename, index=False)

### process - DRF + Levelset Models

Calculates the results for the DRF and uses the tuned hyperparameters of the Levelset estimators to create the LSX results.

In [None]:
# Re-execute the config module so edits to it are picked up in a running kernel.
importlib.reload(config)

# The CV score lists live in scripts.globals. Rebinding a notebook-local name
# to a new [] would leave those shared lists untouched and leave the local
# list permanently empty, so the shared lists are reset and read through the
# module instead.
# NOTE(review): presumably the evaluate_* helpers append to these lists -
# confirm against scripts.train_and_evaluate_singleID.
import scripts.globals as shared_state

timeseries = True

# Execution starts here
combinations = [(1, 9)]
table_rows = []
random_state = 1
shared_state.drf_cv_results.clear()
shared_state.global_fold_scores.clear()

# Column layout shared by every result table built in this cell.
result_columns = ['Variable', 'cu', 'co', 'Model', 'Pinball Loss', 'Best Params', 'delta C', 'sl']

# Iterate over combinations and process them directly
for cu, co in combinations:
    print(f"Processing cu, co combination: cu={cu}, co={co}")
    tau = cu / (cu + co)
    with threadpool_limits(limits=1):
        for column in y_train.columns:
            print(f"Processing column: {column}")

            # Preprocess data for this target column
            X_train_scaled, X_test_scaled, y_train_col, y_test_col, X_train_scaled_withID = preprocess_per_instance_singleID(
                column, X_train_features, X_test_features, y_train, y_test
            )
            create_cv_folds_singleID(X_train_scaled_withID)

            # SAA model evaluation; its loss is passed to every model evaluation below
            saa_model = SampleAverageApproximationNewsvendor(cu, co)
            saa_pred = saa_model.fit(y_train_col).predict(X_test_scaled.shape[0])
            saa_pinball_loss = pinball_loss(y_test_col.values.flatten(), saa_pred, tau)
            append_result(table_rows, column, cu, co, 'SAA', saa_pinball_loss, 'N/A', np.nan, tau)

            if timeseries:
                # Initialise the LGBM and MLP base estimators
                lgbm_model = LGBMRegressor(random_state=random_state, n_jobs=1, verbosity=-1)
                mlp_model = MLPRegressorWrapper(random_state=random_state, early_stopping=True)

                lgbm_model_params = get_grid('LevelSetKDEx_groupsplit', X_train_scaled.shape[1])
                lgbm_model_evaluation = [
                    ('LS_KDEx_LGBM', LevelSetKDEx(estimator=lgbm_model, binSize=100, weightsByDistance=False), lgbm_model_params)
                ]
                evaluate_and_append_models_singleID(lgbm_model_evaluation, X_train_scaled, X_test_scaled, y_train_col, y_test_col, saa_pinball_loss, tau, cu, co, column, table_rows, timeseries)

                mlp_model_params = get_grid('LevelSetKDEx_groupsplit', X_train_scaled.shape[1])
                mlp_model_evaluation = [
                    ('LS_KDEx_MLP', LevelSetKDEx(estimator=mlp_model, binSize=100, weightsByDistance=False), mlp_model_params)
                ]
                evaluate_and_append_models_singleID(mlp_model_evaluation, X_train_scaled, X_test_scaled, y_train_col, y_test_col, saa_pinball_loss, tau, cu, co, column, table_rows, timeseries)
            else:
                # Placeholder resulting from previous changes - can be ignored.
                # Separate CV for non time series (Wage Dataset) moved to the create_cv_folds function.
                print("set timeseries varible to True")

            # The DRF model is always evaluated, regardless of `timeseries`
            drf_model = DRFWrapper(min_node_size=10, num_trees=100, num_threads=1)
            drf_grid = get_grid('DRF', X_train_scaled.shape[1])
            evaluate_and_append_models_singleID([('DRF', drf_model, drf_grid)], X_train_scaled, X_test_scaled, y_train_col, y_test_col, saa_pinball_loss, tau, cu, co, column, table_rows, timeseries)

            # Print the tail of the table after evaluating each column
            second_result_table = pd.DataFrame(table_rows, columns=result_columns)
            print(second_result_table.tail(5))

# Folder where results are saved; created if missing
results_folder = "results"
os.makedirs(results_folder, exist_ok=True)

# Final result table after processing all combinations
second_result_table = pd.DataFrame(table_rows, columns=result_columns)

# Save the result table
filename = os.path.join(results_folder, f"results_LevelsetModels_{dataset_name}.csv")
second_result_table.to_csv(filename, index=False)

print(f"Results saved as {filename}")

# Read the lists back from the module so the notebook names refer to whatever
# was recorded during the run.
global_fold_scores = shared_state.global_fold_scores
drf_cv_results = shared_state.drf_cv_results

if global_fold_scores:
    # reset_index() turns each frame's index into regular columns before concatenation
    global_fold_scores_flat = [fold_scores_df.reset_index() for fold_scores_df in global_fold_scores]
    aggregated_fold_scores_df = pd.concat(global_fold_scores_flat, ignore_index=True)

    aggregated_fold_scores_filename = os.path.join(results_folder, f"cv_scores_levelset_models_{dataset_name}.csv")
    aggregated_fold_scores_df.to_csv(aggregated_fold_scores_filename, index=False)

# Guarded separately: pd.concat raises ValueError on an empty list.
if drf_cv_results:
    aggregated_drf_cv_results_df = pd.concat(drf_cv_results, ignore_index=True)

    aggregated_cv_filename = os.path.join(results_folder, f"cv_drf_scores_{dataset_name}.csv")
    aggregated_drf_cv_results_df.to_csv(aggregated_cv_filename, index=False)


## FULL Dataset Cases


In [None]:
import os

# Cap the native thread pools at one thread each. These runtimes read the
# variables when they are first loaded, so they are exported before
# scripts.shared_imports runs.
# NOTE(review): assumes shared_imports is what first imports the numerical
# stack (np / pd are used below without an explicit import) - confirm.
os.environ.update({
    'OMP_NUM_THREADS': '1',         # OpenMP
    'OPENBLAS_NUM_THREADS': '1',    # OpenBLAS
    'MKL_NUM_THREADS': '1',         # Intel MKL (if used)
    'NUMEXPR_NUM_THREADS': '1',     # NumExpr
    'VECLIB_MAXIMUM_THREADS': '1',  # macOS Accelerate
})

from scripts.shared_imports import *

# Show the working directory that relative paths (downloads, result files) resolve against.
print("Current working directory:", os.getcwd())

# Import all project modules used by the full-dataset workflow.
from scripts.get_data import get_dataset_settings_alldata, preprocess_data_alldata
from Wrapper.wrapper import DRFWrapper, MLPRegressorWrapper
from scripts.utils import *
from scripts.train_and_evaluate_alldata import (
    evaluate_and_append_models_alldata,
    create_cv_folds_alldata,
    preprocess_per_instance_alldata,
)
from scripts.process_target import process_target_alldata

import scripts.config as config
from scripts.globals import global_fold_scores, global_cv_results, drf_cv_results

# Re-execute the config module so edits to it are picked up in a running kernel.
importlib.reload(config)



In [None]:
# Re-execute the config module, then read the dataset selection from it.
importlib.reload(config)
dataset_name = config.dataset_name

# Google Drive file ID for each dataset this section can download.
drive_file_ids = {
    'subset_bakery': '1r_bDn9Z3Q_XgeTTkJL7352nUG3jkUM0z',
    'yaz': '1xrY3Uv5F9F9ofgSM7dVoSK4bE0gPMg36',
    'subset_m5': '1tCBaxOgE5HHllvLVeRC18zvALBz6B-6w',
    'subset_air': '1DMOaV92n3BFEGeCubaxEys2eLzg2Cic3',
    'wage': '1bn7E7NOoRzE4NwXXs1MYhRSKZHC13qYU',
}
# A dataset name that is not listed above raises KeyError here.
file_id = drive_file_ids[dataset_name]

url = f"https://drive.google.com/uc?id={file_id}"

# Download the file next to the notebook and load it.
output = f"{dataset_name}.csv"
gdown.download(url, output, quiet=False)
data = pd.read_csv(output)

# Per-dataset column settings, keyed by dataset name.
settings = get_dataset_settings_alldata(data)[dataset_name]

# Forward the three column settings as keyword arguments of the same name.
column_settings = {
    key: settings[key] for key in ('bool_columns', 'drop_columns', 'drop_keywords')
}
# The call also returns `data` and `dataset_name`, which rebinds both names here.
(
    y,
    train_data,
    test_data,
    X_train_features,
    X_test_features,
    y_train,
    y_test,
    data,
    dataset_name,
) = preprocess_data_alldata(data=data, dataset_name=dataset_name, **column_settings)

# Note:
# This is not yet the final data. It is processed further by the
# preprocess_per_instance function before the models are trained.

### DDOP Models

In [None]:
# Re-execute the config module so edits to it are picked up in a running kernel.
importlib.reload(config)

# The CV results live in scripts.globals. Rebinding a notebook-local name to a
# new [] would leave that shared list untouched and leave the local list
# permanently empty (so nothing would ever be saved below). The shared list is
# therefore reset and read through the module.
# NOTE(review): presumably process_target_alldata (or the helpers it calls)
# appends to scripts.globals.global_cv_results - confirm.
import scripts.globals as shared_state

# Execution starts here
combinations = [(9, 1), (7.5, 2.5), (5, 5), (2.5, 7.5), (1, 9)]
table_rows = []
random_state = 1
shared_state.global_cv_results.clear()

# Initialize cvFolds
cvFolds = None

# Column layout shared by every result table built in this cell.
result_columns = ['Variable', 'cu', 'co', 'Model', 'Pinball Loss', 'Best Params', 'delta C', 'sl']

for cu, co in combinations:
    print(f"Processing cu, co combination: cu={cu}, co={co}")
    tau = cu / (cu + co)

    # One process_target_alldata call per target column (n_jobs=1, so the
    # calls run one after another).
    column_results = Parallel(n_jobs=1)(
        delayed(process_target_alldata)(column, cu, co, tau, y_train, X_train_features, X_test_features, y_test, random_state)
        for column in y_train.columns
    )

    # Append each column's rows and print the cumulative table as progress output.
    for result in column_results:
        table_rows.extend(result)
        print(table_rows)
        result_table = pd.DataFrame(table_rows, columns=result_columns)
        print(result_table)

# Final result table after processing all combinations
result_table = pd.DataFrame(table_rows, columns=result_columns)

# Save the result table (written to the current working directory)
filename = f"FULLDATASET_results_basic_Models_{dataset_name}.csv"
result_table.to_csv(filename, index=False)

print(f"Results saved as {filename}")

# Read the list back from the module so the notebook name refers to whatever
# was recorded during the run.
global_cv_results = shared_state.global_cv_results

# Aggregate and save cross-validation results at the end of the entire workflow
if global_cv_results:
    aggregated_cv_results_df = pd.concat(global_cv_results, ignore_index=True)

    aggregated_cv_filename = f"FULLDATASET_cv_scores_basic_models_{dataset_name}.csv"
    aggregated_cv_results_df.to_csv(aggregated_cv_filename, index=False)

### DRF + Levelset Models

In [None]:
timeseries = True
import scripts.config as config
# Reload first, then star-import, so the names bound by the star-import carry
# the reloaded values rather than the pre-reload ones.
importlib.reload(config)
from scripts.config import *

# The CV score lists live in scripts.globals. Rebinding a notebook-local name
# to a new [] does not reset them, so they are cleared in place here and read
# back through the module after the run.
# NOTE(review): presumably the evaluate_* helpers append to these lists -
# confirm against scripts.train_and_evaluate_alldata.
import scripts.globals as shared_state

combinations = [(9, 1), (7.5, 2.5), (5, 5), (2.5, 7.5), (1, 9)]
table_rows = []
random_state = 1
shared_state.drf_cv_results.clear()
shared_state.global_fold_scores.clear()

# Column layout shared by every result table built in this cell.
result_columns = ['Variable', 'cu', 'co', 'Model', 'Pinball Loss', 'Best Params', 'delta C', 'sl']

# Iterate over combinations and process them directly
for cu, co in combinations:
    print(f"Processing cu, co combination: cu={cu}, co={co}")
    tau = cu / (cu + co)
    with threadpool_limits(limits=1):
        for column in y_train.columns:
            print(f"Processing column: {column}")

            X_train_scaled, X_test_scaled, y_train_col, y_test_col, X_train_scaled_withID, X_test_scaled_withID = preprocess_per_instance_alldata(
                column, X_train_features, X_test_features, y_train, y_test
            )

            create_cv_folds_alldata(X_train_scaled_withID)

            # SAA model
            saa_model = SampleAverageApproximationNewsvendor(cu, co)
            saa_pred = saa_model.fit(y_train_col).predict(X_test_scaled.shape[0])

            # One row per test observation; all three inputs are flattened to 1-D first
            saa_predictions_df = pd.DataFrame({
                'id_for_CV': X_test_scaled_withID['id_for_CV'].values.flatten(),
                'y_true': y_test_col.values.flatten(),
                'y_pred': saa_pred.flatten(),
            })

            # SAA pinball loss per id; the dict is passed to every model evaluation below
            saa_pinball_losses_per_id = {}
            for id_val, group in saa_predictions_df.groupby('id_for_CV'):
                pinball_loss_id = pinball_loss(group['y_true'].values, group['y_pred'].values, tau)
                saa_pinball_losses_per_id[id_val] = pinball_loss_id
                append_result(table_rows, id_val, cu, co, 'SAA', pinball_loss_id, 'N/A', np.nan, tau)

            n_features = X_train_scaled.shape[1]

            drf_model = DRFWrapper(min_node_size=10, num_trees=100, num_threads=1)
            drf_grid = get_grid('DRF', n_features)

            print(f"Length of X_train_scaled: {len(X_train_scaled)}")
            print(f"Length of X_test_scaled: {len(X_test_scaled)}")
            evaluate_and_append_models_alldata([('DRF', drf_model, drf_grid)], X_train_scaled, X_test_scaled, y_train_col, y_test_col, saa_pinball_losses_per_id, tau, cu, co, column, table_rows, timeseries, X_test_scaled_withID)

            if timeseries:
                # Initialise the LGBM and MLP base estimators.
                # NOTE(review): n_jobs is not defined in this cell; presumably it
                # comes from one of the star-imports (scripts.config?) - confirm.
                lgbm_model = LGBMRegressor(random_state=random_state, n_jobs=n_jobs, verbosity=-1)
                mlp_model = MLPRegressorWrapper(random_state=random_state, early_stopping=True)

                # Evaluate the LGBM-based model with the group-split grid
                lgbm_model_params = get_grid('LevelSetKDEx_groupsplit', n_features)
                print(lgbm_model_params)

                lgbm_model_evaluation = [
                    ('LS_KDEx_LGBM', LevelSetKDEx(estimator=lgbm_model, binSize=100, weightsByDistance=False), lgbm_model_params)
                ]
                evaluate_and_append_models_alldata(lgbm_model_evaluation, X_train_scaled, X_test_scaled,
                                                   y_train_col, y_test_col, saa_pinball_losses_per_id,
                                                   tau, cu, co, column, table_rows, timeseries, X_test_scaled_withID)

                # Evaluate the MLP-based model with the group-split grid
                mlp_model_params = get_grid('LevelSetKDEx_groupsplit', n_features)
                mlp_model_evaluation = [
                    ('LS_KDEx_MLP', LevelSetKDEx(estimator=mlp_model, binSize=100, weightsByDistance=False), mlp_model_params)
                ]
                evaluate_and_append_models_alldata(mlp_model_evaluation, X_train_scaled, X_test_scaled,
                                                   y_train_col, y_test_col, saa_pinball_losses_per_id,
                                                   tau, cu, co, column, table_rows, timeseries, X_test_scaled_withID)
            else:
                # Placeholder resulting from previous changes - can be ignored.
                # Separate CV for non time series (Wage Dataset) moved to the create_cv_folds function.
                print("set timeseries varible to True")

            # Print the last 5 rows of the table after each column is processed
            second_result_table = pd.DataFrame(table_rows, columns=result_columns)
            print(second_result_table.tail(5))

# Final result table after processing all combinations
second_result_table = pd.DataFrame(table_rows, columns=result_columns)

# Save the result table. The copy in the working directory is kept because the
# original cell wrote it there as well as into results/.
filename = f"FULLDATASET_results_LevelsetModels_{dataset_name}.csv"
second_result_table.to_csv(filename, index=False)

# Create results/ here too, so this cell does not depend on an earlier cell
# having created it.
results_folder = "results"
os.makedirs(results_folder, exist_ok=True)

filename = os.path.join(results_folder, f"FULLDATASET_results_LevelsetModels_{dataset_name}.csv")
second_result_table.to_csv(filename, index=False)

print(f"Results saved as {filename}")

# Read the lists back from the module so the notebook names refer to whatever
# was recorded during the run.
global_fold_scores = shared_state.global_fold_scores
global_cv_results = shared_state.global_cv_results
drf_cv_results = shared_state.drf_cv_results

if global_fold_scores:
    # reset_index() turns each frame's index into regular columns before concatenation
    global_fold_scores_flat = [fold_scores_df.reset_index() for fold_scores_df in global_fold_scores]
    aggregated_fold_scores_df = pd.concat(global_fold_scores_flat, ignore_index=True)

    aggregated_fold_scores_filename = os.path.join(results_folder, f"FULLDATASET_cv_scores_levelset_models_{dataset_name}.csv")
    aggregated_fold_scores_df.to_csv(aggregated_fold_scores_filename, index=False)

# Guarded separately: pd.concat raises ValueError on an empty list.
if drf_cv_results:
    aggregated_drf_cv_results_df = pd.concat(drf_cv_results, ignore_index=True)

    # NOTE(review): unlike the other full-dataset outputs this name has no
    # FULLDATASET_ prefix, so it is the same path the single-ID section writes
    # for datasets whose name is identical in both sections ('yaz', 'wage').
    # Left unchanged in case downstream code reads this exact filename.
    aggregated_cv_filename = os.path.join(results_folder, f"cv_drf_scores_{dataset_name}.csv")
    aggregated_drf_cv_results_df.to_csv(aggregated_cv_filename, index=False)
