In [1]:
print("main")

main


### load Packages

In [2]:
# Standard library imports
import os
import random
import warnings
import logging

# Third-party library imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import tensorflow as tf
import gdown
import rpy2.robjects as ro
from rpy2.rinterface import RRuntimeWarning
from rpy2.rinterface_lib.callbacks import logger as rpy2_logger
from pulp import LpSolverDefault
from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor
# Custom or external package imports
from ddop2.newsvendor import (
    DecisionTreeWeightedNewsvendor, KNeighborsWeightedNewsvendor, 
    SampleAverageApproximationNewsvendor, DeepLearningNewsvendor, 
    RandomForestWeightedNewsvendor, GaussianWeightedNewsvendor, 
    LinearRegressionNewsvendor
)
from drf import drf
from dddex.levelSetKDEx_univariate import LevelSetKDEx
from dddex.loadData import loadDataYaz
from dddex.crossValidation import QuantileCrossValidation, groupedTimeSeriesSplit
from joblib import Parallel, delayed
import pandas as pd
from threadpoolctl import threadpool_limits  # Importiere threadpool_limits


# Set pandas display options
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_colwidth', None)  # Show full column width
pd.set_option('display.max_rows', 10)  # Limit the number of displayed rows
pd.set_option('display.width', 1000)  # Set high enough width to show all columns in a line

# Suppress warnings and logging
warnings.filterwarnings("ignore")  # Suppress all Python warnings
rpy2_logger.setLevel(logging.CRITICAL)  # Only show critical messages from R

# Set R options to suppress warnings and messages
ro.r('while (sink.number() > 0) sink(NULL)')  # Close open sinks to avoid "sink stack full" errors
ro.r('options(warn=-1)')  # Disable all warnings in R
ro.r('suppressMessages(suppressWarnings(library("drf")))')  # Suppress R package messages and warnings

# Set environment variables for R libraries
os.environ['R_LIBS_USER'] = '/usr/lib/R/site-library'
os.environ['R_HOME'] = '/usr/lib/R'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Set random seeds
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)
tf.get_logger().setLevel(logging.ERROR)

# Deactivate CBC Solver output
LpSolverDefault.msg = False  # Deactivates the CBC Solver output

# Verify that the current working directory has changed
print("Current working directory:", os.getcwd())

2024-10-15 20:23:10.453361: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-15 20:23:10.455200: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-15 20:23:10.509953: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-15 20:23:10.511285: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Current working directory: /root/WorkingFolder


In [3]:
from scripts.get_data import get_dataset_settings, preprocess_data

from Wrapper.wrapper import DRFWrapper, MLPRegressorWrapper, LevelSetKDExWrapper

from scripts.cv_and_evaluation import pinball_loss, pinball_loss_scorer, get_grid, preprocess_per_instance, train_and_evaluate_model, calculate_n_iter, bayesian_search_model, preprocess_per_instance, append_result, evaluate_and_append_models, create_cv_folds

from scripts.process_target import process_column

### get data

In [4]:
dataset_name = 'air'  # Hier den Namen des gewünschten Datensatzes b

In [5]:
# Hole die Datei-ID für den gewählten Datensatz
file_id = {
    'bakery': '1r_bDn9Z3Q_XgeTTkJL7352nUG3jkUM0z',
    'yaz': '1xrY3Uv5F9F9ofgSM7dVoSK4bE0gPMg36',
    'm5': '1tCBaxOgE5HHllvLVeRC18zvALBz6B-6w',
    'sid': '1J9bPCfeLDH-mbSnvTHRoCva7pl6cXD3_',
    'air': '1SKPpNxulcusNTjRwCC0p3C_XW7aNBNJZ',
    "copula": '1H5wdJgmxdhbzeS17w0NkRlHRCESEAd-e',
    'wage': '1bn7E7NOoRzE4NwXXs1MYhRSKZHC13qYU',
}[dataset_name]


url = f"https://drive.google.com/uc?id={file_id}"


# Datei herunterladen
output = f"{dataset_name}.csv"
gdown.download(url, output, quiet=False)
data = pd.read_csv(output)

# Erstelle die Dataset-Einstellungen basierend auf den geladenen Daten
settings = get_dataset_settings(data)[dataset_name]

y, train_data, test_data, X_train_features, X_test_features, y_train, y_test = preprocess_data(
    data, settings['backscaling_columns'], settings['bool_columns'], settings['drop_columns'])


display(X_train_features.head(10))
display(y_train.head(3))

print("Anzahl der targets:", len(y_train.columns))




Downloading...
From: https://drive.google.com/uc?id=1SKPpNxulcusNTjRwCC0p3C_XW7aNBNJZ
To: /root/WorkingFolder/air.csv
100%|██████████| 26.0M/26.0M [00:00<00:00, 84.2MB/s]


Unnamed: 0,Longitude,Latitude,Elevation,Weekday,Month,Time,dayIndex,Land.Use_AGRICULTURAL,Land.Use_COMMERCIAL,Land.Use_FOREST,Land.Use_INDUSTRIAL,Location.Setting_SUBURBAN,Location.Setting_URBAN AND CENTER CITY,demand,id,label
0,-106.585200,35.134300,1591.0,1,1,4,1,0,0,0,0,0,1,0.9,Location_1_max_CO,train
1,-104.778334,41.182227,1842.0,1,1,4,2,0,0,0,0,1,0,0.1,Location_1_max_CO,train
2,-112.095767,33.503833,343.0,1,1,11,3,0,0,0,0,0,1,0.9,Location_1_max_CO,train
3,-106.585200,35.134300,1591.0,1,1,11,4,0,0,0,0,0,1,0.6,Location_1_max_CO,train
4,-111.872222,40.736389,1304.0,1,1,11,5,0,0,0,0,1,0,1.1,Location_1_max_CO,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201402,-93.207615,45.137680,277.0,1,1,11,6,0,1,0,0,1,0,2.6,Location_6_max_SO2,train
201403,-90.198348,38.656429,155.0,1,1,11,7,0,0,0,0,0,1,4.5,Location_6_max_SO2,train
201404,-96.855350,46.933754,271.0,1,1,11,8,1,0,0,0,1,0,0.7,Location_6_max_SO2,train
201405,-96.700769,43.547920,449.0,1,1,11,9,0,1,0,0,0,1,0.9,Location_6_max_SO2,train


id,Location_1_max_CO,Location_1_max_NO2,Location_1_max_O3,Location_1_max_PM10,Location_1_max_PM2.5,Location_1_max_SO2,Location_2_max_CO,Location_2_max_NO2,Location_2_max_O3,Location_2_max_PM10,Location_2_max_PM2.5,Location_2_max_SO2,Location_3_max_CO,Location_3_max_NO2,Location_3_max_O3,Location_3_max_PM10,Location_3_max_PM2.5,Location_3_max_SO2,Location_4_max_CO,Location_4_max_NO2,Location_4_max_O3,Location_4_max_PM10,Location_4_max_PM2.5,Location_4_max_SO2,Location_5_max_CO,Location_5_max_NO2,Location_5_max_O3,Location_5_max_PM10,Location_5_max_PM2.5,Location_5_max_SO2,Location_6_max_CO,Location_6_max_NO2,Location_6_max_O3,Location_6_max_PM10,Location_6_max_PM2.5,Location_6_max_SO2
dayIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
1,0.9,41.6,0.022,39.0,24.3,4.1,0.1,10.0,0.033,9.0,4.85,5.0,0.4,24.4,0.025,16.0,9.6,4.2,1.5,50.6,0.008,47.0,17.6,4.3,0.3,28.6,0.019,10.0,9.55,0.2,0.4,35.0,0.017,17.0,8.6,2.5
2,0.1,31.9,0.039,8.0,6.5,0.4,0.2,6.0,0.031,10.0,5.7,2.0,0.3,24.6,0.029,9.0,4.9,2.6,1.4,31.6,0.025,39.0,31.933333,0.9,0.4,23.1,0.008,13.0,13.6,0.1,0.2,14.2,0.024,17.0,4.1,1.3
3,0.9,36.0,0.023,18.0,11.1,2.6,0.4,11.0,0.036,7.0,8.7,2.0,0.3,19.6,0.012,19.0,12.35,6.8,0.5,22.0,0.024,21.0,7.4,1.0,0.2,6.1,0.028,11.0,7.8,2.1,0.3,17.4,0.011,17.0,3.7,1.0


Anzahl der targets: 36


### settings

In [6]:
os.environ["OMP_NUM_THREADS"] = "2"
os.environ["MKL_NUM_THREADS"] = "20"
os.environ["OPENBLAS_NUM_THREADS"] = "3"

### process

In [7]:

# Execution starts here
combinations = [(9, 1), (7.5, 2.5), (5, 5), (2.5, 7.5), (1, 9)]
table_rows = []
random_state = 42

# Initialize cvFolds
cvFolds = None  # Initialization


import scripts.globals as globals  # Import the globals module

for cu, co in combinations:
    print(f"Processing cu, co combination: cu={cu}, co={co}")
    tau = cu / (cu + co)

    # Parallelize column processing within each combination with n_jobs=4 to limit threads
    column_results = Parallel(n_jobs=1)(  
        delayed(process_column)(column, cu, co, tau, y_train, X_train_features, X_test_features, y_test, random_state)
        for column in y_train.columns
    )

    # Combine results from all columns and print after each column
    for result in column_results:
        table_rows.extend(result)
        print(table_rows)
        # Convert the latest result to a DataFrame and print it
        result_table = pd.DataFrame(table_rows, columns=['Variable', 'cu', 'co', 'Model', 'Pinball Loss', 'Best Params', 'delta C', 'sl'])
        print(result_table)  # Print the updated results after each column is processed

# Final result table after processing all combinations
result_table = pd.DataFrame(table_rows, columns=['Variable', 'cu', 'co', 'Model', 'Pinball Loss', 'Best Params', 'delta C', 'sl'])

# Construct the filename using the format "results_basicModels_{dataset_name}.csv"
filename = f"results_basic_Models_{dataset_name}.csv"

# Save the result table to a CSV file
result_table.to_csv(filename, index=False)

print(f"Results saved as {filename}")

# Aggregate and save cross-validation results at the end of the entire workflow
if globals.global_cv_results:
    # Concatenate all cross-validation results into a single DataFrame
    aggregated_cv_results_df = pd.concat(globals.global_cv_results, ignore_index=True)

    # Print a summary of the aggregated cross-validation data to verify it looks correct
    print("Aggregated cross-validation results sample:")
    print(aggregated_cv_results_df.head(5))  # Print the first 5 rows as a sample

    # Save the aggregated results to a CSV file
    aggregated_cv_filename = f"cv_scores_basic_models_{dataset_name}.csv"
    aggregated_cv_results_df.to_csv(aggregated_cv_filename, index=False)
    print(f"Aggregated cross-validation results saved as {aggregated_cv_filename}")


Processing cu, co combination: cu=9, co=1
Test length for column: 186 (20% of 931.0)
Running model MLP for column Location_1_max_CO, cu=9, co=1
Evaluating model: MLP, cu: 9, co: 1
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Cross-validation results for MLP with cu=9, co=1:
Running model LGBM for column Location_1_max_CO, cu=9, co=1
Evaluating model: LGBM, cu: 9, co: 1
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fi

KeyboardInterrupt: 


R[write to console]: 1: 
R[write to console]: In (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  :
R[write to console]: 
 
R[write to console]:  library ‘/usr/lib/R/site-library’ contains no packages

R[write to console]: 2: 
R[write to console]: In (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  :
R[write to console]: 
 
R[write to console]:  library ‘/usr/lib/R/site-library’ contains no packages


R[write to console]: 1: 
R[write to console]: In (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  :
R[write to console]: 
 
R[write to console]:  library ‘/usr/lib/R/site-library’ contains no packages

R[write to console]: 2: 
R[write to console]: In (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  :
R[write to console]: 
 
R[write to console]:  library ‘/usr/lib/R/site-library’ contains no packages


R[write to console]: 1: 
R[write to console]: In (function (package, help, pos = 