Parameters for the run function is follows:

- `train`: A pandas DataFrame containing the training dataset.
- `test`: A pandas DataFrame containing the test dataset.
- `target_label`: Target label to predict.
- `features`: A list of features to train on.
- `depth`: Maximum depth of the decision tree (default is 2).
- `criterion`: Splitting criterion for the decision tree can be "misclassification" or "gini"(default is "gini").
- `time_limit`: Time limit for training in seconds (default is 1800).
- `big_m`: Value of big M used in the optimization model (default is 99).

In [29]:
import pandas as pd
import time
import os

from helpers.helpers import preprocess_numerical, move_targets_to_front_and_rename, make_data_binary


In [None]:
target_label = "y"
depth_rolling_tree = 8
criterion_loss = "gini"
#criterion_loss = "misclassification"


# add file savings according to loss criteria (irrelevant if we use hybrid-RST)


In [31]:
#organizing results in folders and txt/csv files

dataset_name = 'adult' # folder in 'results' will be named after it, results/dataset_name contains result .txt and .csv

dir_path = f'results/{dataset_name}'
file_path = f'{dir_path}/output_depth_'

# Create the directory if it doesn't exist
os.makedirs(dir_path, exist_ok=True)

In [32]:
if dataset_name == 'test': 

    # Load your training and test datasets
    train_data = pd.read_csv("datasets/example_datasets/train.csv")
    test_data = pd.read_csv("datasets/example_datasets/test.csv")

    with open(f'{file_path}all_{dataset_name}.txt', 'w') as f:
        f.write("Test simple dataset \n")

In [33]:
if dataset_name == 'adult':

    # Load your training and test datasets
    train_data = pd.read_csv("datasets/adult/adult.data", sep=',', skipinitialspace=True, header=None) #32561 rows
    test_data = pd.read_csv("datasets/adult/adult.test", sep=',', skipinitialspace=True, header=None) #16281 rows

 
    with open(f'{file_path}all_{dataset_name}.txt', 'w') as f:
        f.write("Test adult dataset \n")

    # Remove dots from the 'target' column
    test_data[14] = test_data[14].astype(str).str.replace('.', '', regex=False)

    len_train_data = len(train_data)

    stacked = pd.concat([train_data, test_data ], ignore_index=False)
    #print(stacked)


    stacked = preprocess_numerical(stacked)
    stacked = move_targets_to_front_and_rename(data= stacked, target_label=14)

    unique_values = stacked ['y'].unique()
    print(unique_values) # hier sieht man dann, dass es einen zusätzlichen punkt bei den target labels in adult.test gibt; also ohne löschen des punktes



    stacked = make_data_binary(stacked)
    print(stacked)




    train_data_bin = stacked.iloc[:len_train_data] #[32561 rows x 125 columns]
    test_data_bin = stacked.iloc[len_train_data:] #[16281 rows x 125 columns]

    train_data=train_data_bin
    test_data=test_data_bin

['<=50K' '>50K']
       y  1  2  3  4  5  6  7  8  9  ...  115  116  117  118  119  120  121  \
0      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
1      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
2      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
3      1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
4      1  0  1  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
...   .. .. .. .. .. .. .. .. .. ..  ...  ...  ...  ...  ...  ...  ...  ...   
16276  1  0  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
16277  1  1  0  0  1  0  1  0  0  0  ...    0    0    0    0    0    0    0   
16278  1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
16279  1  1  0  1  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   
16280  2  1  1  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0   

       122  123  124  
0        1 

In [34]:
# get features
feature_columns = train_data.columns[1:] #assuming labels are in first column, ensured trough move_targets_to_front_and_rename()
#print(feature_columns)

# Solving with pulp

In [35]:
#%load_ext snakeviz

# solving with pulp

from rolling_lookahead_dt_pulp import rollo_oct_pulp

start_time_pulp = time.time()



# Run the classifier using pulp
result_dict_pulp =rollo_oct_pulp.run(train=train_data, test=test_data, target_label="y", features=feature_columns, depth=depth_rolling_tree, criterion=criterion_loss)

# %snakeviz -t rollo_oct_pulp.run(train=train_data, test=test_data, target_label="y", features=feature_columns, depth=depth_rolling_tree, criterion=criterion_loss)
end_time_pulp = time.time()
#print(f"Pulp execution time for depth {depth_rolling_tree} : {end_time_pulp - start_time_pulp} seconds")

{'leaf_nodes': [4, 5, 6, 7], 'leaf_nodes_path': {4: [1, 1], 5: [1, 0], 6: [0, 1], 7: [0, 0]}}
Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /home/drood/Obsidian/Files/Bachelorarbeit/rlrf_my_try/.venv/lib/python3.12/site-packages/pulp/apis/../solverdir/cbc/linux/i64/cbc /tmp/6e883b1dc0ad4e31ae5dfc9e792493ff-pulp.mps -sec 1800 -timeMode elapsed -branch -printingOptions all -solution /tmp/6e883b1dc0ad4e31ae5dfc9e792493ff-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 131 COLUMNS
At line 153650 RHS
At line 153777 BOUNDS
At line 184530 ENDATA
Problem MODEL has 126 rows, 30752 columns and 61504 elements
Coin0008I MODEL read with 0 errors
seconds was changed from 1e+100 to 1800
Option for timeMode changed from cpu to elapsed
Continuous objective value is 0.259777 - 0.01 seconds
Cgl0004I processed model has 0 rows, 0 columns (0 integer (0 of which binary)) and 0 elements
Cbc3007W No integer variables - nothing to d

In [36]:
for depth in range(2,depth_rolling_tree+1):
    print(result_dict_pulp[depth])

print(f"Pulp execution time for depth {depth_rolling_tree} : {end_time_pulp - start_time_pulp} seconds\n")

for depth in range(2,depth_rolling_tree+1):
    with open(f'{file_path}all_{dataset_name}.txt', 'a') as f:
        f.write(str(depth) + ': ' + str(result_dict_pulp[depth]) + "\n")

with open(f'{file_path}all_{dataset_name}.txt', 'a') as f:
        f.write(f"Pulp execution time for depth {depth_rolling_tree} : {end_time_pulp - start_time_pulp} seconds\n")



{'training_accuracy': 0.8193544424311293, 'test_accuracy': 0.8219396842945765, 'time': 171.91448783874512}
{'training_accuracy': 0.8215349651423483, 'test_accuracy': 0.8247650635710337, 'time': 174.23762464523315}
{'training_accuracy': 0.824575412303062, 'test_accuracy': 0.8259934893434064, 'time': 168.7406747341156}
{'training_accuracy': 0.8282300912134148, 'test_accuracy': 0.8302315582580923, 'time': 158.07800889015198}
{'training_accuracy': 0.8334510610853475, 'test_accuracy': 0.8313371414532277, 'time': 146.61753630638123}
{'training_accuracy': 0.8378735296827493, 'test_accuracy': 0.8325041459369817, 'time': 152.65479850769043}
{'training_accuracy': 0.843770154479285, 'test_accuracy': 0.8311528775873718, 'time': 160.2932095527649}
Pulp execution time for depth 8 : 1132.5926575660706 seconds



In [37]:
for depth in range(2,depth_rolling_tree+1):
    with open(f'{file_path}{depth}_classification_{dataset_name}_test_pulp.csv', 'w') as f:
        f.write(str(result_dict_pulp['tree'][depth]['test'].to_csv()))
    with open(f'{file_path}{depth}_classification_{dataset_name}_train_pulp.csv', 'w') as f:
        f.write(str(result_dict_pulp['tree'][depth]['train'].to_csv()))


# Solving with gurobi

In [38]:
"""
# solving with gurobi
from rolling_lookahead_dt_gurobi import rollo_oct_gurobi

start_time_gurobi = time.time()

# Run the classifier using pulp
result_dict_gurobi, result_df_test_data_gurobi, result_df_training_data_gurobi = rollo_oct_gurobi.run(
                                                                train=train_data,
                                                                test=test_data,
                                                                target_label="y",
                                                                features=feature_columns,
                                                                depth=depth_rolling_tree,
                                                                criterion=criterion_loss
)
end_time_gurobi = time.time()


SyntaxError: incomplete input (77758373.py, line 1)

In [None]:
with open(f'{file_path}_{dataset_name}.txt', 'a') as f:
    f.write(f"Gurobi execution time for depth {depth_rolling_tree} : {end_time_gurobi - start_time_gurobi} seconds\n")

print(f"Gurobi execution time for depth {depth_rolling_tree} : {end_time_gurobi - start_time_gurobi} seconds\n")

Gurobi execution time for depth 4 : 9.866874694824219 seconds



In [None]:
with open(f'{file_path}_{dataset_name}.txt', 'a') as f:
    f.write(str(result_dict_gurobi) + "\n")
print(result_dict_gurobi)

{3: {'training_accuracy': 0.69375, 'test_accuracy': 0.7777777777777778, 'time': 2.646049976348877}, 4: {'training_accuracy': 0.75, 'test_accuracy': 0.8333333333333334, 'time': 4.534008264541626}, 2: {'training_accuracy': 0.5875, 'test_accuracy': 0.6111111111111112, 'time': 9.863209247589111}}


In [None]:
with open(f'{file_path}_classification_{dataset_name}_test_gurobi.csv', 'w') as f:
    f.write(str(result_df_test_data_gurobi.to_csv()))

In [None]:
with open(f'{file_path}_classification_{dataset_name}_train_gurobi.csv', 'w') as f:
    f.write(str(result_df_training_data_gurobi.to_csv()))

# Vergleich Ergebnisse

In [None]:
#print(result_dict_gurobi)

In [None]:
#print(result_df_test_data_pulp)

    y  prediction  leaf
0   1           1    13
1   1           1    13
2   1           2    31
3   1           2    31
4   1           2    31
5   1           1    13
6   2           2    12
7   2           2    31
8   2           2    31
9   2           2    28
10  2           2    31
11  2           2    31
12  2           2    31
13  3           3    30
14  3           3     5
15  3           3    29
16  3           3     5
17  3           3     5


In [None]:
#print(result_df_test_data_gurobi)

    y  prediction  leaf
0   1           1    13
1   1           1    13
2   1           2    31
3   1           2    31
4   1           2    31
5   1           1    13
6   2           2    12
7   2           2    31
8   2           2    31
9   2           2    28
10  2           2    31
11  2           2    31
12  2           2    31
13  3           3    30
14  3           3     5
15  3           3    29
16  3           3     5
17  3           3     5


In [None]:
#print(result_df_training_data_pulp)

     y  prediction  leaf
0    1           2    31
1    1           2    31
2    1           2    31
3    1           1    13
4    1           1    13
..  ..         ...   ...
155  3           3     5
156  3           3     5
157  3           3    30
158  3           3    30
159  3           3     5

[160 rows x 3 columns]


In [None]:
#print(result_df_training_data_gurobi)

     y  prediction  leaf
0    1           2    31
1    1           2    31
2    1           2    31
3    1           1    13
4    1           1    13
..  ..         ...   ...
155  3           3     5
156  3           3     5
157  3           3    30
158  3           3    30
159  3           3     5

[160 rows x 3 columns]
