In [1]:
import pandas as pd
import os
from pathlib import Path
import glob
import sys
import os
sys.path.append(os.path.abspath("src"))
sys.path.append(os.path.abspath("src/torch"))
from src.data_prep import *

In [8]:
def clean_up_csv(csv_fp):
    my_df = pd.read_csv(csv_fp, on_bad_lines='skip')
    my_df = my_df.loc[:, ~my_df.columns.str.startswith('Unnamed')]
    my_df = my_df[my_df["cpu"] > -1.1]#removes rows where the solver completely failed
    my_df = my_df[~my_df["bin_lb"].isna()]

    my_df.drop_duplicates(subset="precedence_relation", keep="first",inplace=True)
    my_df.to_csv(csv_fp, index=False)
    
    
def process_results_bad_lb(path, forgive_missing=False, forgive_lb_missmatch=False):
    '''Processes results for when the bin pack lower bound bottom row is broken'''

    path = Path(path).expanduser()  # Expands '~'
    all_files = list(path.glob("*.csv"))  # Finds all .csv files
    li = []
    bad_instances = []
    for filename in all_files:
        print('processing', filename)
        clean_up_csv(filename)
        df = pd.read_csv(filename, index_col=None, header=0)
        if df.empty:
            print(f"Error: {filename} was empty")
            bad_instances.append({"instance":filename, "reason":"empty df"})
            continue
#         bin_lb = df[df["nodes"] == "SALBP_original"].copy()
#         bin_lb = bin_lb[bin_lb["bin_lb"].isna()==False].copy()
#         print(bin_lb)
#         salb_upper = df[df["nodes"] == "SALBP_original"].copy()
#         if salb_upper.empty:
#             bad_instances.append({"instance":filename, "reason":"no_ub"})
#             print(f"Error: {filename} is missing SALBP upper bound")
#             continue
#         if bin_lb.empty:
#             bad_instances.append({"instance":filename, "reason":"no_lb"})
#             print(f"Error: {filename} is missing a lower bound")
#             continue

#         df = df[df["precedence_relation"].isna() == False]
#         bin_lb.rename(columns = {"nodes":"bin_lb"}, inplace=True)
#         bin_lb = bin_lb[["instance", "bin_lb"]]

#         df = pd.merge(df, bin_lb, on="instance")
        if any(df['bin_lb'].isna() == True):
            print(f"Error: {filename} wasn't able to calculate a lower bound")
            bad_instances.append({"instance":filename, "reason":"no_bin_lb"})
            continue
        if any(df["no_stations"] < df["bin_lb"].astype(int)):
            print(f"Error: {filename} has a lower bound mismatch")
            bad_instances.append({"instance":filename, "reason":"lower bound mismatch"})
            if forgive_lb_missmatch == True:
                li.append(df)
            continue
        if len(df.index) != df["original_n_precedence_constraints"].iloc[0]+1:
            print("Error: ", filename, " is missing rows, please check")
            bad_instances.append({"instance":filename, "reason":"missing_edge"})
            if forgive_missing ==True:
                li.append(df)
        else:
            li.append(df)
            
    if len(li)>0:
        frame = pd.concat(li, axis=0, ignore_index=True)
    else:
        frame= None
    return frame, bad_instances
my_df, bad_bot_100 = process_results_bad_lb(r"data/results/bottleneck_100/instance_res", forgive_missing=True, forgive_lb_missmatch=False)
my_df_2, bad_chain_100 = process_results_bad_lb(r"data/results/chains_100/instance_res", forgive_missing=True, forgive_lb_missmatch=False)
my_df_3 , bad_unst_100 = process_results_bad_lb(r"data/results/unstructured_100/instance_res", forgive_missing=True, forgive_lb_missmatch=False)

processing data/results/bottleneck_100/instance_res/n_100_93.csv
processing data/results/bottleneck_100/instance_res/n_100_571.csv
Error:  data/results/bottleneck_100/instance_res/n_100_571.csv  is missing rows, please check
processing data/results/bottleneck_100/instance_res/n_100_6082.csv
processing data/results/bottleneck_100/instance_res/n_100_6725.csv
processing data/results/bottleneck_100/instance_res/n_100_5197.csv
processing data/results/bottleneck_100/instance_res/n_100_5146.csv
processing data/results/bottleneck_100/instance_res/n_100_4706.csv
processing data/results/bottleneck_100/instance_res/n_100_1058.csv
processing data/results/bottleneck_100/instance_res/n_100_6053.csv
processing data/results/bottleneck_100/instance_res/n_100_1439.csv
processing data/results/bottleneck_100/instance_res/n_100_4367.csv
processing data/results/bottleneck_100/instance_res/n_100_5955.csv
processing data/results/bottleneck_100/instance_res/n_100_5527.csv
processing data/results/bottleneck_100

In [None]:
all_bad_100 = pd.DataFrame( bad_instances + chain_bad + un_bad)
#all_bad_100.to_csv("data/results/missing_edge_100.csv", index=False)

In [None]:
all_bad_100

In [3]:
import os
def purge_bad_instances(bad_instances, reason="all"):
    for bad in bad_instances:
        file_path = bad['instance']
        if reason == "all":
            if os.path.exists(file_path):
                os.remove(file_path)
                print(f"{file_path} deleted.")
            else:
                print("File does not exist.")
        elif bad['reason'] == reason:
            os.remove(file_path)
            print(f"{file_path} deleted.")
        elif bad['reason'] == 'empty df':
            if os.path.exists(file_path):
                os.remove(file_path)
                print(f"{file_path} deleted.")
            
            
purge_bad_instances(bad_bot_100, reason="no_bin_lb")
purge_bad_instances(bad_chain_100, reason="no_bin_lb")
purge_bad_instances(bad_unst_100, reason="no_bin_lb")

In [4]:
my_df.to_csv("data/results/bottleneck_100/bottleneck_100.csv", index=False)
my_df_2.to_csv("data/results/chains_100/chains_100.csv", index=False)
my_df_3.to_csv("data/results/unstructured_100/unstructured_100.csv", index=False)

In [9]:
my_df, bad_large = process_results_bad_lb(r"data/results/Otto/large/instance_res/", forgive_missing=True, forgive_lb_missmatch=False)

processing data/results/Otto/large/instance_res/instance_n=100_264.csv
processing data/results/Otto/large/instance_res/instance_n=100_512.csv
processing data/results/Otto/large/instance_res/instance_n=100_173.csv
processing data/results/Otto/large/instance_res/instance_n=100_218.csv
processing data/results/Otto/large/instance_res/instance_n=100_17.csv
processing data/results/Otto/large/instance_res/instance_n=100_436.csv
processing data/results/Otto/large/instance_res/instance_n=100_391.csv
processing data/results/Otto/large/instance_res/instance_n=100_340.csv
processing data/results/Otto/large/instance_res/instance_n=100_108.csv
processing data/results/Otto/large/instance_res/instance_n=100_10.csv
processing data/results/Otto/large/instance_res/instance_n=100_396.csv
processing data/results/Otto/large/instance_res/instance_n=100_431.csv
processing data/results/Otto/large/instance_res/instance_n=100_347.csv
processing data/results/Otto/large/instance_res/instance_n=100_263.csv
processi

In [6]:
badlarge_df = pd.DataFrame(bad_large)
badlarge_df.to_csv('data/results/Otto/large/bad_large.csv', index=False)

In [7]:
badlarge_df

In [8]:
badlarge_df['instance'].iloc[0]

PosixPath('data/results/Otto/large/instance_res/instance_n=100_140.csv')

In [7]:
my_df['instance'].nunique()

525

In [10]:
my_df.to_csv("data/results/Otto/large/large_collated.csv", index=False)

In [6]:
no = 50

my_df, bad_bot_90 = process_results_bad_lb(f"data/results/bottleneck_{no}/instance_res", forgive_missing=True, forgive_lb_missmatch=False)
my_df_2, bad_chain_90 = process_results_bad_lb(f"data/results/chains_{no}/instance_res", forgive_missing=True, forgive_lb_missmatch=False)
my_df_3 , bad_unst_90 = process_results_bad_lb(f"data/results/unstructured_{no}/instance_res", forgive_missing=True, forgive_lb_missmatch=False)
purge_bad_instances(bad_bot_90, reason="no_bin_lb")
purge_bad_instances(bad_chain_90, reason="no_bin_lb")
purge_bad_instances(bad_unst_90, reason="no_bin_lb")
my_df.to_csv(f"data/results/bottleneck_{no}/bottleneck_{no}.csv", index=False)
my_df_2.to_csv(f"data/results/chains_{no}/chains_{no}.csv", index=False)
my_df_3.to_csv(f"data/results/unstructured_{no}/unstructured_{no}.csv", index=False)


processing data/results/chains_50/instance_res/n_50_3560.csv
processing data/results/chains_50/instance_res/n_50_1592.csv
processing data/results/chains_50/instance_res/n_50_4919.csv
processing data/results/chains_50/instance_res/n_50_1235.csv
processing data/results/chains_50/instance_res/n_50_3025.csv
processing data/results/chains_50/instance_res/n_50_3782.csv
processing data/results/chains_50/instance_res/n_50_2665.csv
processing data/results/chains_50/instance_res/n_50_1770.csv
processing data/results/chains_50/instance_res/n_50_4789.csv
processing data/results/chains_50/instance_res/n_50_2204.csv
processing data/results/chains_50/instance_res/n_50_3444.csv
processing data/results/chains_50/instance_res/n_50_3836.csv
processing data/results/chains_50/instance_res/n_50_1311.csv
processing data/results/chains_50/instance_res/n_50_1815.csv
processing data/results/chains_50/instance_res/n_50_1467.csv
processing data/results/chains_50/instance_res/n_50_4339.csv
processing data/results/

In [11]:
bad_bot_100_df = pd.DataFrame(bad_bot_100)
bad_bot_100_df.to_csv("data/results/bottleneck_100/bad_bot_100.csv", index=False)
bad_chain_100_df = pd.DataFrame(bad_chain_100)
bad_chain_100_df.to_csv("data/results/chains_100/bad_chain_100.csv", index=False)
bad_unst_100_df = pd.DataFrame(bad_unst_100)
bad_unst_100_df.to_csv("data/results/unstructured_100/bad_unst_100.csv", index=False)


In [12]:
bad_bot_90_df = pd.DataFrame(bad_bot_90)
bad_bot_90_df.to_csv("data/results/bottleneck_90/bad_bot_90.csv", index=False)
bad_chain_90_df = pd.DataFrame(bad_chain_90)
bad_chain_90_df.to_csv("data/results/chains_90/bad_chain_90.csv", index=False)
bad_unst_90_df = pd.DataFrame(bad_unst_90)
bad_unst_90_df.to_csv("data/results/unstructured_90/bad_unst_90.csv", index=False)

In [15]:
bad_bot_90_df

Unnamed: 0,instance,reason
0,data/results/bottleneck_90/n_90_6313.csv,missing_edge
1,data/results/bottleneck_90/n_90_2823.csv,missing_edge
2,data/results/bottleneck_90/n_90_6328.csv,missing_edge
3,data/results/bottleneck_90/n_90_144.csv,missing_edge
4,data/results/bottleneck_90/n_90_3978.csv,missing_edge
5,data/results/bottleneck_90/n_90_6327.csv,missing_edge
6,data/results/bottleneck_90/n_90_4013.csv,missing_edge
7,data/results/bottleneck_90/n_90_2906.csv,missing_edge
8,data/results/bottleneck_90/n_90_1939.csv,missing_edge
9,data/results/bottleneck_90/n_90_6177.csv,missing_edge


In [None]:
all_bad_90 = pd.DataFrame( bad_instances + chain_bad + un_bad)
all_bad_90.to_csv("DADA/DADA/data/results/missing_edge_90.csv", index=False)


In [None]:
# import os
# import shutil
# import re

# # Folder where the files are currently located
# source_folder = 'data/results/Otto/'  # change to your directory if needed
# target_folder = 'data/results/Otto/large'



# # Loop through files in the source folder
# for filename in os.listdir(source_folder):
#     # Match pattern like 'largeinstance_n=100_22.csv'
#     if filename.startswith('largeinstance_n=') and filename.endswith('.csv'):
#         # Remove the 'large' prefix
#         new_filename = filename.replace('large', '', 1)
#         src_path = os.path.join(source_folder, filename)
#         dst_path = os.path.join(target_folder, new_filename)
        
#         # Move and rename
#         shutil.move(src_path, dst_path)
#         print(f"Moved: {filename} → {target_folder}/{new_filename}")


In [None]:
interesting_instances = my_df[my_df["no_stations"].astype(int) != my_df["bin_lb"].astype(int)]
interesting_instances

In [13]:
bot_100 = pd.read_csv("data/results/bottleneck_100.csv")
chains_100 = pd.read_csv("data/results/chains_100.csv")
unst_100 = pd.read_csv("data/results/unstructured_100.csv")
print(bot_100['instance'].nunique(), chains_100['instance'].nunique(), unst_100['instance'].nunique())
print('total: ', bot_100['instance'].nunique()+ chains_100['instance'].nunique()+ unst_100['instance'].nunique())

77 201 54
total:  332


In [None]:
bot_100

In [14]:
bot_90 = pd.read_csv("data/results/bottleneck_90.csv")
chains_90 = pd.read_csv("data/results/chains_90.csv")
unst_90 = pd.read_csv("data/results/unstructured_90.csv")
print(bot_90['instance'].nunique(), chains_90['instance'].nunique(), unst_90['instance'].nunique())
print('total: ', bot_90['instance'].nunique()+ chains_90['instance'].nunique()+ unst_90['instance'].nunique())

254 270 263
total:  787


In [None]:
# from multiprocessing.dummy import Pool as ThreadPool
# import multiprocessing
# import tempfile

# def save_backup(backup_name, result):
#     intermediate = pd.DataFrame([result])
#     my_file = Path(backup_name)
#     if my_file.is_file():
#         intermediate.to_csv(backup_name, mode='a', header=False)
#     else:
#         intermediate.to_csv(backup_name)
    


# def fix_partial_result(alb_dict, ex_fp, out_fp, branch=1):
#     SALBP_dict_orig = alb_dict

#     instance_fp = SALBP_dict_orig['name']
#     results = []
#     # Extract instance name from file path
#     instance_name = str(instance_fp).split("/")[-1].split(".alb")[0]

#     if not os.path.exists(out_fp):
#         os.makedirs(out_fp)
#         orig_data = pd.DataFrame()    
#     else:
#         orig_data = pd.read_csv(out_fp)
#     print("running: ", instance_name, " saving to output ", out_fp)
#     # Use a unique temporary ALB file per process
#     with tempfile.NamedTemporaryFile(suffix=".alb", delete=True) as temp_alb:
#         temp_alb_path = temp_alb.name  # Path to temporary file
#         orig_prec = len(SALBP_dict_orig["precedence_relations"])
#         #original problem
#         SALBP_dict = deepcopy(SALBP_dict_orig)
#         if (orig_data.empty or orig_data[orig_data["nodes"] == "SALBP_original"].empty):
#             write_to_alb(SALBP_dict, temp_alb_path)
#             output = subprocess.run([ex_fp, "-m", f"{branch}", "-b", "1", temp_alb_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#             # print("Return code:", output.returncode)
#             # print("STDOUT:", output.stdout.decode())

#             salbp_sol, optimal, cpu, bin_lb = parse_bb_salb1_out(output)
#             if not bin_lb:
#                 print("STDERR:", output.stderr.decode() if output.stderr else "No stderr captured")
#                 print("ERROR, no bin_lb", output)
#             orig_prob = {
#                 "instance": instance_name,
#                 "precedence_relation": "None",
#                 "nodes": "SALBP_original",
#                 "no_stations": salbp_sol,
#                 "original_n_precedence_constraints": orig_prec,
#                 "optimal": optimal,
#                 "cpu": cpu,
#                 "bin_lb": bin_lb
#             }
#             results.append(orig_prob)
#             save_backup(out_fp, orig_prob)
#             #Tracking if instance autocompleted because bp=salbp and setting defaults
#             cpu = -1 
#             no_stations = salbp_sol

#         #proceeds to precedence constraint removal, if bin_lb != no stations
#         for j, relation in enumerate(SALBP_dict_orig["precedence_relations"]):
#             if not orig_data.empty and not orig_data[orig_data["precedence_relation"] ==j].empty:
#                 print("Skipping relation that already exists: ", relation)
#                 continue
#             print("removing edge: ", relation)
#             SALBP_dict = deepcopy(SALBP_dict_orig)
#             SALBP_dict = precedence_removal(SALBP_dict, j)
#             if bin_lb != salbp_sol: #If bin_lb==salbp_sol, then we don't need to do any precedence removal
#                 write_to_alb(SALBP_dict, temp_alb_path)
#                 output = subprocess.run([ex_fp, "-m", f"{branch}", temp_alb_path], stdout=subprocess.PIPE)
#                 # print("Return code:", output.returncode)
#                 # print("STDOUT:", output.stdout.decode())
#                 print("STDERR:", output.stderr.decode() if output.stderr else "No stderr captured")
#                 no_stations, optimal, cpu, _ = parse_bb_salb1_out(output)
#             result = {
#                 "instance": instance_name,
#                 "precedence_relation": j,
#                 "nodes": relation,
#                 "no_stations": no_stations,
#                 "original_n_precedence_constraints": orig_prec,
#                 "optimal": optimal,
#                 "cpu": cpu,
#                 "bin_lb": bin_lb
#             }
#             save_backup(out_fp, result)
#             results.append(result)

#     return results



In [None]:
# bad_bot_100_df = pd.DataFrame(bad_bot_100)
# bad_bot_100_df.to_csv("data/results/bad_bot_100.csv", index=False)


In [None]:
# def retry_bad_instances(bad_data_fp, data_pickle, ex_fp):
    
#     bad_data = pd.read_csv(bad_data_fp)
#     alb_files = open_salbp_pickle(data_pickle)
#     for instance_fp in bad_data['instance']:
#         instance_name = instance_fp.split('/')[-1].split('.')[0]  
#         clean_up_csv(instance_fp)
#         for alb in alb_files:
#             name = str(alb['name']).split('/')[-1].split('.')[0]
#             if name == instance_name:
#                 print("fixing ", name , " at ", instance_fp)
#                 fix_partial_result(alb, ex_fp, instance_fp, branch=1)
                
# retry_bad_instances("data/results/bad_bot_100.csv", "data/raw/pkl_datasets/n_100_bottleneck.pkl", "../BBR-for-SALBP1/SALB/SALB/salb") 


In [5]:
test_50, bad_50 = process_results_bad_lb(r"data/results/dummy/inst_res", forgive_missing=True, forgive_lb_missmatch=False)
test_50

processing data/results/dummy/inst_res/n_50_62.csv
processing data/results/dummy/inst_res/n_50_1.csv
processing data/results/dummy/inst_res/n_50_131.csv
processing data/results/dummy/inst_res/n_50_1811.csv


Unnamed: 0,instance,precedence_relation,nodes,no_stations,original_n_precedence_constraints,optimal,cpu,bin_lb
0,n_50_62,,SALBP_original,18,87,1,0.13,17
1,n_50_62,0.0,"['1', '6']",18,87,1,0.05,17
2,n_50_62,1.0,"['1', '8']",18,87,1,0.04,17
3,n_50_62,2.0,"['1', '9']",18,87,1,0.04,17
4,n_50_62,3.0,"['1', '22']",18,87,1,0.04,17
...,...,...,...,...,...,...,...,...
275,n_50_1811,15.0,"['11', '14']",25,71,1,0.00,25
276,n_50_1811,16.0,"['11', '16']",26,71,1,0.03,25
277,n_50_1811,17.0,"['11', '17']",25,71,1,0.00,25
278,n_50_1811,18.0,"['12', '31']",26,71,1,0.09,25


In [4]:
bad_50 = pd.DataFrame(bad_50)
bad_50.to_csv("data/results/dummy/broken_50.csv",index=False)

In [None]:
large_fp =  "data/raw/pkl_datasets/large.pkl"
#large_edge = generate_edge_data(large_fp)
#large_edge.to_csv('data/raw/edge_features/large_edges.csv', index=False)
#prep_data_for_gnn_2(result_csv, graph_data_df_fp, edge_data_df_fp, gnn_dat_out, ml_dat_out, remove_incomplete= True, tolerance=0, obj_col="no_stations")

In [2]:
large_fp= "/home/jot240/DADA/DADA/data/results/Otto/large/large_collated.csv"
graph_data_df_fp = "/home/jot240/DADA/DADA/data/results/Otto/large/graph_features_large.csv"
edge_data_df_fp = "/home/jot240/DADA/DADA/data/results/Otto/large/edge_features_large.csv"
gnn_dat_out =  "/home/jot240/DADA/DADA/pytorch_datasets/large/large_gnn_ready.csv"
ml_dat_out = "/home/jot240/DADA/DADA/data/results/Otto/large/large_ml_ready.csv"
prep_data_for_gnn_2(large_fp, graph_data_df_fp, edge_data_df_fp, gnn_dat_out, ml_dat_out)

res data columns after ub Index(['instance', 'precedence_relation', 'nodes', 'no_stations',
       'original_n_precedence_constraints', 'optimal', 'cpu', 'bin_lb',
       's_orig'],
      dtype='object')
removing:  []
graph_data cols Index(['instance', 'min_div_c', 'max_div_c', 'sum_div_c', 'std_div_c',
       'order_strength', 'average_number_of_immediate_predecessors',
       'max_degree', 'max_in_degree', 'max_out_degree', 'divergence_degree',
       'convergence_degree', 'n_bottlenecks', 'share_of_bottlenecks',
       'avg_degree_of_bottlenecks', 'n_chains', 'avg_chain_length',
       'nodes_in_chains', 'n_stages', 'n_isolated_nodes',
       'share_of_isolated_nodes', 'n_tasks_without_predecessors',
       'share_of_tasks_without_predecessors', 'avg_tasks_per_stage',
       'graph_feature_time'],
      dtype='object')
edge data columns Index(['instance', 'edge', 'idx', 'parent_weight', 'parent_pos_weight',
       'child_weight', 'child_pos_weight', 'neighborhood_min',
       'neigh

Unnamed: 0,instance,precedence_relation,edge,no_stations,n_edges,optimal,cpu,bin_lb,s_orig,min_div_c,...,child_rw_mean_n_unique_nodes,child_rw_mean_walk_length,child_rw_min,child_rw_max,child_rw_mean,child_rw_std,child_rw_n_unique_nodes,child_rw_elapsed_time,min,max
0,instance_n=100_264,0.0,"['1', '10']",15,137,1.0,-1.0,15.0,15,0.021,...,7.2,10.0,30,433,150.509091,61.917797,19,0.218188,15,15
1,instance_n=100_264,1.0,"['2', '11']",15,137,1.0,-1.0,15.0,15,0.021,...,6.2,10.0,40,210,101.654545,41.664992,12,0.218188,15,15
2,instance_n=100_264,2.0,"['3', '69']",15,137,1.0,-1.0,15.0,15,0.021,...,6.4,10.0,31,231,116.600000,57.040847,16,0.218188,15,15
3,instance_n=100_264,3.0,"['3', '75']",15,137,1.0,-1.0,15.0,15,0.021,...,7.6,10.0,33,256,160.872727,56.800786,25,0.218188,15,15
4,instance_n=100_264,4.0,"['4', '13']",15,137,1.0,-1.0,15.0,15,0.021,...,7.2,10.0,30,433,143.618182,78.987108,24,0.218188,15,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85847,instance_n=100_255,151.0,"['92', '97']",14,156,1.0,-1.0,14.0,14,0.022,...,9.2,10.0,24,303,150.127273,75.393044,25,0.442671,14,14
85848,instance_n=100_255,152.0,"['93', '98']",14,156,1.0,-1.0,14.0,14,0.022,...,8.2,10.0,25,276,140.763636,67.893490,25,0.442671,14,14
85849,instance_n=100_255,153.0,"['94', '99']",14,156,1.0,-1.0,14.0,14,0.022,...,6.4,10.0,50,303,120.018182,62.363011,14,0.442671,14,14
85850,instance_n=100_255,154.0,"['95', '98']",14,156,1.0,-1.0,14.0,14,0.022,...,8.2,10.0,25,276,140.763636,67.893490,25,0.442671,14,14


In [7]:
large_res_df = pd.read_csv(large_res)
large_res_df['instance'].nunique()

525

In [13]:
large_ml_ready = pd.read_csv("/home/jot240/DADA/DADA/data/results/Otto/large/large_ml_ready.csv")
large_ml_ready['instance'].nunique()

525