In [2]:

import utils
from Gian_experimental.NSGAIICustom.evolving_genome_threshold import NCSolutionWithGT
from Core.SearchSpace import SearchSpace
from Core.PRef import PRef
from Gian_experimental.NSGAIICustom.testing_in_vitro.SPRef import OptimisedSPref
import json
import numpy as np

temp_vector_path = r"C:\Users\gac8\PycharmProjects\PSSearch\retail_forecasting_data_collection\data\many_hot_vectors_250_random.csv"
temp_fitness_path = r"C:\Users\gac8\PycharmProjects\PSSearch\retail_forecasting_data_collection\data\fitness_250_random.csv"

# One row per session, one integer column per variable.
all_vectors = np.loadtxt(temp_vector_path, delimiter=",", dtype=int)

# Only columns 2, 4 and 6 of the fitness file are used. genfromtxt yields one row per
# session, so the result is transposed to unpack one 1-D array per selected column.
# The prints below are a sanity check that each column is the metric it is expected to be.
fitnesses_for_GA, hits_100, hits_20 = np.genfromtxt(temp_fitness_path, delimiter=",", dtype=float, usecols=(2, 4, 6)).T

print("column 2, expected to be normal fitnesses ", fitnesses_for_GA)
print(f"column 4, expected to be HR 100 (high average = {np.average(hits_100):.3f})", hits_100)
print(f"column 6, expected to be HR 20 (low average = {np.average(hits_20):.3f})", hits_20)


train_test_partitions_by_seed_path = r"C:\Users\gac8\PycharmProjects\PSSearch\retail_forecasting_data_collection\data\test_indices.json"
# Context manager so the file handle is closed as soon as the JSON has been parsed
# (a bare json.load(open(...)) leaves closing to the garbage collector).
with open(train_test_partitions_by_seed_path, "r") as partitions_file:
    train_test_partitions_by_seed = json.load(partitions_file)

print(f"We have partitions for {len(train_test_partitions_by_seed)} seeds")


def construct_temp_spRef(which: str, seed: int) -> OptimisedSPref:
    """Build an OptimisedSPref over the train or the test rows of ``all_vectors`` for one seed.

    The "fitness" stored for each row is that row's index in ``all_vectors`` rather
    than a real score, so callers can use whatever comes back from the sPRef to look
    up any per-row metric array without building a separate pRef per metric.

    :param which: ``"test"`` selects the seed's test rows; any other value selects
        every row that is not in the seed's test rows.
    :param seed: looked up (as a string key) in ``train_test_partitions_by_seed``.
    :return: the OptimisedSPref built from the selected rows.
    """
    held_out_rows = train_test_partitions_by_seed[str(seed)]

    if which != "test":
        # complement of the held-out rows within all row positions
        row_indices = list(set(range(len(all_vectors))) - set(held_out_rows))
    else:
        row_indices = held_out_rows

    binary_space = SearchSpace([2] * 250)
    index_pRef = PRef(fitness_array=np.array(row_indices),
                      full_solution_matrix=all_vectors[row_indices],
                      search_space=binary_space)

    # the intermediate pRef is local, so it is released when this function returns
    return OptimisedSPref.from_pRef(index_pRef)


def _coverage_and_hit_rates(ps, spRef):
    """Return (coverage, mean hits_20, mean hits_100) of ``ps`` on a single sPRef.

    ``spRef.partition`` splits the sPRef into a matched and an unmatched part; the
    matched values are used directly as positions into ``hits_20`` / ``hits_100``
    (presumably the row-index fitnesses stored when the sPRef was built -- confirm
    against ``OptimisedSPref.partition``).

    Coverage is matched / (matched + unmatched). When nothing is matched the hit
    rates are NaN, and when the sPRef is empty the coverage is NaN too, instead of
    emitting numpy's "Mean of empty slice" warning or raising ZeroDivisionError.
    """
    matched, unmatched = spRef.partition(ps, threshold=ps.genome_threshold)

    matched_count = len(matched)
    total_count = matched_count + len(unmatched)
    coverage = matched_count / total_count if total_count else float("nan")

    if matched_count == 0:
        return coverage, float("nan"), float("nan")

    return coverage, np.average(hits_20[matched]), np.average(hits_100[matched])


def get_info_for_individual_ps(ps: NCSolutionWithGT, temp_test_spRef, temp_train_spRef):
    """Evaluate one pattern on the test and the train sessions.

    :param ps: the pattern; its ``genome_threshold`` is forwarded to ``partition``.
    :param temp_test_spRef: sPRef of the test sessions.
    :param temp_train_spRef: sPRef of the train sessions.
    :return: ``(test_coverage, test_HR_20, test_HR_100,
        train_coverage, train_HR_20, train_HR_100)``; a value is NaN when it is
        undefined because no session was matched.
    """
    univ_test, average_test_HR_20, average_test_HR_100 = _coverage_and_hit_rates(ps, temp_test_spRef)
    univ_train, average_train_HR_20, average_train_HR_100 = _coverage_and_hit_rates(ps, temp_train_spRef)

    return univ_test, average_test_HR_20, average_test_HR_100, univ_train, average_train_HR_20, average_train_HR_100
    


column 2, expected to be normal fitnesses  [11.18746567 17.32204247  9.72239876 ...  6.85158873  8.78421211
 10.7397728 ]
column 4, expected to be HR 100 (high average = 0.547) [1. 1. 1. ... 0. 1. 1.]
column 6, expected to be HR 20 (low average = 0.245) [1. 1. 0. ... 0. 1. 1.]
We have partitions for 100 seeds


We will parse some json files in a folder, and we want to make a dataframe with the columns:
* code_name (a string)
* length (an int)
* genome_threshold (an int)
* test_coverage
* test_HR_20
* test_HR_100
* train_coverage
* train_HR_20
* train_HR_100

For every file in results_folder that ends in ".json":
* let seed = data["extra_info"]["seed"]
* let test_sessions = construct_temp_spRef(which = "test", seed=seed)
* let train_sessions = construct_temp_spRef(which = "train", seed=seed)
* read data["data"], which contains many runs with different settings. Iterate over each run
    * each run has a code_name, read it in run["config"]["code_name"]
    * each run has a set of produced outputs, under run["results"]
        * for each output, let ps = NCSolutionWithGT(values=set(output["pattern"]), genome_threshold=output["threshold"])
        * then let length = len(ps)
        * genome_threshold = ps.genome_threshold
        * test_coverage, ..., train_HR_100  = get_info_for_individual_ps(ps, test_sessions, train_sessions)
        * put the relevant information as a row in the dataframe mentioned in the beginning


In [3]:
from tqdm import tqdm
import os
import json
import pandas as pd


def process_results_folder(results_folder):
    """Collect per-pattern train/test metrics from every ``.json`` file in a folder.

    For each result file, the seed is read from ``data["extra_info"]["seed"]`` and
    used to build the test and train sessions once; every output of every run in
    ``data["data"]`` is then turned into an ``NCSolutionWithGT`` and evaluated with
    ``get_info_for_individual_ps``.

    :param results_folder: directory to scan (not recursive); files that do not end
        in ``.json`` are ignored silently.
    :return: a DataFrame with one row per output and the columns ``code_name``,
        ``length``, ``genome_threshold``, ``test_coverage``, ``test_HR_20``,
        ``test_HR_100``, ``train_coverage``, ``train_HR_20``, ``train_HR_100``.
        The columns are present even when no output was found.
    """
    columns = [
        "code_name", "length", "genome_threshold",
        "test_coverage", "test_HR_20", "test_HR_100",
        "train_coverage", "train_HR_20", "train_HR_100",
    ]
    rows = []

    # os.listdir returns entries in arbitrary order; sort so the row order is reproducible
    for filename in sorted(os.listdir(results_folder)):
        if not filename.endswith(".json"):
            continue
        # only announce files that are actually going to be parsed
        print("processing file ", filename)

        filepath = os.path.join(results_folder, filename)
        with open(filepath, 'r') as f:
            data = json.load(f)

        # both session sets depend only on the seed, so build them once per file
        seed = data["extra_info"]["seed"]
        test_sessions = construct_temp_spRef(which="test", seed=seed)
        train_sessions = construct_temp_spRef(which="train", seed=seed)

        for run in data["data"]:
            code_name = run["config"]["code_name"]
            for output in tqdm(run["results"]):
                ps = NCSolutionWithGT(
                    values=set(output["pattern"]),
                    genome_threshold=output["threshold"]
                )

                (
                    test_coverage,
                    test_HR_20,
                    test_HR_100,
                    train_coverage,
                    train_HR_20,
                    train_HR_100
                ) = get_info_for_individual_ps(ps, test_sessions, train_sessions)

                rows.append({
                    "code_name": code_name,
                    "length": len(ps),
                    "genome_threshold": ps.genome_threshold,
                    "test_coverage": test_coverage,
                    "test_HR_20": test_HR_20,
                    "test_HR_100": test_HR_100,
                    "train_coverage": train_coverage,
                    "train_HR_20": train_HR_20,
                    "train_HR_100": train_HR_100,
                })

    # explicit columns keep the schema stable when rows is empty
    return pd.DataFrame(rows, columns=columns)

# Usage
# df = process_results_folder("path/to/results_folder")
# print(df.head())


In [4]:
# Build the per-pattern metrics table from the JSON result files in the baseline folder.
results_folder = r"C:\Users\gac8\PycharmProjects\PSSearch\retail_forecasting_data_collection\local_results_baseline"
results_df = process_results_folder(results_folder)

# `display` is not imported in this notebook; it is the rich-display helper that
# IPython/Jupyter kernels make available, so this line needs a notebook kernel.
display(results_df)

processing file  result_07-03-H12'm'46's42.json


100%|██████████| 9/9 [00:00<00:00, 899.34it/s]


processing file  result_07-03-H12'm'49's59.json


100%|██████████| 15/15 [00:00<00:00, 308.00it/s]


processing file  result_07-03-H12'm'53's06.json


100%|██████████| 12/12 [00:00<00:00, 313.59it/s]


processing file  result_07-03-H12'm'56's33.json


100%|██████████| 7/7 [00:00<00:00, 421.18it/s]


processing file  result_07-03-H12'm'59's28.json


100%|██████████| 12/12 [00:00<00:00, 316.98it/s]


processing file  result_07-03-H13'm'02's48.json


100%|██████████| 10/10 [00:00<00:00, 495.97it/s]


processing file  result_07-03-H13'm'05's31.json


100%|██████████| 8/8 [00:00<00:00, 481.16it/s]


processing file  result_07-03-H13'm'08's33.json


100%|██████████| 12/12 [00:00<00:00, 555.65it/s]


processing file  result_07-03-H13'm'11's55.json


100%|██████████| 10/10 [00:00<00:00, 510.77it/s]


processing file  result_07-03-H13'm'14's49.json


100%|██████████| 6/6 [00:00<00:00, 229.79it/s]


processing file  result_07-03-H13'm'20's00.json


100%|██████████| 7/7 [00:00<00:00, 447.83it/s]


processing file  result_07-03-H13'm'23's06.json


100%|██████████| 11/11 [00:00<00:00, 328.24it/s]


processing file  result_07-03-H13'm'27's37.json


100%|██████████| 13/13 [00:00<00:00, 416.01it/s]


processing file  result_07-03-H13'm'30's33.json


100%|██████████| 13/13 [00:00<00:00, 429.12it/s]


processing file  result_07-03-H13'm'34's06.json


100%|██████████| 15/15 [00:00<00:00, 297.70it/s]


processing file  result_07-03-H13'm'38's16.json


100%|██████████| 12/12 [00:00<00:00, 268.07it/s]


processing file  result_07-03-H13'm'41's28.json


100%|██████████| 6/6 [00:00<00:00, 384.05it/s]


processing file  result_07-03-H13'm'45's29.json


100%|██████████| 11/11 [00:00<00:00, 364.64it/s]


processing file  result_07-03-H13'm'48's56.json


100%|██████████| 9/9 [00:00<00:00, 497.51it/s]


processing file  result_07-03-H13'm'53's18.json


100%|██████████| 8/8 [00:00<00:00, 511.96it/s]


processing file  result_07-03-H13'm'56's46.json


100%|██████████| 8/8 [00:00<00:00, 353.52it/s]


processing file  result_07-03-H13'm'59's39.json


100%|██████████| 14/14 [00:00<00:00, 497.32it/s]


processing file  result_07-03-H14'm'02's34.json


100%|██████████| 13/13 [00:00<00:00, 449.59it/s]


processing file  result_07-03-H14'm'06's18.json


100%|██████████| 16/16 [00:00<00:00, 396.36it/s]


processing file  result_07-03-H14'm'10's21.json


100%|██████████| 13/13 [00:00<00:00, 429.59it/s]


processing file  result_07-03-H14'm'14's03.json


100%|██████████| 10/10 [00:00<00:00, 583.94it/s]


processing file  result_07-03-H14'm'17's22.json


100%|██████████| 11/11 [00:00<00:00, 222.94it/s]


processing file  result_07-03-H14'm'20's26.json


100%|██████████| 14/14 [00:00<00:00, 411.93it/s]


processing file  result_07-03-H14'm'27's24.json


100%|██████████| 12/12 [00:00<00:00, 595.16it/s]


processing file  result_07-03-H14'm'31's35.json


100%|██████████| 10/10 [00:00<00:00, 496.53it/s]


processing file  result_07-03-H14'm'35's09.json


100%|██████████| 12/12 [00:00<00:00, 693.80it/s]


processing file  result_07-03-H14'm'38's24.json


100%|██████████| 15/15 [00:00<00:00, 510.49it/s]


processing file  result_07-03-H14'm'44's29.json


100%|██████████| 12/12 [00:00<00:00, 297.59it/s]


processing file  result_07-03-H14'm'48's03.json


100%|██████████| 9/9 [00:00<00:00, 522.19it/s]


processing file  result_07-03-H14'm'51's27.json


100%|██████████| 12/12 [00:00<00:00, 256.02it/s]


processing file  result_07-03-H14'm'54's57.json


100%|██████████| 9/9 [00:00<00:00, 496.49it/s]


processing file  result_07-03-H14'm'58's49.json


100%|██████████| 14/14 [00:00<00:00, 410.22it/s]


processing file  result_07-03-H15'm'02's40.json


100%|██████████| 6/6 [00:00<00:00, 564.89it/s]


processing file  result_07-03-H15'm'05's50.json


100%|██████████| 12/12 [00:00<00:00, 335.31it/s]


processing file  result_07-03-H15'm'10's04.json


100%|██████████| 10/10 [00:00<00:00, 999.19it/s]


processing file  result_07-03-H15'm'14's03.json


100%|██████████| 11/11 [00:00<00:00, 659.63it/s]


processing file  result_07-03-H15'm'18's10.json


100%|██████████| 13/13 [00:00<00:00, 747.28it/s]


processing file  result_07-03-H15'm'22's03.json


100%|██████████| 7/7 [00:00<00:00, 344.15it/s]


processing file  result_07-03-H15'm'25's30.json


100%|██████████| 7/7 [00:00<00:00, 379.77it/s]


processing file  result_07-03-H15'm'28's56.json


100%|██████████| 13/13 [00:00<00:00, 430.76it/s]


processing file  result_07-03-H15'm'32's26.json


100%|██████████| 9/9 [00:00<00:00, 496.61it/s]


processing file  result_07-03-H15'm'36's07.json


100%|██████████| 12/12 [00:00<00:00, 541.54it/s]


processing file  result_07-03-H15'm'39's34.json


100%|██████████| 9/9 [00:00<00:00, 293.55it/s]


processing file  result_07-03-H15'm'42's55.json


100%|██████████| 10/10 [00:00<00:00, 497.40it/s]


processing file  result_07-03-H15'm'46's53.json


100%|██████████| 9/9 [00:00<00:00, 563.79it/s]


processing file  result_07-03-H15'm'50's05.json


100%|██████████| 10/10 [00:00<00:00, 552.26it/s]


processing file  result_07-03-H15'm'53's17.json


100%|██████████| 11/11 [00:00<00:00, 529.97it/s]


processing file  result_07-03-H15'm'56's13.json


100%|██████████| 12/12 [00:00<00:00, 312.40it/s]


processing file  result_07-03-H15'm'59's52.json


100%|██████████| 13/13 [00:00<00:00, 322.13it/s]


processing file  result_07-03-H16'm'03's12.json


100%|██████████| 10/10 [00:00<00:00, 449.31it/s]


processing file  result_07-03-H16'm'06's59.json


100%|██████████| 13/13 [00:00<00:00, 642.13it/s]


processing file  result_07-03-H16'm'09's58.json


100%|██████████| 7/7 [00:00<00:00, 857.81it/s]


processing file  result_07-03-H16'm'13's37.json


100%|██████████| 9/9 [00:00<00:00, 576.04it/s]


processing file  result_07-03-H16'm'17's22.json


100%|██████████| 13/13 [00:00<00:00, 348.63it/s]


processing file  result_07-03-H16'm'20's29.json


100%|██████████| 11/11 [00:00<00:00, 605.94it/s]


processing file  result_07-03-H16'm'24's04.json


100%|██████████| 11/11 [00:00<00:00, 351.92it/s]


processing file  result_07-03-H16'm'27's44.json


100%|██████████| 7/7 [00:00<00:00, 1393.13it/s]


processing file  result_07-03-H16'm'31's31.json


100%|██████████| 10/10 [00:00<00:00, 561.73it/s]


processing file  result_07-03-H16'm'35's21.json


100%|██████████| 8/8 [00:00<00:00, 511.95it/s]


processing file  result_07-03-H16'm'39's45.json


100%|██████████| 6/6 [00:00<00:00, 595.15it/s]


processing file  result_07-03-H16'm'42's59.json


100%|██████████| 11/11 [00:00<00:00, 271.84it/s]


processing file  result_07-03-H16'm'46's38.json


100%|██████████| 9/9 [00:00<00:00, 475.56it/s]


processing file  result_07-03-H16'm'49's36.json


100%|██████████| 8/8 [00:00<00:00, 441.97it/s]


processing file  result_07-03-H16'm'53's50.json


100%|██████████| 13/13 [00:00<00:00, 396.98it/s]


processing file  result_07-03-H16'm'58's07.json


100%|██████████| 11/11 [00:00<00:00, 339.63it/s]


processing file  result_07-03-H17'm'01's46.json


100%|██████████| 6/6 [00:00<?, ?it/s]


processing file  result_07-03-H17'm'05's08.json


100%|██████████| 14/14 [00:00<00:00, 347.71it/s]


processing file  result_07-03-H17'm'09's41.json


100%|██████████| 9/9 [00:00<00:00, 222.75it/s]


processing file  result_07-03-H17'm'13's06.json


100%|██████████| 6/6 [00:00<00:00, 590.80it/s]


processing file  result_07-03-H17'm'16's54.json


100%|██████████| 9/9 [00:00<00:00, 890.11it/s]


processing file  result_07-03-H17'm'20's08.json


100%|██████████| 9/9 [00:00<00:00, 376.23it/s]


processing file  result_07-03-H17'm'24's03.json


100%|██████████| 8/8 [00:00<00:00, 794.96it/s]


processing file  result_07-03-H17'm'28's23.json


100%|██████████| 8/8 [00:00<00:00, 397.93it/s]


processing file  result_07-03-H17'm'31's15.json


100%|██████████| 12/12 [00:00<00:00, 462.16it/s]


processing file  result_07-03-H17'm'35's31.json


100%|██████████| 7/7 [00:00<00:00, 692.82it/s]


processing file  result_07-03-H17'm'38's56.json


100%|██████████| 8/8 [00:00<00:00, 658.15it/s]


processing file  result_07-03-H17'm'42's21.json


100%|██████████| 10/10 [00:00<00:00, 282.95it/s]


processing file  result_07-03-H17'm'46's31.json


100%|██████████| 5/5 [00:00<00:00, 423.94it/s]


processing file  result_07-03-H17'm'49's47.json


100%|██████████| 13/13 [00:00<00:00, 429.06it/s]


processing file  result_07-03-H17'm'53's44.json


100%|██████████| 13/13 [00:00<00:00, 295.92it/s]


processing file  result_07-03-H17'm'57's19.json


100%|██████████| 9/9 [00:00<00:00, 255.57it/s]


processing file  result_07-03-H18'm'00's57.json


100%|██████████| 7/7 [00:00<00:00, 545.79it/s]


processing file  result_07-03-H18'm'04's55.json


100%|██████████| 9/9 [00:00<00:00, 497.72it/s]


processing file  result_07-03-H18'm'07's53.json


100%|██████████| 10/10 [00:00<00:00, 1063.79it/s]


processing file  result_07-03-H18'm'11's16.json


100%|██████████| 12/12 [00:00<00:00, 393.47it/s]


processing file  result_07-03-H18'm'15's02.json


100%|██████████| 11/11 [00:00<00:00, 694.06it/s]


processing file  result_07-03-H18'm'18's52.json


100%|██████████| 7/7 [00:00<00:00, 869.34it/s]


processing file  result_07-03-H18'm'22's11.json


100%|██████████| 10/10 [00:00<00:00, 317.74it/s]


processing file  result_07-03-H18'm'25's01.json


100%|██████████| 12/12 [00:00<00:00, 452.61it/s]


processing file  result_07-03-H18'm'28's28.json


100%|██████████| 9/9 [00:00<00:00, 297.86it/s]


processing file  result_07-03-H18'm'31's36.json


100%|██████████| 10/10 [00:00<00:00, 354.66it/s]


processing file  result_07-03-H18'm'34's43.json


100%|██████████| 15/15 [00:00<00:00, 391.83it/s]


processing file  result_07-03-H18'm'37's58.json


100%|██████████| 17/17 [00:00<00:00, 561.93it/s]


processing file  result_07-03-H18'm'41's17.json


100%|██████████| 14/14 [00:00<00:00, 277.91it/s]


processing file  result_07-03-H18'm'44's15.json


100%|██████████| 13/13 [00:00<00:00, 637.61it/s]


Unnamed: 0,code_name,length,genome_threshold,test_coverage,test_HR_20,test_HR_100,train_coverage,train_HR_20,train_HR_100
0,[OS OM OC][Len MFit CAtom][GNone],2,,0.025933,0.391941,0.677656,0.026501,0.355735,0.687276
1,[OS OM OC][Len MFit CAtom][GNone],3,,0.028023,0.335593,0.661017,0.028139,0.319831,0.657384
2,[OS OM OC][Len MFit CAtom][GNone],2,,0.031063,0.336391,0.663609,0.033031,0.350827,0.681524
3,[OS OM OC][Len MFit CAtom][GNone],2,,0.023938,0.365079,0.686508,0.026073,0.389800,0.705829
4,[OS OM OC][Len MFit CAtom][GNone],2,,0.026028,0.321168,0.682482,0.027261,0.343206,0.648955
...,...,...,...,...,...,...,...,...,...
1038,[OS OM OC][Len MFit CAtom][GNone],2,,0.023463,0.392713,0.692308,0.024482,0.375364,0.712900
1039,[OS OM OC][Len MFit CAtom][GNone],1,,0.035813,0.368700,0.673740,0.037424,0.349619,0.664975
1040,[OS OM OC][Len MFit CAtom][GNone],2,,0.023653,0.401606,0.682731,0.027142,0.370954,0.704287
1041,[OS OM OC][Len MFit CAtom][GNone],2,,0.025838,0.136029,0.338235,0.024649,0.157996,0.374759


In [5]:
import utils

# Destination for the metrics table; the summary cell further down reads this same file back.
scratch_path = r"C:\Users\gac8\PycharmProjects\PSSearch\retail_forecasting_data_collection\scratch_processing\processed_baseline.csv"


# No index argument is passed, so pandas' default applies and the row index is
# written as an unnamed first column of the CSV.
results_df.to_csv(scratch_path)

* read the data that has been stored as a csv, call it ps_df. 
* the columns of this df are code_name,length,genome_threshold,test_coverage,test_HR_20,test_HR_100,train_coverage,train_HR_20,train_HR_100
* for each codename, find the mean and standard deviation of the length, test_coverage, test_HR_20, test_HR_100, train_coverage,train_HR_20,train_HR_100
* put these results in a table, and save it as a csv where the path will be os.path.join(scratch_processing_dir, "summary_stats_" + utils.get_formatted_timestamp() + ".csv")

In [6]:
import pandas as pd
import os
import utils  # Make sure this is your own module and has get_formatted_timestamp()

scratch_processing_dir = r"C:\Users\gac8\PycharmProjects\PSSearch\retail_forecasting_data_collection\scratch_processing"
data_to_process_path = r"C:\Users\gac8\PycharmProjects\PSSearch\retail_forecasting_data_collection\scratch_processing\processed_baseline.csv"

# Number of runs per setting; used to turn the total number of patterns found
# for a code_name into an average per run.
RUNS_PER_SETTING = 100

# we also need to find how many pss were found for each run. At the same time, we may as well make the latex table


# Manual toggle: set to False to skip regenerating the summary file.
if True:
    # Read the per-pattern metrics written by the processing step
    ps_df = pd.read_csv(data_to_process_path)

    # Columns to compute stats on
    metrics = [
        "length", "test_coverage", "test_HR_20", "test_HR_100",
        "train_coverage", "train_HR_20", "train_HR_100"
    ]

    # Group by 'code_name' and compute mean and std of every metric
    grouped = ps_df.groupby("code_name")[metrics].agg(["mean", "std"])

    # Flatten the (metric, statistic) MultiIndex columns into "<metric>_<statistic>"
    grouped.columns = ['_'.join(col).strip() for col in grouped.columns.values]

    # Reset index to make 'code_name' a column again
    summary_df = grouped.reset_index()

    # Average number of patterns produced per run for each code_name.
    # The counts are matched to the summary rows by code_name (not by position),
    # so the result does not depend on the two tables happening to share an order.
    counts = ps_df['code_name'].value_counts().sort_index()
    summary_df["avg_quantity"] = summary_df["code_name"].map(counts) / RUNS_PER_SETTING

    # Save the summary as CSV under a timestamped name so earlier summaries are kept
    output_path = os.path.join(
        scratch_processing_dir,
        "summary_stats_" + utils.get_formatted_timestamp() + ".csv"
    )
    summary_df.to_csv(output_path, index=False)


In [7]:
import pandas as pd


# Disabled utility (guarded by `if False:`): concatenates the "with gt" and
# "without gt" processed CSVs into a single processed_all.csv. Change the
# condition to True to run it.
if False:
    # Paths to the two input CSV files
    csv_path_1 = r"C:\Users\gac8\PycharmProjects\PSSearch\retail_forecasting_data_collection\scratch_processing\processed_with_gt.csv"
    csv_path_2 = r"C:\Users\gac8\PycharmProjects\PSSearch\retail_forecasting_data_collection\scratch_processing\processed_without_gt.csv"
    
    # Read the CSV files, treating the first column as the index
    df1 = pd.read_csv(csv_path_1, index_col=0)
    df2 = pd.read_csv(csv_path_2, index_col=0)
    
    # Stack the rows of the two DataFrames (axis=0)
    combined_df = pd.concat([df1, df2], axis=0)
    
    # Replace the two original indices with one fresh sequential index
    combined_df = combined_df.reset_index(drop=True)
    
    # Save the result; the new index is written as the first column
    combined_df.to_csv(r"C:\Users\gac8\PycharmProjects\PSSearch\retail_forecasting_data_collection\scratch_processing\processed_all.csv")
