In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import json
from glob import glob
from pathlib import Path
from tqdm import tqdm

In [29]:
base = "data/done-soon/temp/problem_output/"
all_normal_files = glob("data/problem_output/*NORMAL.json")
df = pd.DataFrame()
data = []


# find index of statistics array at certain percent of TL
def find_index_at_percent(stats, percent):
    left = 0
    right = len(stats) - 1
    while left < right:
        mid = (left + right) // 2
        
        if stats[mid] is not None:

            if stats[mid]['search_time'] == percent:
                # Found the first dictionary with target time
                while mid > 0 and stats[mid-1]['search_time'] == percent:
                    mid -= 1
                return mid
            elif stats[mid]['search_time'] < percent:
                left = mid + 1
            else:
                right = mid - 1
        else:
            return -1

    return -1


num_without_search_time = 0
for i, normal in tqdm(enumerate(all_normal_files)):
    mzn = normal[normal.find("MZN-")+4:normal.find("-DZN")] + ".mzn"
    dzn = normal[normal.find("DZN-")+4:normal.find("-OUTPUT")] + ".dzn"

    stats = Path(f"{normal[:-12]}-STATS.json")
    if stats.exists():
        with open(normal, 'r') as normal_output, open(f"{normal[:-12]}-STATS.json", 'r') as stats_output:
            line = normal_output.readline()
            if line: # don't read json from empty output
                
                normal_time = json.loads(line).get('time') # wall time
                stats_all_lines = [json.loads(line).get('statistics') for line in stats_output.readlines()]
                final_statistic = stats_all_lines[-1]
                
                
                
                if normal_time and final_statistic:
                    if "search_time" not in final_statistic.keys():
                            num_without_search_time += 1
                    else:
                            normal_time *= 0.001 # Convert from milliseconds to seconds

                            if normal_time <= 10:
                                continue

                            # To avoid loading in too much data into memory, only load the ones at certain percentages
                            # specifically, every half percent intervals
                            statistics_per_half_percent = []
                            for percent in range(1, 200):
                                percent /= 2
                                

                                # normal_time is in seconds, so this is percentage of two hours
                                wall_clock_time_at_percent = (60 * 60 * 2) * percent/100  

                                if wall_clock_time_at_percent >= final_statistic['search_time']: # no more data :(
                                    break 

                                statistics_per_half_percent  += [
                                    stats_all_lines[find_index_at_percent(stats_all_lines, wall_clock_time_at_percent)]
                                ]



                            data.append({
                                'normal_time': normal_time * 0.001,
                                'stat_time': final_statistic['search_time'],
                                'problem': normal,
                                'statistics': statistics_per_half_percent,
                                'mzn': mzn,
                                'dzn': dzn
                            })

df = pd.DataFrame(data)
all_data = df
all_data.shape

16814it [03:40, 76.23it/s] 


(1132, 6)

In [34]:
df

Unnamed: 0,normal_time,stat_time,problem,statistics,mzn,dzn
0,7.200882,7199.268198,data/problem_output/PROB-city-position-MZN-cit...,"[{'conflicts': 389380, 'ewma_conflicts': 38936...",city-position.mzn,city-7-04.dzn
1,7.210505,7121.926723,data/problem_output/PROB-2DBinPacking-MZN-2DPa...,"[{'conflicts': 631941, 'ewma_conflicts': 63192...",2DPacking.mzn,Class5_80_8.dzn
2,0.071478,213.285878,data/problem_output/PROB-amaze-MZN-amaze2-DZN-...,"[{'conflicts': 269053, 'ewma_conflicts': 26903...",amaze2.mzn,2012-07-05.dzn
3,7.201437,3512.053385,data/problem_output/PROB-ship-schedule-MZN-shi...,"[{'conflicts': 870438, 'ewma_conflicts': 87041...",ship-schedule.mip.mzn,8ShipsUnconst.dzn
4,7.201694,7199.867735,data/problem_output/PROB-miplib-MZN-ns4-pr6-DZ...,"[{'conflicts': 1141428, 'ewma_conflicts': 1141...",ns4-pr6.mzn,NO-MODEL-FILE.dzn
...,...,...,...,...,...,...
1127,0.010501,42.884265,data/problem_output/PROB-wwtpp-real-MZN-wwtpp-...,"[{'conflicts': 28032, 'ewma_conflicts': 28013....",wwtpp.mzn,ex02400_2400_100.dzn
1128,0.018754,229.142342,data/problem_output/PROB-rcpsp-wet-MZN-rcpsp-w...,"[{'conflicts': 350115, 'ewma_conflicts': 35009...",rcpsp-wet.mzn,j30_1_3-wet.dzn
1129,0.011433,45.670814,data/problem_output/PROB-2DBinPacking-MZN-2DPa...,"[{'conflicts': 91212, 'ewma_conflicts': 91193....",2DPacking.mzn,Class4_40_7.dzn
1130,0.016353,77.763970,data/problem_output/PROB-mrcpsp-MZN-mrcpsp-DZN...,"[{'conflicts': 204350, 'ewma_conflicts': 20433...",mrcpsp.mzn,j30_46_4.dzn


In [31]:
def cleanup(df):
    del df["decision_level_sat"]
    del df["ewma_decision_level_mip"]
    del df["decision_level_mip"]
#     del df["best_objective"]
#     df["unassnVar"]   = (2**df['vars']) - df['opennodes']
#     df["fracFailUnassn"]     = df['conflicts'] / df['unassnVar']         # num failures/ num open nodes
    df["fracOpenVisit"]  = (df['vars'] - df['opennodes']) / df['opennodes']       # ratio of open nodes to visited nodes (how much of soln space explored)
    df["fracBoolVars"]     = df['boolVars'] / df['vars']                 # num bools / total num of vars
    df["fracPropVars"]     = df['propagations'] / df['vars']        # num propagations/ total num of vars
#     df["frac_unassigned"] = df['unassnVar'] / df['vars']  # current assignments/ total vars
    df["fracLongClauses"] = df['long'] + df['bin'] + df['tern']         # fraction of learnt clauses that have more than 3 literals
    df["freqBackjumps"]  = df['back_jumps']/df['search_time']
    return df


def gradients(df_prev, df_curr):
    keys=['conflicts','ewma_conflicts','decisions','search_iterations','opennodes','ewma_opennodes',
          'vars','back_jumps','ewma_back_jumps','solutions','total_time','search_time','intVars',
          'propagations','sat_propagations','ewma_propagations','propagators','boolVars','learnt',
          'bin','tern','long','peak_depth','decision_level_engine','ewma_decision_level_engine',
          'decision_level_treesize','clause_mem','prop_mem','ewma_best_objective',
          'fracOpenVisit','fracBoolVars','fracPropVars','freqBackjumps', 'best_objective']
    for i in keys:
        df_curr[i+'_gradient']=(df_curr[i]-df_prev[i])/0.05*7200
    return df_curr

KeyError: 'search_time'

In [None]:
features_at_percent = {}


for i in tqdm(range(1,200)):
    df_i=[]
    for id, problem in all_data_filtered.iterrows():
        time_percent = (i / 2) * 7200
        index = find_index_at_percent(problem.statistics, time_percent)
        p = problem.statistics[index]
        # for index, p in enumerate(problem.statistics):
        #     if index == i:
        new_p = dict(p)
        new_p=cleanup(new_p)
        if i!=1:
            new_p=gradients(df_prev.loc[id], new_p)
        new_p['mzn'] = problem['mzn']
        new_p['dzn'] = problem['dzn']
        new_p['solved_within_time_limit'] = problem['normal_time'] < 7199 * 1000 \
        or np.logical_not(np.isnan(problem['normal_time']))
        df_i.append((id, new_p))

    df_i = pd.DataFrame([a[1] for a in df_i], index=[a[0] for a in df_i])
    df_i=df_i.fillna(value = 0)
    if i!=0:   
        features_at_percent[i]=df_i
    df_prev=df_i

