In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import json
from glob import glob
from pathlib import Path
from tqdm import tqdm

In [29]:
base = "data/done-soon/temp/problem_output/"
all_normal_files = glob("data/problem_output/*NORMAL.json")
df = pd.DataFrame()
data = []


# find index of statistics array at certain percent of TL
def find_index_at_percent(stats, percent):
    left = 0
    right = len(stats) - 1
    while left < right:
        mid = (left + right) // 2
        
        if stats[mid] is not None:

            if stats[mid]['search_time'] == percent:
                # Found the first dictionary with target time
                while mid > 0 and stats[mid-1]['search_time'] == percent:
                    mid -= 1
                return mid
            elif stats[mid]['search_time'] < percent:
                left = mid + 1
            else:
                right = mid - 1
        else:
            return -1

    return -1


num_without_search_time = 0
for i, normal in tqdm(enumerate(all_normal_files)):
    mzn = normal[normal.find("MZN-")+4:normal.find("-DZN")] + ".mzn"
    dzn = normal[normal.find("DZN-")+4:normal.find("-OUTPUT")] + ".dzn"

    stats = Path(f"{normal[:-12]}-STATS.json")
    if stats.exists():
        with open(normal, 'r') as normal_output, open(f"{normal[:-12]}-STATS.json", 'r') as stats_output:
            line = normal_output.readline()
            if line: # don't read json from empty output
                
                normal_time = json.loads(line).get('time') # wall time
                stats_all_lines = [json.loads(line).get('statistics') for line in stats_output.readlines()]
                final_statistic = stats_all_lines[-1]
                
                
                
                if normal_time and final_statistic:
                    if "search_time" not in final_statistic.keys():
                            num_without_search_time += 1
                    else:
                            normal_time *= 0.001 # Convert from milliseconds to seconds

                            if normal_time <= 10:
                                continue

                            # To avoid loading in too much data into memory, only load the ones at certain percentages
                            # specifically, every half percent intervals
                            statistics_per_half_percent = []
                            for percent in range(1, 200):
                                percent /= 2
                                

                                # normal_time is in seconds, so this is percentage of two hours
                                wall_clock_time_at_percent = (60 * 60 * 2) * percent/100  

                                if wall_clock_time_at_percent >= final_statistic['search_time']: # no more data :(
                                    break 

                                statistics_per_half_percent  += [
                                    stats_all_lines[find_index_at_percent(stats_all_lines, wall_clock_time_at_percent)]
                                ]



                            data.append({
                                'normal_time': normal_time,
                                'stat_time': final_statistic['search_time'],
                                'problem': normal,
                                'statistics': statistics_per_half_percent,
                                'mzn': mzn,
                                'dzn': dzn
                            })

df = pd.DataFrame(data)
all_data = df
all_data.shape

16814it [03:42, 75.50it/s] 


(1132, 6)

In [30]:
df[df["normal_time"]*1000 > 3600].shape

(1132, 6)

In [31]:
log_axes = False
log_fig = False
scatter = px.scatter(
    df,
    x='normal_time',
    y='stat_time',
    labels={'x': 'Normal chuffed runtime', 'y': 'Modded chuffed runtime'},
    title="Runtime of Normal Chuffed vs. Modded Chuffed",
    log_x=log_axes,
    log_y=log_axes,
    hover_data='problem'
    # trendline='ols',
    # trendline_options=dict(log_x=fit_log, log_y=fit_log),
)
scatter.layout.update(showlegend=False)
scatter

In [33]:
df[df['normal_time'] > 3600].shape

(502, 6)

In [19]:
base = "data/done-soon/temp/problem_output/"
all_normal_files = glob("data/problem_output/*-black-hole*-18-*NORMAL.json")
xs = []
ys = []
stat_times = []

for i, normal in enumerate(all_normal_files):
    stats = Path(f"{normal[:-12]}-STATS.json")
    if stats.exists():
        with open(normal, 'r') as normal_output, open(f"{normal[:-12]}-STATS.json", 'r') as stats_output:
            normal_time = json.loads(normal_output.readline()).get('time')
            stats_lines = stats_output.readlines()
            stats = json.loads(stats_lines[-1]).get('statistics')
            
            if normal_time and stats:
                if normal_time * 0.001 > 10:
                    xs.append(normal_time * 0.001)
                    ys.append(stats['search_time'])
                for line in stats_lines:
                    stats = json.loads(line).get('statistics')
                    if stats:
                        stat_times.append(stats['search_time'])

                    


In [34]:
px.scatter(y=stat_times, labels={"x": "data point", "y": "search_time value"})

In [35]:
import plotly.graph_objects as go

In [36]:
base = "data/benchmarks/solve/"
all_normal_files = glob(f"{base}/*NORMAL.tsv")
data = []


for i, normal in enumerate(all_normal_files):
    stats = Path(f"{normal[:-11]}-STATS.tsv")
    if stats.exists():
        with open(normal, 'r') as normal_output, open(stats, 'r') as stats_output:
            normal_time = float(normal_output.readlines()[-1].split('\t')[0])
            stat_time = float(stats_output.readlines()[-1].split('\t')[0])

            if normal_time and stats:
                data.append({
                    'normal_time': normal_time,
                    'stat_time': stat_time,
                    'problem': normal
                })
df_bench = pd.DataFrame(data)
df_bench.shape

(14685, 3)

In [37]:
df_bench_filtered = df_bench  # [df_bench['stat_time'] < 7100]

In [38]:
lower = df_bench_filtered[(df_bench_filtered["normal_time"] > 10) & (df_bench_filtered["normal_time"] < 3600)].shape
above = df_bench_filtered[df_bench_filtered["normal_time"] > 3600].shape
lower[0], above[0]

(700, 480)

In [39]:
show_log = True
fit_log = True
fig = px.scatter(
    df_bench_filtered,
    x='normal_time',
    y='stat_time',
    trendline='ols',
    log_x=show_log,
    log_y=show_log,
    trendline_options=dict(log_x=fit_log, log_y=fit_log),
    width=1000,
    height=1000
)
results = px.get_trendline_results(fig)
# fig.add_trace(px.line(x=[0, 7200], y=[0, 7200]).data[0])
# fig['data'][2]['line']['color']='rgb(255, 0, 0)'
# fig.update_yaxes(
#     range=[-300,7500]
# )
# fig.update_xaxes(
#     range=[-300,7500]
# )
fig

In [40]:
results.px_fit_results.iloc[0].summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.914
Model:,OLS,Adj. R-squared:,0.914
Method:,Least Squares,F-statistic:,155700.0
Date:,"Sat, 22 Apr 2023",Prob (F-statistic):,0.0
Time:,05:00:50,Log-Likelihood:,711.31
No. Observations:,14685,AIC:,-1419.0
Df Residuals:,14683,BIC:,-1403.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0747,0.002,33.710,0.000,0.070,0.079
x1,1.0258,0.003,394.539,0.000,1.021,1.031

0,1,2,3
Omnibus:,4221.714,Durbin-Watson:,1.638
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27134.709
Skew:,1.224,Prob(JB):,0.0
Kurtosis:,9.193,Cond. No.,1.8
