# Evaluate Models 

- enumerative search
- neural guided search
- CodeT5
- LibT5

In [1]:
import datetime
import os
import random
import pandas as pd
import numpy as np
import operator
import random
from tqdm import tqdm

try:
    import binutil  # required to import from dreamcoder modules
except ModuleNotFoundError:
    import bin.binutil  # alt import if called as module

from dreamcoder.task import Task
from dreamcoder.dreamcoder import *
from dreamcoder.domains.minigrid.primitives import basePrimitives, tmap, taction, idx_to_action, tdirection
from dreamcoder.grammar import Grammar
from dreamcoder.utilities import testTrainSplit, eprint, numberOfCPUs
from dreamcoder.type import arrow
from dreamcoder.domains.minigrid.nn_model_maze import *
from dreamcoder.dreamcoder import commandlineArguments
from dreamcoder.utilities import numberOfCPUs
import transformers
from transformers import RobertaTokenizer, T5ForConditionalGeneration, AutoTokenizer, TrainingArguments, Seq2SeqTrainer
from bin.maze_T5 import parseData, all_equal
# Build a uniform grammar over the base minigrid primitives. The result is not
# bound to a name, so this line only matters for whatever side effects
# basePrimitives() / Grammar.uniform() have.
# NOTE(review): presumably needed so the primitives are registered before a
# checkpoint is unpickled — confirm.
Grammar.uniform(basePrimitives())
# Set the Weights & Biases project name through its environment variable.
os.environ["WANDB_PROJECT"] = "T5-Minigrid-Maze"

def makeTasks(data, chunkSize):
    """Cut every group of `data` into imitation tasks of roughly `chunkSize` examples.

    Args:
        data: grouped data offering `.groups` and `.get_group(key)`; the rows of
            each group are read through `obs`, `'obs direction'` and `action`.
        chunkSize: minimum number of examples collected before a task may be
            emitted. With `chunkSize <= 0` no task is ever emitted.

    Returns:
        list of Task objects with request `arrow(tmap, tdirection, taction)`.

    Each example is `((obs as nested int list, direction), action)`. A chunk is
    only turned into a task once its actions are not all equal; until then it
    keeps growing beyond `chunkSize`. Examples still pending when a group ends
    are dropped.
    """
    group_keys = data.groups.keys()
    print('keys:', len(group_keys))

    tasks = []
    for group_key in group_keys:
        group_rows = data.get_group(group_key)
        part_idx = 0
        chunk = []
        for _, step in group_rows.iterrows():
            observation = step.obs.astype(int).tolist()
            direction = int(step['obs direction'])
            chunk.append(((observation, direction), int(step.action)))

            # Not enough examples collected yet (or chunking disabled).
            if chunkSize <= 0 or chunkSize > len(chunk):
                continue

            # If every chosen action is identical the task is trivially solved
            # by a constant program such as (lambda (lambda forward-action)),
            # so keep collecting examples until the actions differ.
            chosen_actions = tuple(action for _, action in chunk)
            if all_equal(chosen_actions):
                continue

            task_name = f'perfect maze {group_key} size {chunkSize} part {part_idx}'
            tasks.append(Task(task_name, arrow(tmap, tdirection, taction), chunk))
            part_idx += 1
            # Start a fresh chunk for the next task of this group.
            chunk = []

    print(f'Created {len(tasks)} tasks with {chunkSize} chunk size')
    return tasks

In [2]:
# Recorded data (.npy) that is read with np.load and handed to parseData.
# NOTE(review): absolute, user-specific path — adjust it for your environment.
data_file = "/home/ma/e/eberhardinger/workspaces/ec/dreamcoder/domains/perfect-maze-minigrid/collected_data/2022-12-10T15:26:33.798573.npy"

def evaluate_enumerative_search(testingTasks, path):
    """Evaluate a saved checkpoint on `testingTasks` with enumerative search.

    Args:
        testingTasks: tasks that are passed on to `evaluateOnTestingTasks`.
        path: file path of the pickled checkpoint; it is loaded with `dill`.

    Returns:
        Whatever `evaluateOnTestingTasks` returns for the given tasks.

    The grammar used for enumeration is the last grammar stored in the
    checkpoint. If the checkpoint holds no grammar, a uniform grammar over the
    base primitives is used instead (the original code referenced an
    unassigned local `grammar` in that case and raised UnboundLocalError).
    """
    # SECURITY: unpickling executes code embedded in the file — only load
    # checkpoints from a trusted source.
    # NOTE(review): `dill` is not imported explicitly in this notebook;
    # presumably it is provided by `from dreamcoder.dreamcoder import *`.
    with open(path, "rb") as handle:
        result = dill.load(handle)
    eprint("Loaded checkpoint from", path)

    if result.grammars:
        grammar = result.grammars[-1]
    else:
        grammar = Grammar.uniform(basePrimitives())

    args = commandlineArguments(
        enumerationTimeout=720,
        structurePenalty=1.5,
        recognitionSteps=5000,
        biasOptimal=False,
        contextual=False,
        a=3,
        topK=5,
        iterations=1,
        useRecognitionModel=True,
        helmholtzRatio=0.5,
        featureExtractor=MinigridMazeFeatureExtractor,
        maximumFrontier=10,
        CPUs=numberOfCPUs(),
        pseudoCounts=30.0,
        extras=None)
    # NOTE(review): evaluationTimeout is fed with the *enumeration* timeout —
    # confirm that this is intended.
    times = evaluateOnTestingTasks(result, testingTasks, grammar,
                           CPUs=args.get('CPUs'), maximumFrontier=args.get('maximumFrontier'),
                           solver=args.get('solver'),
                           enumerationTimeout=args.get('enumerationTimeout'), evaluationTimeout=args.get('enumerationTimeout'))

    return times

def evaluate_T5(testingTasks, path, no_spaces=True, compress=False):
    """Convert `testingTasks` into T5 test data (unfinished stub).

    Only the conversion through `createTestDataFromTasks` is implemented. The
    model is not loaded yet, `path` is not used, and the function returns None
    implicitly.
    """
    test_data = createTestDataFromTasks(
        testingTasks,
        True,
        no_spaces=no_spaces,
        compress=compress,
    )
    # TODO: load model

def evaluate_model(data_file, path, method, results_path, sequence_lengths=None):
    """Run `method` on tasks of several chunk sizes and record the results.

    Args:
        data_file: path of the .npy file that is loaded and given to parseData.
        path: forwarded unchanged to `method` as its second argument.
        method: callable `method(tasks, path)`; its return value is stored in
            the `solved` column.
        results_path: CSV file that is rewritten after every chunk size.
        sequence_lengths: iterable of chunk sizes to evaluate. Defaults to
            `range(5, 31)`, the values that were previously hard-coded.

    Returns:
        pandas.DataFrame indexed by chunk size with the columns `solved` and
        `tasks` (number of tasks created for that chunk size). The frame is
        empty if `sequence_lengths` yields nothing.
    """
    if sequence_lengths is None:
        sequence_lengths = range(5, 31)

    # NOTE: allow_pickle=True unpickles objects stored in the file — only load
    # data files from a trusted source.
    data = np.load(data_file, allow_pickle=True)
    parsed_data = parseData(data)

    solved_tasks = []
    idx = []
    # Defined up front so that an empty `sequence_lengths` still returns a frame.
    df = pd.DataFrame(columns=['solved', 'tasks'])
    for chunk_size in sequence_lengths:
        tasks = makeTasks(parsed_data, chunk_size)
        hits = method(tasks, path)
        solved_tasks.append({
            'solved': hits,
            'tasks': len(tasks)
        })
        idx.append(chunk_size)
        # Persist after every chunk size so partial results survive an
        # interrupted run.
        df = pd.DataFrame(solved_tasks, index=idx)
        df.to_csv(results_path)
    return df

In [None]:
# Pickled checkpoint that evaluate_enumerative_search loads.
path = '../../../experimentOutputs/perfect-maze/2023-03-01T21:33:58.380983/maze_aic=1.0_arity=3_ET=720_it=39_MF=10_noConsolidation=False_pc=30.0_RS=10000_RW=False_solver=ocaml_STM=True_L=1.5_TRR=default_K=5_topkNotMAP=False_rec=False.pickle'
# Evaluate enumerative search for every chunk size; results are written to the CSV.
# NOTE(review): the CSV path is absolute and user-specific — adjust it for your environment.
df = evaluate_model(data_file, path, evaluate_enumerative_search, '/home/ma/e/eberhardinger/workspaces/ec/eval_enum_search.csv')

(1721, 6)
keys: 18
Created 320 tasks with 5 chunk size


Loaded checkpoint from ../../../experimentOutputs/perfect-maze/2023-03-01T21:33:58.380983/maze_aic=1.0_arity=3_ET=720_it=39_MF=10_noConsolidation=False_pc=30.0_RS=10000_RW=False_solver=ocaml_STM=True_L=1.5_TRR=default_K=5_topkNotMAP=False_rec=False.pickle


recogModel None
CUDA is available?: True
using cuda?: True


(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
0.000000, 1.500000, 720.000000s
	(ocaml: 1 CPUs. shatter: 1. |fringe| = 1. |finished| = 0.)
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
0.000000, 1.500000, 720.000000s
	(ocaml: 1 CPUs. shatter: 1. |fringe| = 1. |finished| = 0.)
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
0.000000, 1.

keys: 18
Created 274 tasks with 6 chunk size


Loaded checkpoint from ../../../experimentOutputs/perfect-maze/2023-03-01T21:33:58.380983/maze_aic=1.0_arity=3_ET=720_it=39_MF=10_noConsolidation=False_pc=30.0_RS=10000_RW=False_solver=ocaml_STM=True_L=1.5_TRR=default_K=5_topkNotMAP=False_rec=False.pickle


recogModel None
CUDA is available?: True
using cuda?: True


(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.0.000000, 1.500000, 720.000000s
	(ocaml: 1 CPUs. shatter: 1. |fringe| = 1. |finished| = 0.)

(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
0.000000, 1.500000, 720.000000s
	(ocaml: 1 CPUs. shatter: 1. |fringe| = 1. |finished| = 0.)
0.000000, 1.

keys: 18
Created 237 tasks with 7 chunk size


Loaded checkpoint from ../../../experimentOutputs/perfect-maze/2023-03-01T21:33:58.380983/maze_aic=1.0_arity=3_ET=720_it=39_MF=10_noConsolidation=False_pc=30.0_RS=10000_RW=False_solver=ocaml_STM=True_L=1.5_TRR=default_K=5_topkNotMAP=False_rec=False.pickle


recogModel None
CUDA is available?: True
using cuda?: True


(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
0.000000, 1.500000, 720.000000s
	(ocaml: 1 CPUs. shatter: 1. |fringe| = 1. |finished| = 0.)
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
0.000000, 1.500000, 720.000000s
	(ocaml: 1 CPUs. shatter: 1. |fringe| = 1. |finished| = 0.)
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
0.000000, 1.

keys: 18
Created 206 tasks with 8 chunk size


Loaded checkpoint from ../../../experimentOutputs/perfect-maze/2023-03-01T21:33:58.380983/maze_aic=1.0_arity=3_ET=720_it=39_MF=10_noConsolidation=False_pc=30.0_RS=10000_RW=False_solver=ocaml_STM=True_L=1.5_TRR=default_K=5_topkNotMAP=False_rec=False.pickle


recogModel None
CUDA is available?: True
using cuda?: True


(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
0.000000, 1.500000, 720.000000s
	(ocaml: 1 CPUs. shatter: 1. |fringe| = 1. |finished| = 0.)
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
0.000000, 1.500000, 720.000000s
	(ocaml: 1 CPUs. shatter: 1. |fringe| = 1. |finished| = 0.)
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
0.000000, 1.

keys: 18
Created 181 tasks with 9 chunk size


Loaded checkpoint from ../../../experimentOutputs/perfect-maze/2023-03-01T21:33:58.380983/maze_aic=1.0_arity=3_ET=720_it=39_MF=10_noConsolidation=False_pc=30.0_RS=10000_RW=False_solver=ocaml_STM=True_L=1.5_TRR=default_K=5_topkNotMAP=False_rec=False.pickle


recogModel None
CUDA is available?: True
using cuda?: True


(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
0.000000, 1.500000, 720.000000s
	(ocaml: 1 CPUs. shatter: 1. |fringe| = 1. |finished| = 0.)
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 1.500000. Timeout 720.000000.
(frontend) Launching array(array(tobj)) -> tdirection -> taction (1 tasks) w/ 1 CPUs. 0.000000 <= MDL < 

In [None]:
# Read back the CSV that evaluate_model writes during the evaluation run.
pd.read_csv('/home/ma/e/eberhardinger/workspaces/ec/eval_enum_search.csv') 

In [None]:
# Display the DataFrame returned by evaluate_model.
df

In [3]:
# Load and parse the recorded data for manual inspection (the same two steps
# that evaluate_model performs).
# NOTE: allow_pickle=True unpickles objects stored in the file — only load
# trusted data files.
data = np.load(data_file, allow_pickle=True)
parsed_data = parseData(data)

(1721, 6)


In [5]:
# Build tasks with a chunk size of 10 for manual inspection.
tasks = makeTasks(parsed_data, 10)

keys: 18
Created 163 tasks with 10 chunk size


In [12]:
# Scratch loop over the examples of the first task. `inp` and `out` are
# overwritten in every iteration, so afterwards they hold the values of the
# last example only.
for examples in tasks[0].examples:
    inp = str(examples[0])
    out = examples[1]
    # TODO: not active yet — conversion of an example into input/output strings.
    #inp_string = get_inp_string_for_task(inp, no_spaces=no_spaces, compress=compress)
    #out_string = get_out_string(out)

In [13]:
# Show the stringified input of the last example visited by the previous cell.
print(inp)

([[1, 2, 1, 1, 1], [1, 2, 1, 2, 2], [1, 1, 1, 2, 1], [2, 2, 1, 2, 1], [1, 1, 1, 2, 1]], 1)
