# Evaluate Models 

- enumerative search
- neural guided search
- CodeT5
- LibT5

In [1]:
import datetime
import os
import random
import pandas as pd
import numpy as np
import operator
import random
from tqdm import tqdm
import dill
try:
    import binutil  # required to import from dreamcoder modules
except ModuleNotFoundError:
    import bin.binutil  # alt import if called as module

from dreamcoder.task import Task
from dreamcoder.dreamcoder import *
from dreamcoder.domains.minigrid.primitives import basePrimitives, tmap, taction, idx_to_action, tdirection
from dreamcoder.grammar import Grammar
from dreamcoder.utilities import testTrainSplit, eprint, numberOfCPUs
from dreamcoder.type import arrow
from dreamcoder.domains.minigrid.nn_model_maze import *
from dreamcoder.dreamcoder import commandlineArguments
from dreamcoder.utilities import numberOfCPUs
import transformers
from transformers import RobertaTokenizer, T5ForConditionalGeneration, AutoTokenizer, TrainingArguments, Seq2SeqTrainer
from bin.maze_T5 import parseData, all_equal, createTestDataFromTasks, get_latest_checkpoint_path, LookupTableCollator, run_on_input_examples
# NOTE(review): the value returned by Grammar.uniform(...) is discarded here,
# so this line only matters for whatever side effects the two calls have
# (presumably registering the base primitives) -- confirm.
Grammar.uniform(basePrimitives())
# Set the WANDB_PROJECT environment variable for this process.
os.environ["WANDB_PROJECT"] = "T5-Minigrid-Maze"

def makeTasks(data, chunkSize):
    """Split every group of ``data`` into imitation tasks of ``chunkSize`` examples.

    Args:
        data: Object exposing ``groups`` and ``get_group`` whose groups yield
            rows with ``obs``, ``obs direction`` and ``action`` fields.
        chunkSize: Minimum number of examples collected before a task is
            emitted. With a value <= 0 no task is ever created.

    Returns:
        list: The created ``Task`` objects.
    """
    group_keys = data.groups.keys()
    print('keys:', len(group_keys))
    tasks = []
    for key in group_keys:
        pending = []
        part = 0
        for _, row in data.get_group(key).iterrows():
            observation = row.obs.astype(int).tolist()
            direction = int(row['obs direction'])
            pending.append(((observation, direction), int(row.action)))

            # Not enough examples collected yet for a chunk.
            if chunkSize <= 0 or len(pending) < chunkSize:
                continue

            # Chunks whose actions are all identical are too easy: they are
            # solved by programs such as (lambda (lambda forward-action)).
            # Such a chunk is not emitted; it keeps growing until the actions
            # differ.
            chosen_actions = tuple(action for _, action in pending)
            if all_equal(chosen_actions):
                continue

            tasks.append(Task(f'perfect maze {key} size {chunkSize} part {part}',
                              arrow(tmap, tdirection, taction), pending))
            part += 1
            # Start collecting the next chunk from scratch.
            pending = []

    print(f'Created {len(tasks)} tasks with {chunkSize} chunk size')
    return tasks

In [2]:
# Absolute path of the collected .npy data file; it is passed to
# evaluate_model below, which loads it with np.load(..., allow_pickle=True).
data_file = "/home/ma/e/eberhardinger/workspaces/ec/dreamcoder/domains/perfect-maze-minigrid/collected_data/2022-12-10T15:26:33.798573.npy"

def generate_samples_with_temp(model, tokenizer, collator, txt, n_samples, temp):
    """Sample ``n_samples`` decoded generations for one input at temperature ``temp``.

    Args:
        model: Object exposing ``generate``; if it has a ``device`` attribute
            the encoded inputs are moved there.
        tokenizer: Object exposing ``batch_decode``.
        collator: Object exposing ``encode_obs``; its result must support ``.to``.
        txt: The single input that is repeated ``n_samples`` times.
        n_samples: Number of copies of ``txt`` in the batch.
        temp: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The result of ``tokenizer.batch_decode`` with special tokens skipped.
    """
    batch = [txt] * n_samples
    # Put the inputs on the device the model lives on instead of a hard-coded
    # 'cuda', so model and inputs cannot end up on different devices. Models
    # without a ``device`` attribute keep the previous behaviour.
    device = getattr(model, 'device', 'cuda')
    input_ids = collator.encode_obs(batch).to(device)
    outputs = model.generate(input_ids, do_sample=True, max_length=128,
                             temperature=temp)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

def test_programs_on_task(model, tokenizer, collator, task, grammar, n=5, temp=1.0, verbose=False):
    """Sample ``n`` programs for ``task`` and return the best one that is accepted.

    ``task[0]`` is the input handed to the sampler and ``task[1]`` the examples
    handed to ``run_on_input_examples``. A sample counts as accepted when
    ``run_on_input_examples`` returns something other than ``None``; that value
    is used as its score.

    Returns:
        tuple: ``(parsed_program, score)`` for the highest-scoring accepted
        sample, or ``(None, -1)`` when no sample was accepted.
    """
    candidates = generate_samples_with_temp(model, tokenizer, collator, task[0], n, temp)
    accepted = []
    for source in candidates:
        if verbose:
            eprint(source)
        log_prior = run_on_input_examples(task[1], source, grammar, verbose=verbose)
        if log_prior is None:
            continue
        accepted.append((Program.parse(source), log_prior))

    if not accepted:
        return None, -1

    # Highest score first; ties keep their sampling order.
    ranked = sorted(accepted, key=operator.itemgetter(1), reverse=True)
    best_program, best_log_prior = ranked[0]
    return best_program, best_log_prior

def check_test_tasks(model, tokenizer, collator, testTasks, grammar, n_sampling=100, verbose=False):
    """Run ``test_programs_on_task`` on every test task, showing a running solve rate.

    Returns:
        list: One ``(program, task)`` pair per task, where ``program`` is
        ``None`` for tasks without an accepted sample.
    """
    stats = []
    solved = 0
    pbar = tqdm(testTasks)
    for processed, tt in enumerate(pbar, start=1):
        program, _ = test_programs_on_task(model, tokenizer, collator, tt, grammar,
                                           n=n_sampling, verbose=verbose)
        stats.append((program, tt))
        if program is not None:
            solved += 1
        # Progress-bar prefix: solved tasks over tasks processed so far.
        pbar.set_description(f"Rate {solved}/{processed}")
    return stats

def evaluate_enumerative_search(testingTasks, path):
    """Evaluate a pickled checkpoint on ``testingTasks`` with enumerative search.

    Args:
        testingTasks: Tasks forwarded to ``evaluateOnTestingTasks``.
        path: Path of the checkpoint file; it is opened in binary mode and
            deserialised with ``dill``.

    Returns:
        Whatever ``evaluateOnTestingTasks`` returns (bound to ``times`` here).
    """
    # NOTE(security): dill.load can execute arbitrary code; only load
    # checkpoints from trusted sources.
    with open(path, "rb") as handle:
        result = dill.load(handle)
    eprint("Loaded checkpoint from", path)

    # Use the last grammar stored in the checkpoint. The original fallback read
    # the local ``grammar`` before it was assigned, which raises
    # UnboundLocalError for a checkpoint without grammars; fall back to the
    # uniform grammar over the base primitives instead.
    if result.grammars:
        grammar = result.grammars[-1]
    else:
        grammar = Grammar.uniform(basePrimitives())

    args = commandlineArguments(
        enumerationTimeout=720,
        structurePenalty=1.5,
        recognitionSteps=5000,
        biasOptimal=False,
        contextual=False,
        a=3,
        topK=5,
        iterations=1,
        useRecognitionModel=True,
        helmholtzRatio=0.5,
        featureExtractor=MinigridMazeFeatureExtractor,
        maximumFrontier=10,
        CPUs=numberOfCPUs(),
        pseudoCounts=30.0,
        extras=None)
    # NOTE(review): evaluationTimeout is fed from the 'enumerationTimeout'
    # entry, not from an 'evaluationTimeout' entry -- confirm this is intended.
    times = evaluateOnTestingTasks(result, testingTasks, grammar,
                           CPUs=args.get('CPUs'), maximumFrontier=args.get('maximumFrontier'),
                           solver=args.get('solver'),
                           enumerationTimeout=args.get('enumerationTimeout'), evaluationTimeout=args.get('enumerationTimeout'))

    return times

def evaluate_T5(testingTasks, path, no_spaces=True, compress=False):
    """Count how many of ``testingTasks`` the T5 checkpoint under ``path`` solves.

    The model and tokenizer are loaded from the directory returned by
    ``get_latest_checkpoint_path(path)``; the grammar is the ``'grammar'``
    entry of the last value stored in ``<path>/results.pkl``.

    Returns:
        int: Number of tasks for which at least one sampled program was accepted.
    """
    testTasks = createTestDataFromTasks(
        testingTasks, True, no_spaces=no_spaces, compress=compress)

    checkpoint_dir = get_latest_checkpoint_path(path)
    model = T5ForConditionalGeneration.from_pretrained(checkpoint_dir)
    model = model.to('cuda')
    tokenizer = RobertaTokenizer.from_pretrained(checkpoint_dir)
    collator = LookupTableCollator(tokenizer)

    # NOTE(security): dill.load can execute arbitrary code; only load trusted files.
    with open(os.path.join(path, 'results.pkl'), 'rb') as handle:
        result = dill.load(handle)
    grammars = [entry['grammar'] for entry in result.values()]
    grammar = grammars[-1]

    stats = check_test_tasks(model, tokenizer, collator, testTasks, grammar,
                             n_sampling=100, verbose=False)
    return sum(1 for program, _ in stats if program is not None)
    

def evaluate_model(data_file, path, method, results_path):
    """Evaluate ``method`` on chunk sizes 5 to 30 and log the results to CSV.

    Args:
        data_file: Path handed to ``np.load`` (with ``allow_pickle=True``).
        path: Forwarded unchanged as second argument to ``method``.
        method: Callable ``method(tasks, path)`` returning the number of
            solved tasks.
        results_path: CSV file rewritten after every chunk size.

    Returns:
        pandas.DataFrame: Columns ``solved`` and ``tasks``, indexed by chunk size.
    """
    parsed_data = parseData(np.load(data_file, allow_pickle=True))
    rows = []
    chunk_sizes = []
    for chunk_size in range(5, 31):
        tasks = makeTasks(parsed_data, chunk_size)
        rows.append({'solved': method(tasks, path), 'tasks': len(tasks)})
        chunk_sizes.append(chunk_size)
        # Write after each chunk size so partial results survive an abort.
        df = pd.DataFrame(rows, index=chunk_sizes)
        df.to_csv(results_path)
    return df

In [3]:
# enum search
# NOTE(review): this checkpoint path is overwritten by the list assigned to
# `path` right below, so it is not used in this cell.
path = '../../../experimentOutputs/perfect-maze/2023-03-01T21:33:58.380983/maze_aic=1.0_arity=3_ET=720_it=39_MF=10_noConsolidation=False_pc=30.0_RS=10000_RW=False_solver=ocaml_STM=True_L=1.5_TRR=default_K=5_topkNotMAP=False_rec=False.pickle'

# Experiment directories to evaluate; the first entry is commented out.
path = [#'/home/ma/e/eberhardinger/workspaces/T5-experimens/flip-data/', # T5 with lib learning
        '/home/ma/e/eberhardinger/workspaces/T5-experimens/no-lib-learning/'] # without lib

# Run the T5 evaluation for every listed directory and write the results to
# an eval.csv inside that directory.
for p in path:
    evaluate_model(data_file, p, evaluate_T5, os.path.join(p, 'eval.csv'))

(1721, 6)
keys: 18
Created 320 tasks with 5 chunk size


Rate 189/320: 100%|██████████| 320/320 [1:43:08<00:00, 19.34s/it]


keys: 18
Created 274 tasks with 6 chunk size


Rate 130/274: 100%|██████████| 274/274 [1:51:22<00:00, 24.39s/it]


keys: 18
Created 237 tasks with 7 chunk size


Rate 94/237: 100%|██████████| 237/237 [1:38:48<00:00, 25.02s/it]  


keys: 18
Created 206 tasks with 8 chunk size


Rate 65/206: 100%|██████████| 206/206 [1:53:28<00:00, 33.05s/it]


keys: 18
Created 181 tasks with 9 chunk size


Rate 59/181: 100%|██████████| 181/181 [55:56<00:00, 18.55s/it]


keys: 18
Created 163 tasks with 10 chunk size


Rate 15/58:  36%|███▌      | 58/163 [30:57<58:55, 33.67s/it]  $0 Not in candidates
Candidates is {empty-obj: (0.0, tobj, Context(next = 1, {t0 ||> taction})), wall-obj: (0.0, tobj, Context(next = 1, {t0 ||> taction})), goal-obj: (0.0, tobj, Context(next = 1, {t0 ||> taction})), if: (0.0, bool -> tobj -> tobj -> tobj, Context(next = 2, {t1 ||> tobj, t0 ||> taction})), get: (0.0, array(array(tobj)) -> int -> int -> tobj, Context(next = 1, {t0 ||> taction}))}
request is tobj
xs []
environment [tdirection, array(array(tobj))]
Rate 40/163: 100%|██████████| 163/163 [1:18:43<00:00, 28.98s/it]


keys: 18
Created 150 tasks with 11 chunk size


Rate 24/110:  73%|███████▎  | 110/150 [43:36<22:23, 33.60s/it]PANIC: not enough arguments for the type
request bool
tp tobj -> tobj -> bool
expression (eq-obj? wall-obj)
xs [wall-obj]
argumentTypes [tobj, tobj]
PANIC: Grammar failure, exporting to  failures/grammarFailure1680155717.2207563.pickle
Rate 32/150: 100%|██████████| 150/150 [1:05:03<00:00, 26.03s/it]


keys: 18
Created 135 tasks with 12 chunk size


Rate 20/135: 100%|██████████| 135/135 [43:59<00:00, 19.55s/it]


keys: 18
Created 125 tasks with 13 chunk size


Rate 20/125: 100%|██████████| 125/125 [1:02:41<00:00, 30.09s/it]


keys: 18
Created 114 tasks with 14 chunk size


Rate 17/114: 100%|██████████| 114/114 [37:12<00:00, 19.58s/it]


keys: 18
Created 106 tasks with 15 chunk size


Rate 12/106: 100%|██████████| 106/106 [57:57<00:00, 32.81s/it]


keys: 18
Created 100 tasks with 16 chunk size


Rate 0/2:   2%|▏         | 2/100 [01:03<51:06, 31.30s/it]direction-2 Not in candidates
Candidates is {empty-obj: (0.0, tobj, Context(next = 3, {t2 ||> bool, t1 ||> taction, t0 ||> taction})), wall-obj: (0.0, tobj, Context(next = 3, {t2 ||> bool, t1 ||> taction, t0 ||> taction})), goal-obj: (0.0, tobj, Context(next = 3, {t2 ||> bool, t1 ||> taction, t0 ||> taction})), if: (0.0, bool -> tobj -> tobj -> tobj, Context(next = 4, {t3 ||> tobj, t2 ||> bool, t1 ||> taction, t0 ||> taction})), get: (0.0, array(array(tobj)) -> int -> int -> tobj, Context(next = 3, {t2 ||> bool, t1 ||> taction, t0 ||> taction}))}
request is tobj
xs []
environment [tdirection, array(array(tobj))]
Rate 10/100: 100%|██████████| 100/100 [31:44<00:00, 19.04s/it]


keys: 18
Created 92 tasks with 17 chunk size


Rate 6/92: 100%|██████████| 92/92 [41:16<00:00, 26.92s/it]


keys: 18
Created 87 tasks with 18 chunk size


Rate 6/87: 100%|██████████| 87/87 [41:17<00:00, 28.47s/it]


keys: 18
Created 80 tasks with 19 chunk size


Rate 7/80: 100%|██████████| 80/80 [32:02<00:00, 24.03s/it]


keys: 18
Created 78 tasks with 20 chunk size


Rate 2/78: 100%|██████████| 78/78 [41:27<00:00, 31.89s/it]


keys: 18
Created 76 tasks with 21 chunk size


Rate 2/76: 100%|██████████| 76/76 [25:50<00:00, 20.41s/it]


keys: 18
Created 70 tasks with 22 chunk size


Rate 1/70: 100%|██████████| 70/70 [38:51<00:00, 33.30s/it]


keys: 18
Created 63 tasks with 23 chunk size


Rate 2/63: 100%|██████████| 63/63 [19:12<00:00, 18.29s/it]


keys: 18
Created 63 tasks with 24 chunk size


Rate 1/63: 100%|██████████| 63/63 [35:02<00:00, 33.37s/it]


keys: 18
Created 61 tasks with 25 chunk size


Rate 1/61: 100%|██████████| 61/61 [20:30<00:00, 20.17s/it]


keys: 18
Created 60 tasks with 26 chunk size


Rate 1/60: 100%|██████████| 60/60 [32:16<00:00, 32.27s/it]


keys: 18
Created 54 tasks with 27 chunk size


Rate 1/54: 100%|██████████| 54/54 [26:40<00:00, 29.63s/it]


keys: 18
Created 52 tasks with 28 chunk size


Rate 0/52: 100%|██████████| 52/52 [10:45<00:00, 12.41s/it]


keys: 18
Created 51 tasks with 29 chunk size


Rate 1/51: 100%|██████████| 51/51 [27:48<00:00, 32.72s/it]


keys: 18
Created 48 tasks with 30 chunk size


Rate 0/48: 100%|██████████| 48/48 [20:26<00:00, 25.55s/it]


In [4]:
# Display the stored results. No index_col is passed, so the CSV's first
# (unnamed) column is read as a regular column.
pd.read_csv('/home/ma/e/eberhardinger/workspaces/T5-experimens/no-lib-learning/eval.csv') 

Unnamed: 0.1,Unnamed: 0,solved,tasks
0,5,272,320
1,6,216,274
2,7,169,237
3,8,129,206
4,9,105,181
5,10,78,163
6,11,69,150
7,12,45,135
8,13,38,125
9,14,27,114


In [1]:
import datetime
import os
import random
import pandas as pd
import numpy as np
import operator
import random
from tqdm import tqdm
import dill
try:
    import binutil  # required to import from dreamcoder modules
except ModuleNotFoundError:
    import bin.binutil  # alt import if called as module

from dreamcoder.task import Task
from dreamcoder.dreamcoder import *
from dreamcoder.domains.minigrid.primitives import basePrimitives, tmap, taction, idx_to_action, tdirection
from dreamcoder.grammar import Grammar
from dreamcoder.utilities import testTrainSplit, eprint, numberOfCPUs
from dreamcoder.type import arrow
from dreamcoder.domains.minigrid.nn_model_maze import *
from dreamcoder.dreamcoder import commandlineArguments
from dreamcoder.utilities import numberOfCPUs
import transformers
from transformers import RobertaTokenizer, T5ForConditionalGeneration, AutoTokenizer, TrainingArguments, Seq2SeqTrainer
from bin.maze_T5 import parseData, all_equal, createTestDataFromTasks, get_latest_checkpoint_path, LookupTableCollator, run_on_input_examples
# NOTE(review): the value returned by Grammar.uniform(...) is discarded here,
# so this line only matters for whatever side effects the two calls have
# (presumably registering the base primitives) -- confirm.
Grammar.uniform(basePrimitives())


def makeTasks(data, chunkSize):
    """Split every group of ``data`` into imitation tasks of ``chunkSize`` examples.

    Args:
        data: Object exposing ``groups`` and ``get_group`` whose groups yield
            rows with ``obs``, ``obs direction`` and ``action`` fields.
        chunkSize: Minimum number of examples collected before a task is
            emitted. With a value <= 0 no task is ever created.

    Returns:
        list: The created ``Task`` objects.
    """
    group_keys = data.groups.keys()
    print('keys:', len(group_keys))
    tasks = []
    for key in group_keys:
        pending = []
        part = 0
        for _, row in data.get_group(key).iterrows():
            observation = row.obs.astype(int).tolist()
            direction = int(row['obs direction'])
            pending.append(((observation, direction), int(row.action)))

            # Not enough examples collected yet for a chunk.
            if chunkSize <= 0 or len(pending) < chunkSize:
                continue

            # Chunks whose actions are all identical are too easy: they are
            # solved by programs such as (lambda (lambda forward-action)).
            # Such a chunk is not emitted; it keeps growing until the actions
            # differ.
            chosen_actions = tuple(action for _, action in pending)
            if all_equal(chosen_actions):
                continue

            tasks.append(Task(f'perfect maze {key} size {chunkSize} part {part}',
                              arrow(tmap, tdirection, taction), pending))
            part += 1
            # Start collecting the next chunk from scratch.
            pending = []

    print(f'Created {len(tasks)} tasks with {chunkSize} chunk size')
    return tasks


def generate_samples_with_temp(model, tokenizer, collator, txt, n_samples, temp):
    """Sample ``n_samples`` decoded generations for one input at temperature ``temp``.

    Args:
        model: Object exposing ``generate``; if it has a ``device`` attribute
            the encoded inputs are moved there.
        tokenizer: Object exposing ``batch_decode``.
        collator: Object exposing ``encode_obs``; its result must support ``.to``.
        txt: The single input that is repeated ``n_samples`` times.
        n_samples: Number of copies of ``txt`` in the batch.
        temp: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The result of ``tokenizer.batch_decode`` with special tokens skipped.
    """
    batch = [txt] * n_samples
    # Put the inputs on the device the model lives on instead of a hard-coded
    # 'cuda', so model and inputs cannot end up on different devices. Models
    # without a ``device`` attribute keep the previous behaviour.
    device = getattr(model, 'device', 'cuda')
    input_ids = collator.encode_obs(batch).to(device)
    outputs = model.generate(input_ids, do_sample=True, max_length=128,
                             temperature=temp)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


def test_programs_on_task(model, tokenizer, collator, task, grammar, n=5, temp=1.0, verbose=False):
    """Sample ``n`` programs for ``task`` and return the best one that is accepted.

    ``task[0]`` is the input handed to the sampler and ``task[1]`` the examples
    handed to ``run_on_input_examples``. A sample counts as accepted when
    ``run_on_input_examples`` returns something other than ``None``; that value
    is used as its score.

    Returns:
        tuple: ``(parsed_program, score)`` for the highest-scoring accepted
        sample, or ``(None, -1)`` when no sample was accepted.
    """
    candidates = generate_samples_with_temp(
        model, tokenizer, collator, task[0], n, temp)
    accepted = []
    for source in candidates:
        if verbose:
            eprint(source)
        log_prior = run_on_input_examples(
            task[1], source, grammar, verbose=verbose)
        if log_prior is None:
            continue
        accepted.append((Program.parse(source), log_prior))

    if not accepted:
        return None, -1

    # Highest score first; ties keep their sampling order.
    ranked = sorted(accepted, key=operator.itemgetter(1), reverse=True)
    best_program, best_log_prior = ranked[0]
    return best_program, best_log_prior


def check_test_tasks(model, tokenizer, collator, testTasks, grammar, n_sampling=100, verbose=False):
    """Run ``test_programs_on_task`` on every test task, showing a running solve rate.

    Returns:
        list: One ``(program, task)`` pair per task, where ``program`` is
        ``None`` for tasks without an accepted sample.
    """
    stats = []
    solved = 0
    pbar = tqdm(testTasks)
    for processed, tt in enumerate(pbar, start=1):
        program, _ = test_programs_on_task(
            model, tokenizer, collator, tt, grammar,
            n=n_sampling, verbose=verbose)
        stats.append((program, tt))
        if program is not None:
            solved += 1
        # Progress-bar prefix: solved tasks over tasks processed so far.
        pbar.set_description(f"Rate {solved}/{processed}")
    return stats


def evaluate_T5(testingTasks, path, no_spaces=True, compress=False):
    """Count how many of ``testingTasks`` the T5 checkpoint under ``path`` solves.

    The model and tokenizer are loaded from the directory returned by
    ``get_latest_checkpoint_path(path)``; the grammar is the ``'grammar'``
    entry of the last value stored in ``<path>/results.pkl``.

    Returns:
        int: Number of tasks for which at least one sampled program was accepted.
    """
    testTasks = createTestDataFromTasks(
        testingTasks, True, no_spaces=no_spaces, compress=compress)

    checkpoint_dir = get_latest_checkpoint_path(path)
    model = T5ForConditionalGeneration.from_pretrained(checkpoint_dir)
    model = model.to('cuda')
    tokenizer = RobertaTokenizer.from_pretrained(checkpoint_dir)
    collator = LookupTableCollator(tokenizer)

    # NOTE(security): dill.load can execute arbitrary code; only load trusted files.
    with open(os.path.join(path, 'results.pkl'), 'rb') as handle:
        result = dill.load(handle)
    grammars = [entry['grammar'] for entry in result.values()]
    grammar = grammars[-1]

    stats = check_test_tasks(model, tokenizer, collator, testTasks, grammar,
                             n_sampling=100, verbose=False)
    return sum(1 for program, _ in stats if program is not None)


def evaluate_model(data_file, path, results_path):
    """Evaluate the T5 model on chunk sizes up to 30, resuming from ``results_path``.

    If ``results_path`` exists, its rows are kept and the evaluation continues
    after the last chunk size found in its index; otherwise it starts at 5.
    The CSV is rewritten after every evaluated chunk size.

    Args:
        data_file: Path handed to ``np.load`` (with ``allow_pickle=True``).
        path: Experiment directory forwarded to ``evaluate_T5``.
        results_path: CSV file used both for resuming and for the results.

    Returns:
        pandas.DataFrame: Columns ``solved`` and ``tasks``, indexed by chunk size.

    Raises:
        KeyError: If an existing CSV lacks a ``solved`` or ``tasks`` column.
    """
    solved_tasks = []
    idx = []
    start_iter = 5
    if os.path.exists(results_path):
        previous = pd.read_csv(results_path, index_col=0)
        idx = [int(i) for i in previous.index]
        # Select the columns by name: unpacking ``to_dict('list').values()``
        # into two names breaks as soon as the file carries an extra column
        # (e.g. a stray "Unnamed: 0") or a different column order.
        solved_tasks = [
            {'solved': s, 'tasks': a}
            for s, a in zip(previous['solved'].tolist(),
                            previous['tasks'].tolist())
        ]
        print(f'loaded from {results_path}')
        print('start from found csv file:', solved_tasks)
        print('index:', idx)
        # A header-only CSV has no last index; keep the default start then.
        if idx:
            start_iter = idx[-1] + 1

    df = pd.DataFrame(solved_tasks, index=idx, columns=['solved', 'tasks'])
    sequence_lengths = range(start_iter, 31)
    if not sequence_lengths:
        # Everything was evaluated already; skip loading the dataset.
        return df

    data = np.load(data_file, allow_pickle=True)
    parsed_data = parseData(data)
    for i in sequence_lengths:
        tasks = makeTasks(parsed_data, i)
        hits = evaluate_T5(tasks, path)
        solved_tasks.append({
            'solved': hits,
            'tasks': len(tasks)
        })
        idx.append(i)
        # Write after each chunk size so an aborted run can be resumed.
        df = pd.DataFrame(solved_tasks, index=idx)
        df.to_csv(results_path)
    return df


# Script entry point: evaluate one experiment directory and write the results
# to an eval.csv inside it.
if __name__ == '__main__':
    data_file = "/home/ma/e/eberhardinger/workspaces/ec/dreamcoder/domains/perfect-maze-minigrid/collected_data/2022-12-10T15:26:33.798573.npy"

    # NOTE(review): the first assignment is overwritten by the second one, so
    # only the 'noLib-newDsl' directory is evaluated.
    path = '/home/ma/e/eberhardinger/workspaces/T5-experimens/new-dsl/'
    path = '/home/ma/e/eberhardinger/workspaces/T5-experimens/noLib-newDsl/'
    
    # NOTE(review): the return value is discarded; presumably called only for
    # its side effects -- confirm.
    Grammar.uniform(basePrimitives())
    df = evaluate_model(data_file, path, os.path.join(path, 'eval.csv'))


(1721, 6)
keys: 18
Created 320 tasks with 5 chunk size


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx