In [1]:
import wandb
from tqdm.auto import tqdm
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from human_eval.data import write_jsonl, read_problems

In [2]:
# Expose the parent directory on the module search path so the local `utils`
# package (imported in the next cell) can be resolved.
import sys

# Guard the append so re-running this cell in the same kernel does not keep
# stacking duplicate '..' entries onto sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [3]:
from utils.completions import clean_completion, inference

In [4]:
# W&B public API client; used in the next cell to look up the model artifact.
api = wandb.Api()

In [5]:
# Look up the pinned model artifact (version v0) and download its files.
# `artifact_dir` is what `download()` returns; it is passed to `from_pretrained`
# below as the location of the model and tokenizer files.
artifact = api.artifact("marioparreno/cody/model_justtest:v0")
artifact_dir = artifact.download()
print("Model checkpoint downloaded!")

[34m[1mwandb[0m: Downloading large artifact model_justtest:v0, 68.74MB. 9 files... 
[34m[1mwandb[0m:   9 of 9 files downloaded.  
Done. 0:0:0.3


Model checkpoint downloaded!


In [6]:
# Load the causal language model from the downloaded artifact directory.
model = AutoModelForCausalLM.from_pretrained(artifact_dir)
print("Model loaded successfully!")

Model loaded successfully!


In [7]:
# Load the tokenizer from the same artifact directory as the model.
tokenizer = AutoTokenizer.from_pretrained(artifact_dir)
print("Tokenizer loaded successfully!")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenizer loaded successfully!


# Human Eval

In [8]:
# Load the HumanEval problems. Below they are iterated by task id, and each
# entry's 'prompt' field is used as the generation input.
problems = read_problems()
print(f"[HumanEval] Loaded {len(problems)} problems")

[HumanEval] Loaded 164 problems


In [9]:
# Sampling configuration for the HumanEval run.
# We can generate more than one candidate per task; later on, pass@1, pass@10, ...
# will be used to evaluate the model.
# NOTE(review): pass@k estimation needs at least k samples per task, so reporting
# pass@10 would require num_samples_per_task >= 10 — confirm the intended metrics.
num_samples_per_task = 5  # candidates generated per problem
temperature = 0.8  # forwarded to inference(...)
max_output_tokens=256  # forwarded to inference(...)

In [10]:
# Generate `num_samples_per_task` candidate completions for every HumanEval
# problem and collect them as {'task_id', 'completion'} records, which are the
# rows later handed to `write_jsonl`.
results = []
for task_id in tqdm(problems):
    # The prompt is the same for every sample of a task, so look it up once.
    prompt_text = problems[task_id]['prompt']

    # A single batched call: the prompt is repeated `num_samples_per_task`
    # times, and the loop below consumes that many entries from the response.
    # Wrapping this call in another `range(num_samples_per_task)` loop would
    # emit num_samples_per_task**2 records per task instead of
    # num_samples_per_task.
    # NOTE(review): assumes `inference` returns one completion per input
    # prompt, in order — confirm against utils.completions.
    response = inference(
        [prompt_text] * num_samples_per_task,
        model,
        tokenizer,
        max_output_tokens=max_output_tokens,
        temperature=temperature,
    )

    for i in range(num_samples_per_task):
        # NOTE(review): presumably trims the raw generation using the EOS
        # token and the original prompt — confirm against utils.completions.
        clean_response = clean_completion(response[i], tokenizer.eos_token, prompt_text)
        results.append({
            'task_id': task_id,
            'completion': clean_response,
        })

  0%|          | 0/164 [00:00<?, ?it/s]

In [None]:

# Write all generated samples to a .jsonl file whose name embeds the number of
# samples per task.
# NOTE(review): an earlier comment said the results go under
# f"checkpoints/{run.name}", but the path below is relative to the current
# working directory and no `run` object is defined in the visible cells —
# confirm the intended output location.
write_jsonl(f"human_eval-{num_samples_per_task}_results.jsonl", results)