In [1]:
import wandb
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from human_eval.data import write_jsonl, read_problems

In [2]:
# Import utils from ../src/utils
import sys
sys.path.append('..')

In [3]:
from utils.completions import clean_completion, inference

------

In [4]:
MODEL_ID = "db834bca-c0cf-430d-a2ad-b56d7bef2f2d"

In [5]:
api = wandb.Api()

In [6]:
artifact = api.artifact(f"marioparreno/cody/model_{MODEL_ID}:v0")
artifact_dir = artifact.download()
print("Model checkpoint downloaded!")

[34m[1mwandb[0m: Downloading large artifact model_db834bca-c0cf-430d-a2ad-b56d7bef2f2d:v0, 70.26MB. 8 files... 
[34m[1mwandb[0m:   8 of 8 files downloaded.  
Done. 0:0:0.5


Model checkpoint downloaded!


In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [7]:
model = AutoModelForCausalLM.from_pretrained(
    artifact_dir,
    quantization_config=bnb_config,
    device_map={"":0},
    trust_remote_code=True
)
print("Model loaded successfully!")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully!


In [8]:
tokenizer = AutoTokenizer.from_pretrained(artifact_dir)
print("Tokenizer loaded successfully!")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenizer loaded successfully!


In [9]:
# Move all to GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"Model moved to {device}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU 0 has a total capacity of 7.78 GiB of which 53.25 MiB is free. Including non-PyTorch memory, this process has 7.03 GiB memory in use. Of the allocated memory 6.87 GiB is allocated by PyTorch, and 5.02 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Human Eval

In [None]:
problems = read_problems()
print(f"[HumanEval] Loaded {len(problems)} problems")

In [None]:
# we can generate more than one candidate per task
# later one pass@1, pass@10... will be used to evaluate the model
num_samples_per_task = 5
temperature = 0.75
max_output_tokens=256

In [None]:
results= []
for task_id in tqdm(problems):
    for _ in range(num_samples_per_task):
        prompt_text = problems[task_id]['prompt']
        response = inference(
            [prompt_text]*num_samples_per_task,
            model,
            tokenizer,
            max_output_tokens=max_output_tokens,
            temperature=temperature,
        )

        for i in range(num_samples_per_task):
            clean_response = clean_completion(response[i], tokenizer.eos_token, prompt_text)
            results.append({
                'task_id': task_id,
                'completion': clean_response,
            })

In [None]:

# write the results under (f"checkpoints/{run.name}")
write_jsonl(f"human_eval-{num_samples_per_task}_{MODEL_ID}_results.jsonl", results)