Installing dependencies. You might need to tweak the CMAKE_ARGS for the `llama-cpp-python` pip package.

In [1]:
# GPU llama-cpp-python; GGUF models are supported starting from version llama-cpp-python==0.1.79.
# NOTE(review): CMAKE_ARGS below sets LLAMA_CUBLAS=off, which looks like it disables the cuBLAS (GPU) build — confirm the intended value.
# !CMAKE_ARGS="-DLLAMA_CUBLAS=off " pip install 'llama-cpp-python>=0.1.79' --force-reinstall --upgrade --no-cache-dir
# Needed for downloading the models
# !pip install huggingface_hub
# !pip install datasets

We start by downloading an instruction-finetuned Mistral model.

In [2]:
from huggingface_hub import hf_hub_download

# Repository and file name of the quantized GGUF weights; hf_hub_download
# returns the local path of the downloaded file.
model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
model_basename = "mistral-7b-instruct-v0.2.Q6_K.gguf"
model_path = hf_hub_download(
    repo_id=model_name_or_path,
    filename=model_basename,
)

# These settings have been tested on an RTX 3080 (VRAM of 16GB);
# you might need to adjust them to your own hardware.
from llama_cpp import Llama

_llama_settings = {
    "n_threads": 4,  # CPU cores
    "n_batch": 8000,  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    # "n_gpu_layers": 32,  # Change this value based on your model and your GPU VRAM pool.
    "n_ctx": 8192,  # Context window
    "logits_all": True,  # the generation loop further down requests per-token logprobs
}
lcpp_llm = Llama(model_path=model_path, **_llama_settings)

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /home/mickus/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.2-GGUF/snapshots/3a6fbf4a41a1d52e415a4958cde6856d34b2db93/mistral-7b-instruct-v0.2.Q6_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_mode

In [3]:
import tqdm.notebook as tqdm
import json 
import csv

# Load the question records from the tab-separated file; the first row holds
# the column names.
# newline='' is what the csv module asks for when it is handed a file object,
# so that line breaks inside quoted fields are parsed correctly. The encoding
# is pinned so that reading does not depend on the platform's default locale.
with open('english-with-questions-valid+test.tsv', 'r', newline='', encoding='utf-8') as istr:
    reader = csv.reader(istr, delimiter='\t')
    header = next(reader)
    # One dict per data row, keyed by column name.
    records = [dict(zip(header, row)) for row in reader]


In [None]:
import random

# Sampling grid: every combination of top_k, temperature and top_p listed
# below. Each entry pairs a shorthand tag (used in the output file names)
# with the keyword arguments passed to the model call.
configs = [
    (
        f'k{top_k}_p{top_p:.2f}_t{temperature}',
        dict(top_k=top_k, top_p=top_p, temperature=temperature),
    )
    for top_k in (50, 75)
    for temperature in (0.1, 0.2, 0.3)
    for top_p in (0.90, 0.95)
]

# Visit the configurations in a random order.
random.shuffle(configs)

# For every sampling configuration, query the model once per record and write
# two JSON-lines files: one that includes the per-token top-logprob dicts and
# a lighter one without them.
for shorthand, config in tqdm.tqdm(configs, desc='configs'):
    with open(f'mistral-answers-with-logprobs.{shorthand}.jsonl', 'w') as ostr_logprobs, \
            open(f'mistral-answers.{shorthand}.jsonl', 'w') as ostr:
        for record in tqdm.tqdm(records, desc='items'):
            message = record['question']
            prompt = f"[INST] {message} [/INST]"
            # Drop the alternative question (if present) so it is not written
            # to the outputs; this modifies the dict held in `records` in place.
            record.pop('alt_question', None)
            response = lcpp_llm(
                prompt=prompt,
                logprobs=32_000,  # number of top logprobs requested per generated token
                max_tokens=512,
                **config
            )
            choice = response['choices'][0]

            # Full record, including the top-logprob dict of every token.
            # Flushed after each record, like the lighter file below, so an
            # interrupted run leaves both files ending on a complete line.
            print(
                json.dumps(
                    {
                        **record,
                        'model_output': choice['text'],
                        'tokens': choice['logprobs']['tokens'],
                        'logprobs': [
                            # cast to plain floats so json can serialize the values
                            {k: float(v) for k, v in d.items()}
                            for d in choice['logprobs']['top_logprobs']
                        ],
                        'lang': 'EN',
                    }
                ),
                file=ostr_logprobs,
                flush=True,
            )

            # Same record without the logprobs.
            print(
                json.dumps(
                    {
                        **record,
                        'model_output': choice['text'],
                        'tokens': choice['logprobs']['tokens'],
                        'lang': 'EN',
                    }
                ),
                file=ostr,
                flush=True,
            )

configs:   0%|          | 0/12 [00:00<?, ?it/s]

items:   0%|          | 0/207 [00:00<?, ?it/s]


llama_print_timings:        load time =    3260.82 ms
llama_print_timings:      sample time =      32.46 ms /    85 runs   (    0.38 ms per token,  2618.45 tokens per second)
llama_print_timings: prompt eval time =    3260.59 ms /    19 tokens (  171.61 ms per token,     5.83 tokens per second)
llama_print_timings:        eval time =   24825.87 ms /    84 runs   (  295.55 ms per token,     3.38 tokens per second)
llama_print_timings:       total time =   28180.98 ms /   103 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    3260.82 ms
llama_print_timings:      sample time =      12.53 ms /    46 runs   (    0.27 ms per token,  3672.65 tokens per second)
llama_print_timings: prompt eval time =    2704.75 ms /    18 tokens (  150.26 ms per token,     6.65 tokens per second)
llama_print_timings:        eval time =   12957.59 ms /    45 runs   (  287.95 ms per token,     3.47 tokens per second)
llama_print_timings:       total time =   15705.08 ms /    63 