Installing dependencies. You might need to tweak the CMAKE_ARGS for the `llama-cpp-python` pip package.

In [1]:
# GPU llama-cpp-python; Starting from version llama-cpp-python==0.1.79, it supports GGUF
# !CMAKE_ARGS="-DLLAMA_CUBLAS=on " pip install 'llama-cpp-python>=0.1.79' --force-reinstall --upgrade --no-cache-dir
# For downloading the models and datasets
# !pip install huggingface_hub
# !pip install datasets

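We start by downloading an instruction-finetuned model, SauerkrautLM-7B-v1, as a quantized GGUF file (Q4_K_M). Note that the loader output below reports a LLaMA v2 architecture for this checkpoint rather than Mistral.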

In [9]:
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Fetch the quantized GGUF weights from the Hugging Face Hub and keep the path
# of the downloaded file for the loader.
model_name_or_path = "TheBloke/SauerkrautLM-7B-v1-GGUF"
model_basename = "sauerkrautlm-7b-v1.Q4_K_M.gguf"
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

# This config was originally tested on an RTX 3080 (VRAM of 16GB) with
# n_ctx = n_batch = 8192. Both are lowered to 4096 here because the loader
# metadata for this model reports llama.context_length = 4096; asking for a
# larger window than the model was trained on brings no benefit and, with
# logits_all=True, should only increase memory use.
# You might need to tweak these values with respect to your hardware.
lcpp_llm = Llama(
    model_path=model_path,
    n_threads=4,  # CPU cores
    n_batch=4096,  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    # Change this value based on your model and your GPU VRAM pool.
    # NOTE(review): 400 is far above the 32 blocks the loader reports, so this
    # presumably offloads every layer -- confirm for your llama-cpp-python version.
    n_gpu_layers=400,
    n_ctx=4096,  # Context window, matched to the model's context length
    # Keep logits for every position; the generation cell requests `logprobs`,
    # which llama-cpp-python only serves when the model was created this way.
    logits_all=True,
)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from /users/mickusti/.cache/huggingface/hub/models--TheBloke--SauerkrautLM-7B-v1-GGUF/snapshots/5fdfb9e54142127e22f6873c68c3c2d2acee29e0/sauerkrautlm-7b-v1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.r

In [7]:
import tqdm.notebook as tqdm
import json 
import csv

# Load the question set from a tab-separated file whose first row is the header.
# The encoding is pinned instead of relying on the platform default, so the
# non-ASCII characters in the questions decode the same way on every machine
# (assumes the file is UTF-8 -- confirm). newline='' is what the csv module
# asks for so that it can handle line endings inside fields itself.
with open('questions-de.tsv', 'r', encoding='utf-8', newline='') as istr:
    reader = csv.reader(istr, delimiter='\t')
    header = next(reader)
    # One dict per data row, keyed by the column names from the header.
    records = [dict(zip(header, row)) for row in reader]


In [None]:
import random

# Sampling grid: every combination of the top_k, temperature and top_p values
# listed below. Each entry pairs a shorthand tag (used in the output file
# names) with the sampling keyword arguments for that setting.
configs = [
    (
        f'k{top_k}_p{top_p:.2f}_t{temperature}',
        dict(top_k=top_k, top_p=top_p, temperature=temperature),
    )
    for top_k in (50, 75)
    for temperature in (0.1, 0.2, 0.3)
    for top_p in (0.90, 0.95)
]

# Randomize the processing order in place. No seed is set, so the order
# differs from run to run.
random.shuffle(configs)

# For every sampling configuration, answer every question once and write the
# results to two JSON-lines files: one with the per-token top logprobs and a
# lighter one without them.
for shorthand, config in tqdm.tqdm(configs, desc='configs'):
    with open(f'mistral-answers-with-logprobs.{shorthand}.jsonl', 'w') as ostr_logprobs, \
            open(f'mistral-answers.{shorthand}.jsonl', 'w') as ostr:
        for record in tqdm.tqdm(records, desc='items'):
            message = record['question']
            prompt = f"[INST] {message} [/INST]"
            # Drop the alternative phrasing so it is not copied into the
            # outputs. This edits the shared `records` entry in place, exactly
            # as the original `if ...: del ...` did.
            record.pop('alt_question', None)
            response = lcpp_llm(
                prompt=prompt,
                # NOTE(review): 32_000 presumably covers the whole vocabulary,
                # i.e. a full distribution per token -- confirm against the model.
                logprobs=32_000,
                max_tokens=512,
                **config
            )
            choice = response['choices'][0]
            # Fields shared by both output files.
            answer = {
                **record,
                'model_output': choice['text'],
                'tokens': choice['logprobs']['tokens'],
            }
            # Cast every logprob to a plain float so json.dumps can serialize it.
            top_logprobs = [
                {k: float(v) for k, v in d.items()}
                for d in choice['logprobs']['top_logprobs']
            ]
            print(
                json.dumps({**answer, 'logprobs': top_logprobs, 'lang': 'DE'}),
                file=ostr_logprobs,
                # Flush here as well so an interrupted run leaves both files
                # at the same item.
                flush=True,
            )
            # Both files describe the same item, so they carry the same
            # language tag (this one used to say 'EN' while the logprobs file
            # said 'DE').
            print(
                json.dumps({**answer, 'lang': 'DE'}),
                file=ostr,
                flush=True,
            )

configs:   0%|          | 0/12 [00:00<?, ?it/s]

items:   0%|          | 0/200 [00:00<?, ?it/s]


llama_print_timings:        load time =     199.07 ms
llama_print_timings:      sample time =      11.93 ms /    28 runs   (    0.43 ms per token,  2347.42 tokens per second)
llama_print_timings: prompt eval time =     198.98 ms /    25 tokens (    7.96 ms per token,   125.64 tokens per second)
llama_print_timings:        eval time =     265.59 ms /    27 runs   (    9.84 ms per token,   101.66 tokens per second)
llama_print_timings:       total time =     696.12 ms /    52 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =     199.07 ms
llama_print_timings:      sample time =       0.44 ms /     1 runs   (    0.44 ms per token,  2272.73 tokens per second)
llama_print_timings: prompt eval time =      35.23 ms /    25 tokens (    1.41 ms per token,   709.64 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =     160.15 ms /    26 