This notebook is intended to run on Google Colab.

### Setup

In [1]:
# Mount Google Drive and define the base paths used by the rest of the notebook.
# (do this in the first cell because colab asks for an interactive confirmation when mounting)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# project root on the mounted drive
dir_path = "/content/drive/MyDrive/Data_Science_Studies/thesis_colab_notebooks"
# folder the input CSV files are read from
data_path = f"{dir_path}/data"
# folder the progress/result CSV files are written to
output_path = f"{dir_path}/outputs"

Mounted at /content/drive


In [2]:
# prepare environment
# - datasets: provides the Dataset class used to hold the examples and map prompts onto them
# - llama-cpp-python: pinned to 0.2.72 and rebuilt from source with -DLLAMA_CUBLAS=on passed to CMake
#   (presumably to enable CUDA offloading of the model - TODO confirm the flag is still valid when bumping the version)
# - outlines: pinned to 0.0.36 for guided (schema-constrained) generation, see the inline note below
# NOTE(review): `datasets` is not version-pinned, and --verbose produces a very long build log in the cell output
!pip install datasets -q
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.2.72 --force-reinstall --upgrade --no-cache-dir --verbose
!pip install outlines==0.0.36 # use older version to avoid "cannot convert token to bytes" error when creating generator (see https://github.com/outlines-dev/outlines/issues/820)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hUsing pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Collecting llama-cpp-python==0.2.72
  Downloading llama_cpp_python-0.2.72.tar.gz (49.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.5/49.5 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Running command pip subprocess to install build dependencies
  Using pip 23.1.2 from

In [3]:
import torch
import outlines

import pandas as pd
from datasets import Dataset
import json
import jsonschema
import time
import os

# import stuff from custom LLM_utils module
import sys
sys.path.append('/content/drive/MyDrive/Data_Science_Studies/thesis_colab_notebooks/modules/')
from LLM_utils import output_json_schema_string, format_prompt, extract_json_from_output # needs to be uploaded to colab OR imported from mounted drive OR downloaded from github
output_json_schema = json.loads(output_json_schema_string) # will need this for validation when not using guided generation

### Model Loading

In [4]:
# loading model
from llama_cpp import Llama

############################################################################################################
# TheBloke repo: mistral quants
#repo_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
#filename = "mistral-7b-instruct-v0.2.Q8_0.gguf"


# my repo (finetunes)
repo_id = "JanJacobsen/llama3_8b_instruct_ft_v4_q8_0"
filename = "llama3_8b_instruct_ft_v4_q8_0-unsloth.Q8_0.gguf"

# alternative: my repo (llama3 base f16 and quants)
#repo_id = "JanJacobsen/llama3_8b_instruct_q8_0"
#filename = "llama3_8b_instruct_q8_0-unsloth.Q8_0.gguf"


############################################################################################################

# keyword arguments that from_pretrained receives in addition to the repo/file selection
llama_init_kwargs = dict(
    device="cuda",  # NOTE(review): not obviously a documented Llama parameter - confirm it has any effect
    n_gpu_layers=-1,  # offload entire model to gpu
    n_ctx=4*1024,  # context window (in tokens) requested for this run
)

# download the selected GGUF file from the hub (if needed) and load it
llm = Llama.from_pretrained(repo_id=repo_id, filename=filename, **llama_init_kwargs)
# silence llama_cpp logging from here on (the loader output above has already been printed at this point)
llm.verbose = False

# wrap the llama_cpp model so that outlines can use it for guided generation
model = outlines.models.LlamaCpp(llm)


(…)8b_instruct_ft_v4_q8_0-unsloth.Q8_0.gguf:   0%|          | 0.00/8.54G [00:00<?, ?B/s]

llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from /root/.cache/huggingface/hub/models--JanJacobsen--llama3_8b_instruct_ft_v4_q8_0/snapshots/4b72fbb22eba29d05d31d002086e9f5696e90c3d/./llama3_8b_instruct_ft_v4_q8_0-unsloth.Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = llama3_8b_instruct_ft_v4_q8_0
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama

### Testing generation before main inference loop

In [None]:
if False:
    # disabled smoke test: load the validation chunks and attach llama3-style prompts
    data_file_path = f"{data_path}/VAL_transcript_chunks_nvids45968_chunksize2048_overlap50_tokMistral_with_metadata_for_prompt.csv"
    test_frame = pd.read_csv(data_file_path, sep=";")

    # prompt formatting options passed through to format_prompt
    test_prompt_kwargs = {
        "prompt_format": "llama3",
        "include_answer_tease": True,
        "include_label": False,
        "include_bos": True,
        "include_eos": False,
    }

    data = Dataset.from_pandas(test_frame)
    data = data.map(format_prompt, fn_kwargs=test_prompt_kwargs)



Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [None]:
if False:
    # disabled smoke test: print one formatted prompt to inspect the template
    example_row = data[20]
    prompt = example_row['prompt']
    print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a smart and efficient assistant specialized at extracting relevant information from text and replying in json format. You always follow the user's instructions carefully.<|eot_id|><|start_header_id|>user<|end_header_id|>

The triple-quoted text below is part of a youtube video transcript by channel @EverythingMoney with the title 'Verizon Stock Analysis | Top Stocks to Buy Now? | VZ Stock Price'. The top tags for the video are: 'everything money, investing, investing in your 20s'. Read the transcript carefully in order to perform the asset name extraction task specified below the transcript.

"""to 2020 we're spending that additional 18 billion annual catback spending okay so let's look at this look at verizon wireless capital expenditures for 2021. so again guys understand the 63 billion dollar number because it's a big jump but we read reports that for the year they're supposed to do 17 to 18 billion in capital expe

In [None]:
if False: # test 10 generations (model directly)
    prompt = data[22]['prompt']
    temp = 0.01
    # same sampling settings for every repetition, so differences come from sampling only
    sampling_kwargs = dict(temperature=temp, top_k=50, max_tokens=1000)
    for i in range(10):
        result = llm.create_completion(prompt, **sampling_kwargs)
        completion_text = result['choices'][0]['text']
        print(completion_text)

[{"asset_name": "Palantir", "asset_type": "stock", "sentiment": "buy"}, {"asset_name": "Ethereum", "asset_type": "crypto", "sentiment": "buy"}]
[{"asset_name": "Palantir", "asset_type": "stock", "sentiment": "buy"}, {"asset_name": "Ethereum", "asset_type": "crypto", "sentiment": "buy"}]
[{"asset_name": "Palantir", "asset_type": "stock", "sentiment": "buy"}, {"asset_name": "Ethereum", "asset_type": "crypto", "sentiment": "buy"}]
[{"asset_name": "Palantir", "asset_type": "stock", "sentiment": "buy"}, {"asset_name": "Ethereum", "asset_type": "crypto", "sentiment": "buy"}]
[{"asset_name": "Palantir", "asset_type": "stock", "sentiment": "buy"}, {"asset_name": "Ethereum", "asset_type": "crypto", "sentiment": "buy"}]
[{"asset_name": "Palantir", "asset_type": "stock", "sentiment": "buy"}]
[{"asset_name": "Palantir", "asset_type": "stock", "sentiment": "buy"}, {"asset_name": "Ethereum", "asset_type": "crypto", "sentiment": "buy"}]
[{"asset_name": "Palantir", "asset_type": "stock", "sentiment": 

In [None]:
if False: # test generator creation
    # build the outlines generator constrained by our custom json schema
    test_whitespace_pattern = r"[ \n\t]?"
    generator = outlines.generate.json(
        model,
        output_json_schema_string,
        whitespace_pattern=test_whitespace_pattern,
    )

In [None]:
if False:
    # run the guided generator on the example prompt and show the result as a json string
    raw_result = generator(prompt, temperature=0.01, top_k=50, max_tokens=1000)
    result = json.dumps(raw_result)
    print(result)

In [None]:
if False: # test generator in loop (do we have to recreate each time?)
    prompt = data[22]['prompt']
    test_recreation = False
    for i in range(10):
        if test_recreation:
            # rebuild the generator before every call to compare against reusing it
            generator = outlines.generate.json(
                model,
                output_json_schema_string,
                whitespace_pattern=r"[ \n\t]?",
            )
        raw_result = generator(prompt, temperature=1.5, top_k=50, max_tokens=1000)
        result = json.dumps(raw_result)
        print(result)

### Prepare Data & Inference

In [5]:
##########################################################################################################################################################
# ------------------- run name (adjust for every new run!) ------------------ #
# The five components below are joined into `run_name`, which also names the progress file.
run_type = "val" # "val" (= test) or "inf"; upper-cased to select the input CSV
run_model = "llama3_ft_v4" # mistral/llama3/... + _ft_version if finetuned
run_model_quant = "q8_0"
run_framework = "llamacpp" # llamacpp/vllm/transformers/...
additional_suffix = "_unguided"
# --------------------------------------------------------------------------- #
run_name = f"{run_type}_{run_model}_{run_model_quant}_{run_framework}{additional_suffix}"
# input examples (one row per transcript chunk), read from data_path
data_file_path = f"{data_path}/{run_type.upper()}_transcript_chunks_nvids45968_chunksize2048_overlap50_tokMistral_with_metadata_for_prompt.csv"
progress_file_path = f"{output_path}/{run_name}.csv" # file does not need to exist yet (it is created below if missing)

# prompt formatting arguments (passed as fn_kwargs to our custom format_prompt function)
prompt_format_kwargs={
    "prompt_format": "llama3",
    "include_bos": True, # does llamacpp add it or not?
    "include_answer_tease": True,
    "include_label": False, # never for inference, only used for finetuning
    "include_eos": False}

# model parameters (might be model-specific)
temperature = 0.01 # very low / zero for our task
top_k = 50 # relatively high value to avoid problems with guided generation (i.e. what if token required by schema is not in top k?)
# NOTE(review): `seed` is defined here but is not passed to any of the model calls in the inference loop
seed = 42 # relevant for sampling (CAUTION: can't be used with llamacpp & outlines==0.0.36)
max_tokens = 1250 # might have to be set for llamacpp to override default? unsure...

# how are we calling the model? with guided generation (outlines) active, or directly via llama_cpp?
# (when False, the raw model output is parsed and validated against the json schema instead)
guided_generation = False

# outlines-specific
apply_reset_generator_fix = False # in some configurations with outlines & llamacpp the generator from outlines.generate.json() needs to be recreated before each new call
generator_whitespace_pattern = r"[ \n\t]?" # should we allow whitespaces/newlines/etc. being generated within the json structure? probably yes, but limit to zero or one at a time
def get_generator(whitespace_pattern):
    """Create the outlines JSON generator for the loaded model.

    Exists so that every call site builds the generator with the same
    arguments (model, schema string, whitespace pattern).

    Args:
        whitespace_pattern: regex passed on as `whitespace_pattern`.

    Returns:
        Whatever `outlines.generate.json` returns for the given arguments.
    """
    json_generator = outlines.generate.json(
        model,
        output_json_schema_string,
        whitespace_pattern=whitespace_pattern,
    )
    return json_generator

# other run parameters
skip_previous_errors = True # skip examples with True in 'error?' column of the progress file?
replace_previous_errors = True # only relevant if skip_previous_errors is False: should previous errors be replaced with new outputs (if available) in progress file?
save_interval = 50 # update progress file every n examples (note: should not be set too low because saving to and loading from drive might not actually update files instantly? not sure...)
print_progress_interval = 10 # print progress every n examples
print_errors = True # useful to catch issues, but disable for final inference runs with confirmed problem-free models/code (to avoid crashes due to huge cell output)
##########################################################################################################################################################

# check if files exist
import os
if not os.path.exists(data_file_path):
    raise ValueError(f"Data file {data_file_path} does not exist!")
if not os.path.exists(progress_file_path):
    print(f"Progress file {progress_file_path} does not exist, creating new one.")
    # header-only file, so that the read below always finds the expected columns
    progress_columns = ["video_id", "chunk_number", "output", "error?"]
    pd.DataFrame(columns=progress_columns).to_csv(progress_file_path, sep=";", index=False)

# full dataset
data = pd.read_csv(data_file_path, sep=";")
print(f"Loaded dataset with {len(data)} samples from {data_file_path}")

# everything that has been processed in earlier sessions of this run
progress = pd.read_csv(progress_file_path, sep=";")
print(f"Loaded progress file with {len(progress)} processed examples from {progress_file_path}")

# left join on the example key -> brings in the 'output' and 'error?' columns from the progress df
data = data.merge(progress, on=["video_id", "chunk_number"], how="left")

# an example is still pending if it has no output yet ...
still_pending = data['output'].isnull()
if skip_previous_errors:
    # ... and (optionally) did not fail before; None and False both stay in, only True is filtered out
    still_pending = still_pending & (data['error?'] != True)
data = data[still_pending]
print(f"Examples yet to process in dataset: {len(data)}")

# convert to hf dataset and attach the formatted prompts
data = Dataset.from_pandas(data).map(format_prompt, fn_kwargs=prompt_format_kwargs)

# the outlines generator is only needed for guided generation (creation is timed)
if guided_generation:
    start_time = time.time()
    generator = get_generator(generator_whitespace_pattern)
    print(f"Created outlines generator in {round(time.time() - start_time, 2)} seconds.")

print(f"Preparation complete for run: {run_name}")

Progress file /content/drive/MyDrive/Data_Science_Studies/thesis_colab_notebooks/outputs/val_llama3_ft_v4_q8_0_llamacpp_unguided.csv does not exist, creating new one.
Loaded dataset with 150 samples from /content/drive/MyDrive/Data_Science_Studies/thesis_colab_notebooks/data/VAL_transcript_chunks_nvids45968_chunksize2048_overlap50_tokMistral_with_metadata_for_prompt.csv
Loaded progress file with 0 processed examples from /content/drive/MyDrive/Data_Science_Studies/thesis_colab_notebooks/outputs/val_llama3_ft_v4_q8_0_llamacpp_unguided.csv
Examples yet to process in dataset: 150


Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Preparation complete for run: val_llama3_ft_v4_q8_0_llamacpp_unguided


### Inference Loop

In [6]:
# <- STARTS ITERATION

# helper function to print elapsed time
def get_elapsed_time_str(start_time):
    """Format the time elapsed since `start_time` as 'H h, M min, S sec'.

    Args:
        start_time: reference timestamp as returned by `time.time()`.

    Returns:
        String with whole hours, whole minutes and seconds rounded to two decimals.
    """
    elapsed = time.time() - start_time
    whole_hours = elapsed // 3600
    leftover = elapsed % 3600
    whole_minutes = leftover // 60
    secs = leftover % 60
    return f"{int(whole_hours)} h, {int(whole_minutes)} min, {round(secs, 2)} sec"

# helper function for saving progress
def update_progress_file(new_results, progress_file_path, replace_previous_errors):
    """Append a batch of results to the progress CSV (';'-separated) on disk.

    Args:
        new_results: list of (video_id, chunk_number, output, error?) tuples
            collected since the last save.
        progress_file_path: path of the progress CSV; created if it does not exist.
        replace_previous_errors: if True, only the most recent row per
            (video_id, chunk_number) is kept, so a new result replaces an older
            error row. If False, both rows stay in the file.

    An empty batch leaves an existing file untouched, instead of reading and
    rewriting the whole file just to store nothing.
    """
    progress_columns = ["video_id", "chunk_number", "output", "error?"]
    file_exists = os.path.exists(progress_file_path)

    # 0. nothing to add -> skip the read/rewrite round trip entirely
    #    (if the file is missing we still fall through, so that a header-only file gets created)
    if not new_results and file_exists:
        print(f"  - No new results to save, {progress_file_path} left unchanged")
        return

    # 1. convert to pandas df
    new_results_df = pd.DataFrame(new_results, columns=progress_columns)

    # 2. load already processed data and append new results (if it exists)
    if file_exists:
        progress_df = pd.read_csv(progress_file_path, sep=";")
        progress_df = pd.concat([progress_df, new_results_df], ignore_index=True)
        # drop duplicate error rows (duplicates here should only happen for previous error rows which we included and got errors again)
        progress_df = progress_df.drop_duplicates(subset=["video_id", "chunk_number", "error?"], keep="last")
    else:
        progress_df = new_results_df

    # 3. optionally, if there are new results for examples which previously caused errors, replace the error rows
    #    (otherwise: keep both the error row and the new result row in the file)
    if replace_previous_errors:
        # note: pd.concat above appends new results at the end of the df, so keep="last" keeps the new result row (without sorting first)
        progress_df = progress_df.drop_duplicates(subset=["video_id", "chunk_number"], keep="last")

    # 4. save
    progress_df.to_csv(progress_file_path, sep=";", index=False)
    print(f"  - Saved {len(new_results)} new results to {progress_file_path}")

print(f"*** STARTING INFERENCE FOR {len(data)} EXAMPLES (run name: {run_name}) ***\n{'-'*60}")

# inference loop
# Each example yields exactly one (video_id, chunk_number, output, error?) tuple. Tuples are buffered in
# `new_results` and written to the progress file every `save_interval` examples.
# The loop runs inside try/finally so that the buffered results are also written when the cell is
# interrupted (e.g. manual stop) or when something outside the per-example error handling fails.
new_results = []
start_time = time.time()
try:
    for i, example in enumerate(data):

        video_id = example['video_id']
        uploader_id = example['uploader_id'] # not used below; kept so the variable stays available as before
        chunk_number = example['chunk_number']

        # model call
        try:
            if guided_generation:
                if apply_reset_generator_fix:
                    generator = get_generator(generator_whitespace_pattern)
                # outlines model call -> dump json to string
                result = generator(example['prompt'], temperature=temperature, top_k=top_k, max_tokens=max_tokens)
                result = json.dumps(result)

            else:
                # llamacpp model call -> try to extract and validate the json (the model output might contain additional text or invalid json)
                result = llm(example['prompt'], temperature=temperature, top_k=top_k, max_tokens=max_tokens)['choices'][0]['text']
                result = extract_json_from_output(result) # returns string or raises error
                # validate
                result = json.loads(result)
                jsonschema.validate(instance=result, schema=output_json_schema) # raises error if instance doesn't match schema
                result = json.dumps(result) # we convert back to string to ensure clean structure (no newlines etc.)
            # no error: append result
            new_results.append((video_id, chunk_number, result, False))

        except Exception as e:
            # deliberate best-effort: any failure for one example is recorded as an error row and the run continues

            if print_errors:
                print(f"Error at iteration {i} (video_id: {video_id}, chunk_number: {chunk_number}) - {e} ")

            new_results.append((video_id, chunk_number, None, True))

        # save progress
        if (i+1) % save_interval == 0:
            update_progress_file(new_results, progress_file_path, replace_previous_errors)
            new_results = []

        # print progress
        if (i+1) % print_progress_interval == 0:
            print(f"Processed {i+1} total examples in {get_elapsed_time_str(start_time)}")

finally:
    # save remaining examples (also reached on interruption, so at most the example in flight is lost)
    update_progress_file(new_results, progress_file_path, replace_previous_errors)

print(f"{'-'*60}\nFinished processing {len(data)} examples in {get_elapsed_time_str(start_time)}")



*** STARTING INFERENCE FOR 150 EXAMPLES (run name: val_llama3_ft_v4_q8_0_llamacpp_unguided) ***
------------------------------------------------------------
Processed 10 total examples in 0 h, 0 min, 25.2 sec
Processed 20 total examples in 0 h, 0 min, 48.63 sec
Processed 30 total examples in 0 h, 1 min, 26.28 sec
Processed 40 total examples in 0 h, 2 min, 9.84 sec
  - Saved 50 new results to /content/drive/MyDrive/Data_Science_Studies/thesis_colab_notebooks/outputs/val_llama3_ft_v4_q8_0_llamacpp_unguided.csv
Processed 50 total examples in 0 h, 2 min, 33.9 sec
Processed 60 total examples in 0 h, 2 min, 54.39 sec
Processed 70 total examples in 0 h, 3 min, 34.84 sec
Processed 80 total examples in 0 h, 4 min, 8.51 sec
Processed 90 total examples in 0 h, 4 min, 47.67 sec
  - Saved 50 new results to /content/drive/MyDrive/Data_Science_Studies/thesis_colab_notebooks/outputs/val_llama3_ft_v4_q8_0_llamacpp_unguided.csv
Processed 100 total examples in 0 h, 5 min, 9.44 sec
Processed 110 total exa