*  Using microsoft guidance library to force desired output format
*  Using TheBloke C++ quantized LLAMA2 version to reduce RAM & vRAM requirements



In [1]:
import sys
import os
import re
import pandas as pd
import requests
from tqdm.notebook import tqdm
import numpy as np
from scipy.stats import spearmanr


def install_llama_cpp():

    print("Installing llama-cpp-python...")
    try:
        !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
    except:
        try:
            !pip install llama-cpp-python
        except:
            raise Exception("Could not install llama-cpp-python. Do you have C++ build tools installed?")

    print("Installing guidance...")
    try:
        !pip install guidance
    except:
        raise Exception("Could not install guidance")

    print("Installing completed")
    return None

install_llama_cpp()
import guidance
from guidance import models, gen, system, user, assistant

def download_file_with_progress(url, save_path, filename):
    """
    Download a file with progress indicator from a given URL

    :param url: URL to the file
    :param save_path: Path where the file will be saved
    :param filename: Filename to save the downloaded content
    """
    # Check if file already exists
    full_path = os.path.join(save_path, filename)
    if os.path.exists(full_path):
        print(f"The file {filename} already exists in {save_path}. Download skipped.")
        return

    response = requests.get(url, stream=True)
    total_size_in_bytes = int(response.headers.get('content-length', 0))
    block_size = 1024 * 1024 * 100 # 100 megabytes chunks
    progress_bar_size = 50

    print(f"Starting download of {filename}")
    print(f"Total download size: {total_size_in_bytes / (1024 * 1024):.2f} MB")

    with open(full_path, 'wb') as file:
        downloaded_size = 0
        for data in response.iter_content(block_size):
            downloaded_size += len(data)
            file.write(data)
            done = int(progress_bar_size * downloaded_size / total_size_in_bytes)
            print(f"\r[{'█' * done}{'.' * (progress_bar_size - done)}] {downloaded_size * 100 / total_size_in_bytes:.2f}%", end='')
    print("\nDownload completed.")

class LLAMA:
    def __init__(self, model_name) -> None:
        self.model_name = model_name

        self.initialize_model(self.model_name)

        return None

    def initialize_model(self, model_name):
        url_dict = {"llama-2-7b-chat.Q2_K.gguf": "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf?download=true",
                    "llama-2-7b-chat.Q3_K_L.gguf": "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q3_K_L.gguf?download=true",
                    "llama-2-7b-chat.Q6_K.gguf": "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q6_K.gguf?download=true",
                    "llama-2-13b-chat.Q3_K_M.gguf": "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q3_K_M.gguf?download=true",
                    "llama-2-13b-chat.Q5_K_M.gguf": "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q5_K_M.gguf?download=true",
                    "llama-2-13b-chat.Q6_K.gguf": "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q6_K.gguf?download=true",
                    "llama-2-70b-chat.Q5_K_M.gguf": "https://huggingface.co/TheBloke/Llama-2-70B-Chat-GGUF/resolve/main/llama-2-70b-chat.Q5_K_M.gguf?download=true"}

        if model_name not in url_dict.keys():
            raise Exception("Invalid model name. Valid model names are: " + ", ".join(url_dict.keys()))

        gguf_url = url_dict[model_name]
        gguf_filename = model_name

        if 'google.colab' in sys.modules:
            save_path = ""
        else:
            save_path = os.path.join("..", "data", "pretrained_models")

        # Download the .gguf file with progress
        download_file_with_progress(gguf_url, save_path, gguf_filename)

        self.model = models.LlamaCpp(os.path.join(save_path, gguf_filename), n_gpu_layers=-1, n_ctx=4096)

        return None

    def inference(self, sentence_pair, query = None):

        # Limit the output to floats 0-1
        regex_pattern = r"0(\.\d+)?|1(\.0+)?"

        if query is None:
            query = f"""How semantically related are those two sentences on scale from 0 (least related) to 1 (most related): {sentence_pair}"""

        output = self.model + f'''\
                Q: {query}
                A: {gen('relatedness', regex=regex_pattern)}'''

        return output

# DATA
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    PATH = os.path.join("drive", "MyDrive", "LMU", "AppliedDL", "data", "raw")
else:
    PATH = os.path.join("..", "data", "raw")

def get_data(subset):

    df_train = pd.read_csv(os.path.join(PATH, 'eng_train.csv'))
    df_train["Split_Text"] = df_train["Text"].apply(lambda x: x.replace("\n", " "))
    df_train['Split_Text'] = df_train['Split_Text'].apply(lambda x: x.split("\r"))
    df_train['Split_Text'] = df_train['Split_Text'].apply(lambda x: [re.sub(r"[^a-zA-Z0-9]+", ' ', k) for k in x])

    df_train["sen_1"] = df_train["Split_Text"].apply(lambda x: x[0])
    df_train["sen_2"] = df_train["Split_Text"].apply(lambda x: x[1])
    df_train.drop(["Split_Text"], axis=1, inplace=True)
    display(df_train.head())

    if subset is not None:
        df_train = df_train.sample(n=subset, random_state=42)

    return df_train

Installing llama-cpp-python...
Installing guidance...


'CMAKE_ARGS' is not recognized as an internal or external command,
operable program or batch file.


Installing completed


### Initialize the model (smallest to biggest)
- <12GB of RAM/vRAM
    - llama-2-7b-chat.Q2_K.gguf
    - llama-2-7b-chat.Q3_K_L.gguf
    - llama-2-7b-chat.Q5_K_M.gguf
- <24GB of RAM/vRAM
    - llama-2-13b-chat.Q3_K_M.gguf
    - llama-2-13b-chat.Q5_K_M.gguf
    - llama-2-13b-chat.Q6_K.gguf
- <64GB of RAM/vRAM
    - llama-2-70b-chat.Q5_K_M.gguf

In [2]:
# Initialize the model
LLAMA2 = LLAMA("llama-2-13b-chat.Q6_K.gguf")
# BLAS = 1 means there is GPU acceleration

output = LLAMA2.inference("[I like to eat apples. I like to eat oranges.]")
output["relatedness"]

'0.5'

Get spearman on df_train

In [3]:
# Iterate through the whole train df ~ 1h
df = get_data(subset=200)

results = []
for i in tqdm(range(len(df))):
    sentence_pair = f'[{df.iloc[i, 3]}, {df.iloc[i, 4]}]'
    score = df.iloc[i, 2]

    output = LLAMA2.inference(sentence_pair)

    # Append the results to the list
    results.append((score, float(output["relatedness"])))


results = pd.DataFrame(results, columns = ["Score", "Prediction"])
correlation, p_value = spearmanr(results["Score"], results["Prediction"])
print("Spearman Correlation Coefficient:", np.round(correlation, 2))

Spearman Correlation Coefficient: 0.29
