In [1]:
import psutil

# Function to get memory usage in MB
def get_memory_usage():
    process = psutil.Process()
    memory_usage = process.memory_info().rss / 1024 / 1024  # in MB
    return memory_usage


In [3]:
import time
from llama_cpp import Llama
import sacrebleu
from nltk.tokenize import word_tokenize

# Initialize the Llama model with the correct path to your model
llm = Llama(
    model_path=r"C:\Users\acer\Documents\zephyr-7b-beta.Q4_K_M.gguf"
    # You can uncomment and adjust parameters like n_gpu_layers, seed, or n_ctx here if needed
)

# Define your prompt for evaluation
prompt = "Q: tell me a story about a brave knight A: "

# Measure memory usage before inference
memory_before = get_memory_usage()

# Start measuring time
start_time = time.time()
# Generate model output
output = llm(
    prompt=prompt,
    max_tokens=100,  # Generate up to 100 tokens
    stop=["Q:", "\n"],  # Stop generating before the model generates a new question or newline
    echo=True,  # Echo the prompt back in the output
    temperature=0.7,  # Adjust temperature for randomness (0.7 is moderately random)
    top_p=0.9  # Adjust top_p for diversity (0.9 means considering the top 90% of probability mass)
)
# End measuring time
end_time = time.time()
# Measure memory usage after inference
memory_after = get_memory_usage()
# Calculate inference time
inference_time = end_time - start_time
memory_used = memory_after - memory_before
# Extract generated text from the output
generated_output = output['choices'][0]['text']

# Reference text (ground truth) for evaluation
reference = "A brave knight fought valiantly against the dragon."

# Tokenize generated output and reference
generated_tokens = word_tokenize(generated_output)
reference_tokens = word_tokenize(reference)

# Compute BLEU score using SacreBLEU
bleu = sacrebleu.corpus_bleu([generated_output], [[reference]])

# Print the generated output and BLEU score
print(f"Generated Output:\n{generated_output}\n")
print(f"Reference:\n{reference}\n")
print(f"BLEU Score: {bleu.score}")
print(f"Inference Time: {inference_time} seconds")
# Print memory usage
print(f"Memory Usage Before Inference: {memory_before:.2f} MB")
print(f"Memory Usage After Inference: {memory_after:.2f} MB")
print(f"Memory Used: {memory_used:.2f} MB")

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from C:\Users\acer\Documents\zephyr-7b-beta.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = huggingfaceh4_zephyr-7b-beta
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:          

Generated Output:
Q: tell me a story about a brave knight A:  Once upon a time, in a land far away, there was a brave knight named Sir Alexander. He was tall and strong with a heart full of courage and determination. His kingdom was threatened by an evil sorcerer who had cast a dark spell on the land, turning the fields barren and the skies gray. The people were suffering and Sir Alexander knew he had to act. With his trusty steed by his side, he set off on a dangerous quest to defeat the sor

Reference:
A brave knight fought valiantly against the dragon.

BLEU Score: 0.859122307088785
Inference Time: 36.90883660316467 seconds
Memory Usage Before Inference: 273.73 MB
Memory Usage After Inference: 4385.82 MB
Memory Used: 4112.09 MB


In [5]:
from rouge import Rouge

rouge = Rouge()

# Convert tokens back to strings for ROUGE evaluation
generated_text = ' '.join(generated_tokens)
reference_text = ' '.join(reference_tokens)

# Compute ROUGE scores
rouge_scores = rouge.get_scores(generated_text, reference_text)

# Print ROUGE scores
print(f"ROUGE Scores:")
for metric, scores in rouge_scores[0].items():
    print(f"{metric}: {scores['f']}")


ROUGE Scores:
rouge-1: 0.10126582096458903
rouge-2: 0.01923076797522198
rouge-l: 0.10126582096458903


In [7]:
import os

# Path to your Llama model
model_path = r"C:\Users\acer\Documents\zephyr-7b-beta.Q4_K_M.gguf"

# Function to get size of a file or directory
def get_model_size(path):
    # Check if path is a file
    if os.path.isfile(path):
        return os.path.getsize(path) / (1024 * 1024)  # in MB
    # Check if path is a directory
    elif os.path.isdir(path):
        total_size = 0
        for dirpath, _, filenames in os.walk(path):
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                total_size += os.path.getsize(filepath)
        return total_size / (1024 * 1024)  # in MB
    else:
        raise ValueError(f"Path '{path}' is not a valid file or directory.")

# Get size of the Llama model
model_size = get_model_size(model_path)

# Print model size
print(f"Model Size: {model_size:.2f} MB")


Model Size: 4166.07 MB


In [33]:
!python Documents/llama.cpp/convert_hf_to_gguf.py ./documents/saved_model --outfile llama2_7b_chat.gguf --outtype f16

INFO:hf-to-gguf:Loading model: saved_model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Set model parameters
INFO:hf-to-gguf:gguf: context length = 4096
INFO:hf-to-gguf:gguf: embedding length = 4096
INFO:hf-to-gguf:gguf: feed forward length = 11008
INFO:hf-to-gguf:gguf: head count = 32
INFO:hf-to-gguf:gguf: key-value head count = 32
INFO:hf-to-gguf:gguf: rope theta = 10000.0
INFO:hf-to-gguf:gguf: rms norm epsilon = 1e-05
INFO:hf-to-gguf:gguf: file type = 1
INFO:hf-to-gguf:Set model tokenizer
INFO:gguf.vocab:Setting special token type bos to 1
INFO:gguf.vocab:Setting special token type eos to 2
INFO:gguf.vocab:Setting special token type unk to 0
INFO:gguf.vocab:Setting add_bos_token to True
INFO:gguf.vocab:Setting add_eos_token to False
INFO:gguf.vocab:Setting chat_template to {% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{%

In [37]:
import time
from llama_cpp import Llama
import sacrebleu
from nltk.tokenize import word_tokenize

# Initialize the Llama model with the correct path to your model
llm = Llama(
    model_path=r"C:\Users\acer\Documents\llama2_7b_chat.gguf"
    # You can uncomment and adjust parameters like n_gpu_layers, seed, or n_ctx here if needed
)

# Define your prompt for evaluation
prompt = "Q: tell me a story about a brave knight A: "

# Measure memory usage before inference
memory_before = get_memory_usage()

# Start measuring time
start_time = time.time()
# Generate model output
output = llm(
    prompt=prompt,
    max_tokens=100,  # Generate up to 100 tokens
    stop=["Q:", "\n"],  # Stop generating before the model generates a new question or newline
    echo=True,  # Echo the prompt back in the output
    temperature=0.7,  # Adjust temperature for randomness (0.7 is moderately random)
    top_p=0.9  # Adjust top_p for diversity (0.9 means considering the top 90% of probability mass)
)
# End measuring time
end_time = time.time()
# Measure memory usage after inference
memory_after = get_memory_usage()
# Calculate inference time
inference_time = end_time - start_time
memory_used = memory_after - memory_before
# Extract generated text from the output
generated_output = output['choices'][0]['text']

# Reference text (ground truth) for evaluation
reference = "A brave knight fought valiantly against the dragon."

# Tokenize generated output and reference
generated_tokens = word_tokenize(generated_output)
reference_tokens = word_tokenize(reference)

# Compute BLEU score using SacreBLEU
bleu = sacrebleu.corpus_bleu([generated_output], [[reference]])

# Print the generated output and BLEU score
print(f"Generated Output:\n{generated_output}\n")
print(f"Reference:\n{reference}\n")
print(f"BLEU Score: {bleu.score}")
print(f"Inference Time: {inference_time} seconds")
# Print memory usage
print(f"Memory Usage Before Inference: {memory_before:.2f} MB")
print(f"Memory Usage After Inference: {memory_after:.2f} MB")
print(f"Memory Used: {memory_used:.2f} MB")

llama_model_loader: loaded meta data with 26 key-value pairs and 291 tensors from C:\Users\acer\Documents\llama2_7b_chat.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = saved_model
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 4096
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
llama_model_loader: - kv   7:              llama.attention.head_c

Generated Output:
Q: tell me a story about a brave knight A:  Sure! Here is a story about a brave knight named Sir Edward:

Reference:
A brave knight fought valiantly against the dragon.

BLEU Score: 3.197383344450448
Inference Time: 17.37819766998291 seconds
Memory Usage Before Inference: 538.19 MB
Memory Usage After Inference: 13146.93 MB
Memory Used: 12608.74 MB
