In [23]:
import psutil
import json
from tqdm import tqdm
import urllib.request

In [24]:
def check_if_running(process_name):
    running = False
    for proc in psutil.process_iter(["name"]):
        if process_name in proc.info["name"]:
            running = True
            break
    return running

ollama_running = check_if_running("ollama")

if not ollama_running:
    raise RuntimeError("Ollama not running. Launch ollama before proceeding.")
print("Ollama running:", check_if_running("ollama"))

Ollama running: True


In [25]:
file_path = "instruction-data-with-response.json"

with open(file_path, "r") as file:
    test_data = json.load(file)


def format_input(entry):
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    input_text = f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""

    return instruction_text + input_text

In [26]:
def query_model(prompt, model="llama3.2", url="http://localhost:11434/api/chat"):
    data = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "options": {   
            "seed": 123,
            "temperature": 0,
            "num_ctx": 2048
        }
    }

    payload = json.dumps(data).encode("utf-8")
    
    request = urllib.request.Request(url, data=payload, method="POST")
    request.add_header("Content-Type", "application/json")

    response_data = ""
    with urllib.request.urlopen(request) as response:
        while True:
            line = response.readline().decode("utf-8")
            if not line:
                break
            response_json = json.loads(line)
            response_data += response_json["message"]["content"]

    return response_data


model = "llama3.2"
result = query_model("what is your favorite ice cream flavor?", model)
print(result)

I don't have personal preferences or taste buds, but I can suggest some popular ice cream flavors that people enjoy:

1. Vanilla
2. Chocolate
3. Strawberry
4. Cookies and Cream
5. Mint Chocolate Chip

What's your favorite ice cream flavor?


In [27]:
for entry in test_data[:3]:
    prompt = (
        f"Given the input `{format_input(entry)}` "
        f"and correct output `{entry['output']}`, "
        f"score the model response `{entry['model_response']}`"
        f" on a scale from 0 to 100, where 100 is the best score. "
    )
    print("\nDataset response:")
    print(">>", entry['output'])
    print("\nModel response:")
    print(">>", entry["model_response"])
    print("\nScore:")
    print(">>", query_model(prompt))
    print("\n-------------------------")


Dataset response:
>> The car is as fast as lightning.

Model response:
>> The car is as fast as a cheetah.

Score:
>> I would rate the model response "The car is as fast as a cheetah." a score of 80 out of 100.

Here's why:

* The response uses a simile, which is a common literary device used to compare two things.
* The comparison is between the speed of the car and that of a cheetah, which is known for its incredible speed.
* However, the model response could be improved by making the comparison more vivid and evocative. For example, "The car is as fast as lightning" creates a stronger image in the reader's mind.

Overall, while the model response is not bad, it could benefit from a bit more creativity and precision to make it truly effective.

-------------------------

Dataset response:
>> The type of cloud typically associated with thunderstorms is cumulonimbus.

Model response:
>> The type of cloud associated with thunderstorms is a cumulus cloud.

Score:
>> I would rate the mod

In [28]:
for entry in test_data[:3]:
    prompt = (
        f"You are an evaluation model."
        f"You will be given three inputs:"
        f"1. The original prompt."
        f"2. The correct response."
        f"3. The generated response."
        f"Your task is to compare the generated response with the correct response and assign a rating from 1 to 5, based on how accurate and close the generated response is to the correct response."
        f"Scoring guidelines:"
        f"5 = Perfectly accurate or nearly identical"
        f"4 = Mostly correct, minor issues"
        f"3 = Partially correct, significant issues"
        f"2 = Mostly incorrect, small resemblance"
        f"1 = Completely wrong or irrelevant"
        f"Important rules:"
        f"- Respond with ONLY the integer rating (1, 2, 3, 4, or 5)."
        f"- Do NOT provide explanations, reasoning, or extra words."
        f"original prompt: {format_input(entry)}"
        f"correct response: {entry['output']}"
        f"generated response: {entry['model_response']}"
    )
    
    print("\nDataset response:")
    print(">>", entry['output'])
    print("\nModel response:")
    print(">>", entry["model_response"])
    print("\nScore:")
    print(">>", query_model(prompt))
    print("\n-------------------------")


Dataset response:
>> The car is as fast as lightning.

Model response:
>> The car is as fast as a cheetah.

Score:
>> 4

-------------------------

Dataset response:
>> The type of cloud typically associated with thunderstorms is cumulonimbus.

Model response:
>> The type of cloud associated with thunderstorms is a cumulus cloud.

Score:
>> 3

-------------------------

Dataset response:
>> Jane Austen.

Model response:
>> The author of 'Pride and Prejudice' is Jane Austen.

Score:
>> 5

-------------------------


In [30]:
def generate_model_scores(json_data, json_key, model="llama3.2"):
    scores = []
    for entry in tqdm(json_data, desc="Scoring entries"):
        prompt = (
        f"You are an evaluation model."
        f"You will be given three inputs:"
        f"1. The original prompt."
        f"2. The correct response."
        f"3. The generated response."
        f"Your task is to compare the generated response with the correct response and assign a rating from 1 to 5, based on how accurate and close the generated response is to the correct response."
        f"Scoring guidelines:"
        f"5 = Perfectly accurate or nearly identical"
        f"4 = Mostly correct, minor issues"
        f"3 = Partially correct, significant issues"
        f"2 = Mostly incorrect, small resemblance"
        f"1 = Completely wrong or irrelevant"
        f"Important rules:"
        f"- Respond with ONLY the integer rating (1, 2, 3, 4, or 5)."
        f"- Do NOT provide explanations, reasoning, or extra words."
        f"original prompt: {format_input(entry)}"
        f"correct response: {entry['output']}"
        f"generated response: {entry[json_key]}"
        )
        score = query_model(prompt, model)
        try:
            scores.append(int(score))
        except ValueError:
            print(f"Could not convert score: {score}")
            continue

    return scores


scores = generate_model_scores(test_data, "model_response")
print(f"Number of scores: {len(scores)} of {len(test_data)}")
print(f"Average score: {sum(scores)/len(scores):.2f}\n")

Scoring entries: 100%|█████████████████████████████████████████████████████████████| 110/110 [04:43<00:00,  2.58s/it]

Number of scores: 110 of 110
Average score: 3.54




