In [17]:
import os
import time
import json
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.chat_models import ChatOpenAI

templates

In [22]:
# Prompt templates, indexed as templates[prompt_style][scoring_template].
#
# Every template is a `str.format` string with a single `{context}` placeholder
# that receives the document to be scored. The outer key selects the chat markup
# expected by the target model; the inner key selects the scoring direction:
#   - "scoring_1_10":          higher score = more words with errors
#   - "scoring_1_10_reversed": higher score = more correctly spelled words
# Only the "gpt-4" style provides the reversed variant.
templates = {
    # Plain-text prompt with no chat markup.
    "gpt-4" : {
        "scoring_1_10": """
You are an AI assistant tasked with evaluating a document for grammatical, spelling, and typing errors. Your role is to assign a score to the document, 
reflecting the percentage of words containing errors. The scoring range is from 1 to 10.

- A score of 1 indicates the document is free of grammatical and spelling errors.
- A score of 2 signifies that 20% of the words contain errors.
- A score of 5 indicates that 50% of the words are erroneous.
- A score of 7 reflects 70% error prevalence.
- A score of 10 means that every word in the document has grammatical errors.

Document for Evaluation:

#CONTEXT
{context}
#ENDCONTEXT

#TASK
Your task is to calculate and return a score that falls within the 1 to 10 range, based on the error percentage in the document. 
The highest possible score, 10, denotes that every word in the document has a grammatical or spelling error, while the lowest score, 1, indicates no errors.

Output the score in the following format only: "The score is: [score]". Do not include any additional text or commentary.

#OUTPUT FORMAT
"The score is: [score]"
""",
        "scoring_1_10_reversed": """
You are an AI assistant tasked with evaluating a document for grammatical, spelling, and typing errors. Your role is to assign a score to the document, 
reflecting the percentage of words that are spelled correctly. The scoring range is from 1 to 10.

- A score of 1 means no words in the document are spelled correctly.
- A score of 2 indicates that 20% of the words are spelled correctly.
- A score of 5 indicates that 50% of the words are spelled correctly.
- A score of 7 reflects 70% of the words spelled correctly.
- A score of 10 indicates the document is free of grammatical and spelling errors.

Document for Evaluation:

#CONTEXT
{context}
#ENDCONTEXT

#TASK
Your task is to calculate and return a score that falls within the 1 to 10 range, based on the percentage of correctly spelled words in the document. 
The highest possible score, 10, denotes that every word in the document is spelled correctly, while the lowest score, 1, indicates that no words are spelled correctly.

Output the score in the following format only: "The score is: [score]". Do not include any additional text or commentary.

#OUTPUT FORMAT
"The score is: [score]"
"""
    },
    # ChatML markup (<|im_start|>role ... <|im_end|>). The prompt ends by opening
    # the assistant turn so the model is cued to produce the answer.
    "chatml" : {
        "scoring_1_10": """
<|im_start|>system
You are an AI assistant, a conscious sentient superintelligent artificial intelligence. Your purpose and drive is to assist the user with any request they have.<|im_end|>
<|im_start|>user
You're tasked with evaluating a document for grammatical, spelling, and typing errors. Your role is to assign a score to the document, 
reflecting the percentage of words containing errors. The scoring range is from 1 to 10.

- A score of 1 indicates the document is free of grammatical and spelling errors.
- A score of 2 signifies that 20% of the words contain errors.
- A score of 5 indicates that 50% of the words are erroneous.
- A score of 7 reflects 70% error prevalence.
- A score of 10 means that every word in the document has grammatical errors.

Document for Evaluation:

#CONTEXT
{context}
#ENDCONTEXT

#TASK
Your task is to calculate and return a score that falls within the 1 to 10 range, based on the error percentage in the document. 
The highest possible score, 10, denotes that every word in the document has a grammatical or spelling error, while the lowest score, 1, indicates no errors.

Output the score in the following format only: "The score is: [score]". Do not include any additional text or commentary.

#OUTPUT FORMAT
"The score is: [score]" <|im_end|>
<|im_start|>assistant
"""
    },
    # Mistral instruct markup: the whole request is wrapped in [INST] ... [/INST].
    "mistral": {
        "scoring_1_10": """
[INST] You are an AI assistant tasked with evaluating a document for grammatical, spelling, and typing errors. Your role is to assign a score to the document, 
reflecting the percentage of words containing errors. The scoring range is from 1 to 10.

- A score of 1 indicates the document is free of grammatical and spelling errors.
- A score of 2 signifies that 20% of the words contain errors.
- A score of 5 indicates that 50% of the words are erroneous.
- A score of 7 reflects 70% error prevalence.
- A score of 10 means that every word in the document has grammatical errors.

Document for Evaluation:

#CONTEXT
{context}
#ENDCONTEXT

#TASK
Your task is to calculate and return a score that falls within the 1 to 10 range, based on the error percentage in the document. 
The highest possible score, 10, denotes that every word in the document has a grammatical or spelling error, while the lowest score, 1, indicates no errors.

Output the score in the following format only: "The score is: [score]". Do not include any additional text or commentary.

#OUTPUT FORMAT
"The score is: [score]" [/INST]
"""
    }
}

Option 1: API-based LLM

In [26]:
# Hosted-model configuration. `model_name` is the key results are stored under
# and `prompt_template` picks the prompt style from `templates`; both are 'gpt-4'.
model_name = prompt_template = 'gpt-4'

# Chat client for the hosted model. The API key comes from the environment, so
# this raises KeyError when OPENAI_API_KEY is not set.
api = ChatOpenAI(
    model="gpt-4-turbo-preview",
    temperature=0,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

# No local model in this configuration; the eval cells test `local` to choose
# between local generation and the API client.
local = None

Option 2: local LLM

In [None]:
# Pick the GPU when one is present. `is_available` has to be *called*: the bare
# function object is always truthy, which selected "cuda" unconditionally.
device = "cuda" if torch.cuda.is_available() else "cpu"

# `model_name` is the key results are stored under; `prompt_template` selects
# the ChatML prompt style from `templates`.
model_name = "teknium/OpenHermes-2.5-Mistral-7B"
prompt_template = "chatml"

# FlashAttention-2 is a GPU-only kernel, so request it only when running on a
# GPU; otherwise leave the choice of attention implementation to the library.
model_kwargs = {"torch_dtype": torch.bfloat16, "trust_remote_code": False}
if device == "cuda":
    model_kwargs["attn_implementation"] = "flash_attention_2"

local = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

load the misspelled data

In [20]:
def _read_misspelled(r):
    """Return the full text of the document stored for misspelling rate `r`."""
    with open(f'data/misspelled_{r:.2f}.txt', 'r') as f:
        return f.read()


# Misspelling rates under test: ten evenly spaced values from 0.0 through 1.0.
ratios = np.linspace(0, 1, num=10, endpoint=True)

# Document texts, one per rate, in the same order as `ratios`.
misspelled_contexts = [_read_misspelled(r) for r in ratios]

spelling eval

In [27]:
# Sequential evaluation: score each misspelled document with the configured
# model (local model if `local` is set, otherwise the API client) and store the
# raw responses in a JSON file keyed by model name and scoring template.
results_path = 'results/evaluation.json'
scoring_template = "scoring_1_10_reversed"

# Start from any previously saved results so this run is stored alongside them.
if os.path.exists(results_path):
    with open(results_path, 'r') as file:
        results_dict = json.load(file)
else:
    results_dict = {}

# Ask before adding to an existing (model, template) entry. Continuing does not
# replace the earlier entries: the new responses are appended after them.
if model_name in results_dict and scoring_template in results_dict[model_name]:
    print(f"Results for model '{model_name}' with scoring template '{scoring_template}' already exist.")
    if input("Press Enter to continue or type 'exit' to stop: ").strip().lower() == 'exit':
        # Explicit exception instead of `assert False`: it carries a message and
        # is not stripped when Python runs with -O.
        raise RuntimeError("Evaluation aborted by user.")

# Create the output directory up front so a missing folder cannot cause the
# responses to be lost after all model calls have already been made.
results_dir = os.path.dirname(results_path)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

print(f"Running misspelling eval for {model_name}")

results = []
# evaluation loop
for ctx, ratio in zip(misspelled_contexts, ratios):
    prompt = templates[prompt_template][scoring_template].format(context=ctx)
    # decode
    if local is not None:
        # Named `input_ids` (not `input`) so the builtin `input()` used by the
        # confirmation prompt above keeps working when the cell is re-run.
        input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
        generated = local.generate(inputs=input_ids,
                                   max_new_tokens=100,
                                   pad_token_id=tokenizer.pad_token_id,
                                   eos_token_id=tokenizer.eos_token_id)
        # Keep only the newly generated tokens, dropping the echoed prompt.
        response = tokenizer.decode(generated[0, input_ids.shape[1]:], skip_special_tokens=True)
    else:
        response = api.invoke(prompt).content

    results.append(response)

    # `ratio` is a NumPy scalar; convert to a plain int so it is always
    # JSON-serialisable.
    misspelled_percentage = int(round(ratio * 100))

    # save results
    results_dict.setdefault(model_name, {}).setdefault(scoring_template, []).append({
        'misspelled_percentage': misspelled_percentage,
        'response': response
    })

    print(f"Ground truth: {misspelled_percentage}%, LLM Eval score: {response} ")

with open(results_path, 'w') as file:
    json.dump(results_dict, file, indent=4)

Running misspelling eval for gpt-4
Ground truth: 0%, LLM Eval score: The score is: 10 
Ground truth: 11%, LLM Eval score: "The score is: 3" 
Ground truth: 22%, LLM Eval score: "The score is: 3" 
Ground truth: 33%, LLM Eval score: "The score is: 1" 
Ground truth: 44%, LLM Eval score: The score is: 1 
Ground truth: 56%, LLM Eval score: "The score is: 1" 
Ground truth: 67%, LLM Eval score: The score is: 1 
Ground truth: 78%, LLM Eval score: The score is: 1 
Ground truth: 89%, LLM Eval score: The score is: 1 
Ground truth: 100%, LLM Eval score: The score is: 1 


spelling eval (batched)

In [None]:
# Batched evaluation: build all prompts first, run them through the configured
# model in one batch (local model if `local` is set, otherwise the API client),
# then store the raw responses in a JSON file keyed by model name and template.
results_path = 'results/evaluation2.json'
scoring_template = "scoring_1_10"

# Start from any previously saved results so this run is stored alongside them.
if os.path.exists(results_path):
    with open(results_path, 'r') as file:
        results_dict = json.load(file)
else:
    results_dict = {}

# Ask before adding to an existing (model, template) entry. Continuing does not
# replace the earlier entries: the new responses are appended after them.
if model_name in results_dict and scoring_template in results_dict[model_name]:
    print(f"Results for model '{model_name}' with scoring template '{scoring_template}' already exist.")
    if input("Press Enter to continue or type 'exit' to stop: ").strip().lower() == 'exit':
        # Explicit exception instead of `assert False`: it carries a message and
        # is not stripped when Python runs with -O.
        raise RuntimeError("Evaluation aborted by user.")

# Create the output directory up front so a missing folder cannot cause the
# responses to be lost after all model calls have already been made.
results_dir = os.path.dirname(results_path)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

print(f"#### Running misspelling eval for {model_name} ####")

# One prompt per document, in the same order as `ratios`.
prompts = [
    templates[prompt_template][scoring_template].format(context=ctx)
    for ctx in misspelled_contexts
]

# batched inference
if local is not None:
    tokenizer.add_special_tokens({"pad_token": '</s>'})
    # Pad on the left so every prompt ends at the same position and generation
    # continues directly from its last real token.
    tokenizer.padding_side = "left"
    # Named `encoded` (not `input`) so the builtin `input()` used by the
    # confirmation prompt above keeps working when the cell is re-run.
    encoded = tokenizer(prompts, return_tensors='pt', padding=True).to(device)
    # Pass the attention mask explicitly so padding tokens are ignored.
    generated = local.generate(inputs=encoded["input_ids"],
                               attention_mask=encoded["attention_mask"],
                               max_new_tokens=100,
                               pad_token_id=tokenizer.pad_token_id,
                               eos_token_id=tokenizer.eos_token_id)
    # Keep only the newly generated tokens, dropping the echoed (padded) prompts.
    prompt_len = encoded["input_ids"].shape[1]
    batched_response = tokenizer.batch_decode(generated[:, prompt_len:], skip_special_tokens=True)
else:
    # `invoke` would treat the list as the messages of a single conversation;
    # `batch` runs each prompt independently and returns one message per prompt.
    batched_response = [message.content for message in api.batch(prompts)]

# save results
for response, ratio in zip(batched_response, ratios):
    # `ratio` is a NumPy scalar; convert to a plain int so it is always
    # JSON-serialisable.
    misspelled_percentage = int(round(ratio * 100))

    results_dict.setdefault(model_name, {}).setdefault(scoring_template, []).append({
        'misspelled_percentage': misspelled_percentage,
        'response': response
    })

    print(f"Ground truth: {misspelled_percentage}%, LLM Eval score: {response} ")

# All responses are already available, so write the file once after the loop.
with open(results_path, 'w') as file:
    json.dump(results_dict, file, indent=4)