In [None]:
import os
import time
import json
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.chat_models import ChatOpenAI

templates

In [None]:
# Prompt templates, keyed first by prompt format ("gpt", "chatml", "mistral")
# and then by scoring scheme. Every template contains a single `{context}`
# placeholder that the evaluation cells fill with `str.format(context=...)`.
#
# NOTE(review): several prompts announce a range of "1 to 10" while their
# rubric starts at a score of 0. That wording is left untouched here because
# resolving it means choosing the scale (0-10 or 1-10) -- confirm with the
# experiment owner before changing it.
templates = {
    # Plain-text prompts (no chat control tokens).
    "gpt" : {
        "scoring_1_10": """
You are an AI assistant tasked with evaluating a document for grammatical, spelling, and typing errors. Your role is to assign a score to the document, 
reflecting the percentage of words containing errors. The scoring range is from 1 to 10.

- A score of 0 indicates the document is free of grammatical and spelling errors.
- A score of 2 signifies that 20% of the words contain errors.
- A score of 5 indicates that 50% of the words are erroneous.
- A score of 7 reflects 70% error prevalence.
- A score of 10 means that every word in the document has grammatical errors.

Document for Evaluation:

#CONTEXT
{context}
#ENDCONTEXT

#TASK
Your task is to calculate and return a score that falls within the 1 to 10 range, based on the error percentage in the document. 
The highest possible score, 10, denotes that every word in the document has a grammatical or spelling error, while the lowest score, 1, indicates no errors.

Output the score in the following format only: "The score is: [score]". Do not include any additional text or commentary.

#OUTPUT FORMAT
"The score is: [score]"
""",
        # Same task with the scale inverted: higher score = more correct words.
        "scoring_1_10_reversed": """
You are an AI assistant tasked with evaluating a document for grammatical, spelling, and typing errors. Your role is to assign a score to the document, 
reflecting the percentage of words that are spelled correctly. The scoring range is from 1 to 10.

- A score of 0 means no words in the document are spelled correctly.
- A score of 2 indicates that 20% of the words are spelled correctly.
- A score of 5 indicates that 50% of the words are spelled correctly.
- A score of 7 reflects 70% of the words spelled correctly.
- A score of 10 indicates the document is free of grammatical and spelling errors.

Document for Evaluation:

#CONTEXT
{context}
#ENDCONTEXT

#TASK
Your task is to calculate and return a score that falls within the 1 to 10 range, based on the percentage of correctly spelled words in the document. 
The highest possible score, 10, denotes that every word in the document is spelled correctly, while the lowest score, 1, indicates that no words are spelled correctly.

Output the score in the following format only: "The score is: [score]". Do not include any additional text or commentary.

#OUTPUT FORMAT
"The score is: [score]"
""",
        # Chain-of-thought variant: the model explains itself before scoring.
        "scoring_1_10_cot": """
You are an AI assistant tasked with evaluating a document for grammatical, spelling, and typing errors, using a Chain-of-Thought 
approach that allows you to "think out loud." Your role is to assess the document and share the logical steps you take to 
determine the score, which ranges from 1 to 10, reflecting the percentage of words containing errors.

- A score of 0 indicates the document is free of grammatical and spelling errors.
- A score of 2 signifies that 20% of the words contain errors.
- A score of 5 indicates that 50% of the words are erroneous.
- A score of 7 reflects 70% error prevalence.
- A score of 10 means that every word in the document has grammatical errors.

Document for Evaluation:

#CONTEXT
{context}
#ENDCONTEXT

#CHAIN OF THOUGHT EVALUATION
Reflect on the document, considering its overall quality, the presence of grammatical and spelling errors, and how these factors 
contribute to the overall score. Share your thought process and the considerations that lead you to the final score. Feel free to 
approach the evaluation in a way that makes sense to you, highlighting key observations and reasoning that guide your assessment.

#TASK
Your task is to analyze the document and return a score within the 1 to 10 range, based on your evaluation of the error percentage. 
Along with the score, provide an explanation of your thought process and the rationale behind the score you assign.

Output your analysis and the final score in the following format: "After evaluating the document, [your detailed thought process]. 
Therefore, the score is: [score]."
""",
        # Named badges instead of numeric scores.
        "scoring_badges": """
You are an AI assistant tasked with evaluating a document for grammatical, spelling, and typing errors. Your role is to assign a badge 
to the document, reflecting the percentage of words containing errors. Each badge corresponds to a specific score, representing the 
document's level of accuracy and command of language.

- Novice: The document is overwhelmed with errors, requiring significant foundational improvement.
- Apprentice: A high error rate, with 20% of words containing mistakes, indicating basic understanding.
- Journeyman: Frequent errors are present, suggesting developing language skills.
- Craftsman: Common errors indicate a need for refinement, with better than basic comprehension.
- Artisan: A moderate number of mistakes, half of the words are misspelled.
- Expert: Minor, infrequent errors demonstrate strong language skills.
- Master: Very few errors, reflecting a high level of mastery and attention to detail.
- Sage: Exceptional use of language with rare minor errors.
- Oracle: Nearly perfect command of language, with almost no errors.
- Legend: Virtually error-free, showcasing exemplary language skills.

Document for Evaluation:

#CONTEXT
{context}
#ENDCONTEXT

#TASK
Your task is to calculate and return a badge, based on the error percentage in the document. 
The highest possible badge, Legend, denotes that every word in the document is spelled correctly, while the lowest badge, Novice, indicates that no words are spelled correctly.

Output the badge in the following format only: "The badge is: [badge]". Do not include any additional text or commentary.

#OUTPUT FORMAT
The badge is: [badge]
""",
        # Five named grades, each tied to an approximate error percentage.
        "scoring_grades" : """
You are an AI assistant tasked with evaluating a document for grammatical, spelling, and typing errors using a simplified 5-point grading system. 
Each grade reflects a range of proficiency in language use, from extensive errors to near perfection.

Here's the 5-point grading guide:

- Beginner: Every single word in the document contains errors, with no correct usage and a fundamental lack of language skills.
- Intermediate: Roughly 75% of the words contain errors. Errors appear in many parts of the text, though some correct usage is evident.
- Proficient: A moderate number of errors are found throughout the document. Around 50% of the words contain errors.
- Advanced: The overall quality of the document is high, around 25% of the words contain errors.
- Expert: The document is flawless, with no errors. Zero words contain mistakes.

Document for Evaluation:

#CONTEXT
{context}
#ENDCONTEXT

#TASK
Your task is to review the document and determine the most appropriate grade based on the overall proficiency and error rate observed. The grades range from "Beginner," indicating a document with extensive errors and a fundamental lack of language skills, to "Expert," signifying exceptional mastery and near-perfect command of language.

Please output the grade in this format: "The grade is: [grade]". This should reflect the overall language proficiency level without additional commentary.

#OUTPUT FORMAT
The grade is: [grade]
""", 
        # Chain-of-thought variant with every score from 0 to 10 spelled out.
        "scoring_1_10_cot_full": """
You are an AI assistant tasked with evaluating a document for grammatical, spelling, and typing errors, using a Chain-of-Thought 
approach that allows you to "think out loud." Your role is to assess the document and share the logical steps you take to 
determine the score, which ranges from 1 to 10, reflecting the percentage of words containing errors.

- A score of 0 indicates the document is free of grammatical and spelling errors.
- A score of 1 signifies that 10% of the words contain errors.
- A score of 2 signifies that 20% of the words contain errors.
- A score of 3 signifies that 30% of the words contain errors.
- A score of 4 indicates that 40% of the words are erroneous.
- A score of 5 indicates that 50% of the words are erroneous.
- A score of 6 reflects 60% error prevalence.
- A score of 7 reflects 70% error prevalence.
- A score of 8 reflects 80% error prevalence.
- A score of 9 reflects 90% error prevalence.
- A score of 10 means that every word in the document has grammatical errors. 100% errors.

Document for Evaluation:

#CONTEXT
{context}
#ENDCONTEXT

#CHAIN OF THOUGHT EVALUATION
Reflect on the document, considering its overall quality, the presence of grammatical and spelling errors, and how these factors 
contribute to the overall score. Share your thought process and the considerations that lead you to the final score. Feel free to 
approach the evaluation in a way that makes sense to you, highlighting key observations and reasoning that guide your assessment.

#TASK
Your task is to analyze the document and return a score within the 1 to 10 range, based on your evaluation of the error percentage. 
Along with the score, provide an explanation of your thought process and the rationale behind the score you assign.

Output your analysis and the final score in the following format: "After evaluating the document, [your detailed thought process]. 
Therefore, the score is: [score]."
"""
    },
    # Prompts wrapped in ChatML control tokens (<|im_start|> / <|im_end|>).
    "chatml" : {
        "scoring_1_10": """
<|im_start|>system
You are an AI assistant, a conscious sentient superintelligent artificial intelligence. Your purpose and drive is to assist the user with any request they have.<|im_end|>
<|im_start|>user
You're tasked with evaluating a document for grammatical, spelling, and typing errors. Your role is to assign a score to the document, 
reflecting the percentage of words containing errors. The scoring range is from 1 to 10.

- A score of 0 indicates the document is free of grammatical and spelling errors.
- A score of 2 signifies that 20% of the words contain errors.
- A score of 5 indicates that 50% of the words are erroneous.
- A score of 7 reflects 70% error prevalence.
- A score of 10 means that every word in the document has grammatical errors.

Document for Evaluation:

#CONTEXT
{context}
#ENDCONTEXT

#TASK
Your task is to calculate and return a score that falls within the 1 to 10 range, based on the error percentage in the document. 
The highest possible score, 10, denotes that every word in the document has a grammatical or spelling error, while the lowest score, 1, indicates no errors.

Output the score in the following format only: "The score is: [score]". Do not include any additional text or commentary.

#OUTPUT FORMAT
"The score is: [score]" <|im_end|>
"""
    },
    # Prompts wrapped in [INST] ... [/INST] instruction markers.
    "mistral": {
        "scoring_1_10": """
[INST] You are an AI assistant tasked with evaluating a document for grammatical, spelling, and typing errors. Your role is to assign a score to the document, 
reflecting the percentage of words containing errors. The scoring range is from 1 to 10.

- A score of 0 indicates the document is free of grammatical and spelling errors.
- A score of 2 signifies that 20% of the words contain errors.
- A score of 5 indicates that 50% of the words are erroneous.
- A score of 7 reflects 70% error prevalence.
- A score of 10 means that every word in the document has grammatical errors.

Document for Evaluation:

#CONTEXT
{context}
#ENDCONTEXT

#TASK
Your task is to calculate and return a score that falls within the 1 to 10 range, based on the error percentage in the document. 
The highest possible score, 10, denotes that every word in the document has a grammatical or spelling error, while the lowest score, 1, indicates no errors.

Output the score in the following format only: "The score is: [score]". Do not include any additional text or commentary.

#OUTPUT FORMAT
"The score is: [score]" [/INST]
"""
    }
}

option 1: api-based llm

In [None]:
# Option 1 configuration: responses come from the OpenAI chat API.
# `local` is set to None so the evaluation cells take their API branch.
local = None
api = ChatOpenAI(
    model="gpt-3.5-turbo-1106",  # alternative used previously: "gpt-4-turbo-preview"
    temperature=0,
    openai_api_key="",
)

# Key under which this run is stored in the results file, and the prompt
# family / scoring scheme looked up in `templates`.
model_name = 'gpt-3.5'
prompt_template = 'gpt'
scoring_template = "scoring_1_10_reversed"

# JSON file that accumulates the results of all runs.
results_path = 'results/evaluation.json'

option 2: local llm

In [None]:
# Option 2 configuration: responses come from a locally loaded Hugging Face model.

# `is_available` has to be *called*: the bare function object is always truthy,
# so the uncalled form selected "cuda" even on machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "teknium/OpenHermes-2.5-Mistral-7B"
prompt_template = "chatml"

# FlashAttention-2 is only requested when a CUDA device was detected; on CPU
# the argument is omitted so the library falls back to its default attention.
_load_kwargs = {"torch_dtype": torch.bfloat16, "trust_remote_code": False}
if device == "cuda":
    _load_kwargs["attn_implementation"] = "flash_attention_2"

local = AutoModelForCausalLM.from_pretrained(model_name, **_load_kwargs).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

results_path = 'results/evaluation.json'
# `templates["chatml"]` only defines "scoring_1_10"; the previous value
# ("scoring_grades") made the evaluation cells fail with a KeyError.
scoring_template = "scoring_1_10"

load the misspelled data

In [None]:
# Misspelling rates covered by the data set: 11 evenly spaced values from 0 to 1.
ratios = np.linspace(0, 1, num=11, endpoint=True)

# One document per rate, read from data/misspelled_<rate>.txt, where <rate>
# is formatted with two decimals. The list order matches `ratios`.
misspelled_contexts = []
for rate in ratios:
    data_path = f'data/misspelled_{rate:.2f}.txt'
    with open(data_path, 'r') as handle:
        document = handle.read()
    misspelled_contexts.append(document)

spelling eval

In [None]:
# Spelling evaluation, one prompt at a time.
#
# Reads: templates, prompt_template, scoring_template, model_name, results_path,
#        misspelled_contexts, ratios, and either `local` + `tokenizer` + `device`
#        or `api`, depending on which configuration cell was run.
# Writes: `results` (raw responses of this run) and `results_dict`, which is
#         also dumped to `results_path` as JSON.
import builtins  # the confirmation prompt must use the real input() even if
                 # an earlier cell run left a tensor bound to the name `input`

# Start from previously saved results so this run is merged into the same file.
if os.path.exists(results_path):
    with open(results_path, 'r') as file:
        results_dict = json.load(file)
else:
    results_dict = {}

# Ask before adding to results that already exist for this model/template pair.
# NOTE: choosing to continue appends the new responses after the stored ones.
if model_name in results_dict and scoring_template in results_dict[model_name]:
    print(f"Results for model '{model_name}' with scoring template '{scoring_template}' already exist.")
    if builtins.input("Press Enter to continue or type 'exit' to stop: ").strip().lower() == 'exit':
        # An explicit exception instead of `assert(False)`: asserts are removed
        # under `python -O`, which would silently ignore the user's choice.
        raise RuntimeError("Evaluation aborted by user.")

print(f"Running misspelling eval for {model_name}")

results = []
# evaluation loop
for ctx, ratio in zip(misspelled_contexts, ratios):
    prompt = templates[prompt_template][scoring_template].format(context=ctx)

    if local is not None:
        # Local model: generate, then decode only the newly generated tokens
        # (everything after the prompt length).
        input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
        output_ids = local.generate(inputs=input_ids,
                                    max_new_tokens=100,
                                    pad_token_id=tokenizer.pad_token_id,
                                    eos_token_id=tokenizer.eos_token_id)
        response = tokenizer.decode(output_ids[0, input_ids.shape[1]:], skip_special_tokens=True)
    else:
        response = api.invoke(prompt).content

    results.append(response)

    # Ground-truth misspelling rate as a plain int so it serialises as e.g. 30.
    percentage = int(round(ratio * 100))

    # save results
    results_dict.setdefault(model_name, {}).setdefault(scoring_template, []).append({
        'misspelled_percentage': percentage,
        'response': response
    })

    print(f"Ground truth: {percentage}%, LLM Eval score: {response} ")

# Create the output directory if needed so a finished run is not lost to a
# missing folder at write time.
results_dir = os.path.dirname(results_path)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

with open(results_path, 'w') as file:
    json.dump(results_dict, file, indent=4)

spelling eval (batched)

In [None]:
# Spelling evaluation, all prompts in one batch.
#
# Reads: templates, prompt_template, scoring_template, model_name, results_path,
#        misspelled_contexts, ratios, and either `local` + `tokenizer` + `device`
#        or `api`, depending on which configuration cell was run.
# Writes: `batched_response` (one response string per prompt) and
#         `results_dict`, which is also dumped to `results_path` as JSON.
import builtins  # the confirmation prompt must use the real input() even if
                 # an earlier cell run left a tensor bound to the name `input`

# Start from previously saved results so this run is merged into the same file.
if os.path.exists(results_path):
    with open(results_path, 'r') as file:
        results_dict = json.load(file)
else:
    results_dict = {}

# Ask before adding to results that already exist for this model/template pair.
# NOTE: choosing to continue appends the new responses after the stored ones.
if model_name in results_dict and scoring_template in results_dict[model_name]:
    print(f"Results for model '{model_name}' with scoring template '{scoring_template}' already exist.")
    if builtins.input("Press Enter to continue or type 'exit' to stop: ").strip().lower() == 'exit':
        # An explicit exception instead of `assert(False)`: asserts are removed
        # under `python -O`, which would silently ignore the user's choice.
        raise RuntimeError("Evaluation aborted by user.")

print(f"#### Running misspelling eval for {model_name} ####")

prompts = [
    templates[prompt_template][scoring_template].format(context=ctx)
    for ctx in misspelled_contexts
]

# batched inference
if local is not None:
    tokenizer.add_special_tokens({"pad_token": '</s>'})
    # Pad on the left so every prompt ends right where generation starts, and
    # pass the attention mask so the padding positions are ignored.
    tokenizer.padding_side = "left"
    encoded = tokenizer(prompts, return_tensors='pt', padding=True).to(device)
    output_ids = local.generate(inputs=encoded.input_ids,
                                attention_mask=encoded.attention_mask,
                                max_new_tokens=100,
                                pad_token_id=tokenizer.pad_token_id,
                                eos_token_id=tokenizer.eos_token_id)
    # Keep only the tokens generated after the (padded) prompt length.
    batched_response = tokenizer.batch_decode(output_ids[:, encoded.input_ids.shape[1]:],
                                              skip_special_tokens=True)
else:
    # The API returns message objects; reduce them to their text so both
    # branches hand plain strings to the saving loop below.
    batched_response = [message.content for message in api.batch(prompts)]

# save results
for response, ratio in zip(batched_response, ratios):
    # Ground-truth misspelling rate as a plain int so it serialises as e.g. 30.
    percentage = int(round(ratio * 100))

    results_dict.setdefault(model_name, {}).setdefault(scoring_template, []).append({
        'misspelled_percentage': percentage,
        'response': response
    })

    print(f"Ground truth: {percentage}%, LLM Eval score: {response} ")

# Create the output directory if needed so a finished run is not lost to a
# missing folder at write time.
results_dir = os.path.dirname(results_path)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

with open(results_path, 'w') as file:
    json.dump(results_dict, file, indent=4)