## Quality Assurance Measures

We use Kendall's tau as well as the TP, FP, TN, and FN counts to assess whether our filters are helpful.

In [None]:
from sklearn.metrics import confusion_matrix, f1_score
from scipy.stats import kendalltau

def calculate_measures(expert_annotations, llm_annotations, threshold_llm=4, threshold_expert=4):
    """Compare LLM scores against expert scores.

    Kendall's tau is computed on the raw scores. Both score sequences are then
    binarized (``score >= threshold`` -> 1, otherwise 0) and compared, with
    the expert labels passed as ground truth and the LLM labels as predictions.

    Args:
        expert_annotations: Iterable of expert scores.
        llm_annotations: Iterable of LLM scores, aligned with the expert scores.
        threshold_llm: Minimum LLM score that counts as a positive label.
        threshold_expert: Minimum expert score that counts as a positive label.

    Returns:
        Tuple ``(tp, fp, tn, fn, f1, tau, p_value)``. ``tau`` and ``p_value``
        are ``None`` when Kendall's tau could not be computed.
    """
    tau, p_value = None, None
    try:
        tau, p_value = kendalltau(expert_annotations, llm_annotations)
    except (ValueError, TypeError):
        # The rank correlation is best-effort: keep (None, None) and still
        # report the classification measures below.
        pass

    llm_labels = [1 if score >= threshold_llm else 0 for score in llm_annotations]
    expert_labels = [1 if score >= threshold_expert else 0 for score in expert_annotations]

    # labels=[0, 1] forces a 2x2 matrix even when only one class is present;
    # without it a single-class input yields a 1x1 matrix and the four-way
    # unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(expert_labels, llm_labels, labels=[0, 1]).ravel()
    f1 = f1_score(expert_labels, llm_labels)

    return tp, fp, tn, fn, f1, tau, p_value

## Utility functions

In [None]:
import re

def get_score(response: str, score: str = "Final") -> int:
    """Extract the integer that follows ``"<score> Score:"`` in *response*.

    The first run of digits after the label is returned, even if it appears
    on a later line (the pattern is matched with ``re.DOTALL``).

    Args:
        response: Text to search.
        score: Label preceding ``" Score:"``; it is matched literally.

    Returns:
        The first integer found after the label.

    Raises:
        ValueError: If the label, or a digit after it, is not found.
    """
    # Escape the label so regex metacharacters in it (e.g. "+", "(", ".")
    # are matched literally instead of being interpreted as pattern syntax.
    match = re.search(rf"{re.escape(score)} Score:.*?(\d+)", response, re.DOTALL)
    if not match:
        raise ValueError(f"Score not found in the response: {response}")
    return int(match.group(1))

# Quick sanity check of the parser: only the first integer after the label is taken, so "3/5" yields 3.
print(get_score("Final Score: 3/5"))

# Filtering

In [None]:
# Prompt asking the LLM to grade a question-answer pair on three criteria
# (comprehensiveness, conciseness, answer fluency), each on a 1-5 scale.
# {question} and {answer} are the placeholders filled in per row; the doubled
# braces are presumably escapes that come out as literal braces after
# formatting (verify against the template engine in use).
# The "<Criterion> Score:" labels are what get_score() searches for in the
# response, so they must stay in sync with the labels passed to get_score().
FILTERING_PROMPT = """
You are a strict legal expert judging ECHR legal question-answer pairs. The answer might be bad, so be strict!

Question: {question}
Potential Answer: {answer}

You MUST answer each question in full sentences!

The response MUST follow this template:
Comprehensiveness Analysis: {{Go through the answer and analyze how well it answers the question. Does is cover all angles of the question?}}
Comprehensiveness Score: {{A score from 1 (not comprehensive at all) to 5 (extremely comprehensive)}}
Conciseness: {{Is there any part in the answer irrelevant / unrelated to the question? If so, what is unneeded?}}
Conciseness Score: {{A score from 1 (not concise at all) to 5 (extremely concise)}}
Answer Fluency: {{Are there any bad sentence transitions in the answer? Are the sentences ordered correctly? Does the answer start with text clearly continuing previous text that is not there?}}
Answer Fluency Score: {{A score from 1 (not fluent) to 5 (perfectly fluent)}}
"""

In [None]:
from langchain_openai import ChatOpenAI
from langchain.prompts.prompt import PromptTemplate
import pandas as pd

# Score every annotated QA pair with the LLM, store the LLM scores next to the
# expert scores, and print running agreement measures after each row.
df = pd.read_csv("data/annotated_qa_pairs.csv")

llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

# The template text is identical for every row, so build it once.
prompt_template = PromptTemplate.from_template(FILTERING_PROMPT)

expert_annotations = []
llm_annotations = []

for i, row in df.iterrows():
    question = row["question"]
    answer = row["answer"]

    prompt = prompt_template.format(question=question, answer=answer)

    result = llm.invoke(prompt)

    # Parse each criterion once; the values are reused for the aggregate
    # score and for the per-criterion columns written below.
    llm_fluency = get_score(result.content, "Answer Fluency")
    llm_conciseness = get_score(result.content, "Conciseness")
    llm_comprehensiveness = get_score(result.content, "Comprehensiveness")

    fluency = row["answer_fluency"]
    conciseness = row["conciseness"]
    comprehensiveness = row["comprehensiveness"]

    # A pair is only as good as its weakest criterion.
    expert_annotation = min(fluency, conciseness, comprehensiveness)
    llm_annotation = min(llm_fluency, llm_conciseness, llm_comprehensiveness)

    print(f"Expert: {expert_annotation}, LLM: {llm_annotation}")

    df.loc[i, "expert_annotation_min"] = expert_annotation
    df.loc[i, "llm_annotation"] = llm_annotation
    df.loc[i, "llm_answer_fluency"] = llm_fluency
    df.loc[i, "llm_conciseness"] = llm_conciseness
    df.loc[i, "llm_comprehensiveness"] = llm_comprehensiveness
    df.loc[i, "llm_response"] = result.content
    # Written after every row so the results gathered so far are on disk
    # even if a later iteration fails.
    df.to_csv("data/annotated_qa_pairs_with_llm_annotations.csv", index=False)

    llm_annotations.append(llm_annotation)
    expert_annotations.append(expert_annotation)

    # Running measures are informational only: report a failure and move on
    # to the next row instead of aborting the annotation run.
    try:
        tp, fp, tn, fn, f1, tau, p_value = calculate_measures(expert_annotations, llm_annotations)

        print("EXPERT:", expert_annotations)
        print("LLM   :", llm_annotations)
        print(f"F1 Score: {f1}, True Positives: {tp}, False Positives: {fp}, True Negatives: {tn}, False Negatives: {fn}")
        print(f"Kendall's Tau: {tau}, p-value: {p_value}")
        print("Percentage good before filter:", (tp + fn) / (tp + fp + tn + fn))
        print("Percentage good after filter:", tp / (tp + fp))
    except Exception as e:
        print(str(e))

## Evaluation of results

In [None]:
import pandas as pd

# Report agreement between expert and LLM scores, overall (minimum over the
# three criteria) and per criterion, from the saved annotation file.
df = pd.read_csv("data/annotated_qa_pairs_with_llm_annotations.csv")

print(df["prompt_id"].value_counts())

# df = df[(df["prompt_id"] != "single-paragraph") & (df["prompt_id"] != "multiple-paragraphs") & (df["prompt_id"] != "sentence-level-cot")]

# (section title, expert score column, LLM score column)
report_sections = (
    ("OVERALL", "expert_annotation_min", "llm_annotation"),
    ("CONCISENESS", "conciseness", "llm_conciseness"),
    ("COMPREHENSIVENESS", "comprehensiveness", "llm_comprehensiveness"),
    ("FLUENCY", "answer_fluency", "llm_answer_fluency"),
)

for title, expert_column, llm_column in report_sections:
    tp, fp, tn, fn, f1, tau, p_value = calculate_measures(df[expert_column], df[llm_column], threshold_llm=4, threshold_expert=4)

    print(title)
    # "&"-separated and terminated by "\\", i.e. laid out like a LaTeX table row.
    print(f"& {f1} & {tp} & {fp} & {tn} & {fn} & {tau} & {p_value} \\\\")
    print("Percentage good before filter:", (tp + fn) / (tp + fp + tn + fn))
    # If nothing passed the filter there is no "after" population; report NaN
    # rather than dividing by zero.
    predicted_positive = tp + fp
    print("Percentage good after filter:", tp / predicted_positive if predicted_positive else float("nan"))
    # Blank line after every section, so all four are separated the same way.
    print()
