## Quality Assurance Measures

We use kendall tau as well as TP, FP, TN, FN to assess weather our filters are helpful

In [28]:
from sklearn.metrics import confusion_matrix, f1_score
from scipy.stats import kendalltau

def calculate_measures(expert_annotations, llm_annotations, threshold_llm=4, threshold_expert=4):
    tau, p_value = kendalltau(expert_annotations, llm_annotations)
    
    llm_annotations = [1 if y >= threshold_llm else 0 for y in llm_annotations]
    expert_annotations = [1 if y >= threshold_expert else 0 for y in expert_annotations]
    
    tn, fp, fn, tp = confusion_matrix(expert_annotations, llm_annotations).ravel() 
    f1 = f1_score(expert_annotations, llm_annotations)
    
    return tp, fp, tn, fn, f1, tau, p_value

## Utility functions

In [29]:
import re

def get_score(response: str, score: str = "Final"):
    match = re.search(rf"{score} Score:.*?\s*.*?(\d+)", response, re.DOTALL)
    if not match:
        raise ValueError(f"Score not found in the response: {response}")
    return int(match.group(1))

print(get_score("Final Score: 3/5"))

3


# Filtering

In [30]:
FILTERING_PROMPT = """
You are a strict legal expert judging ECHR legal question-answer pairs. The answer might be bad, so be strict!

Question: {question}
Potential Answer: {answer}

You MUST answer each question in full sentences!

The response MUST follow this template:
Comprehensiveness Analysis: {{Go through the answer and analyze how well it answers the question. Does is cover all angles of the question?}}
Comprehensiveness Score: {{A score from 1 (not comprehensive at all) to 5 (extremely comprehensive)}}
Conciseness: {{Is there any part in the answer irrelevant / unrelated to the question? If so, what is unneeded?}}
Conciseness Score: {{A score from 1 (not concise at all) to 5 (extremely concise)}}
Answer Fluency: {{Are there any bad sentence transitions in the answer? Are the sentences ordered correctly? Does the answer start with text clearly continuing previous text that is not there?}}
Answer Fluency Score: {{A score from 1 (not fluent) to 5 (perfectly fluent)}}
"""

In [8]:
from langchain_openai import ChatOpenAI
from langchain.prompts.prompt import PromptTemplate
import pandas as pd

df = pd.read_csv("data/annotated_qa_pairs.csv")

llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

expert_annotations = []
llm_annotations = []

for i, row in df.iterrows():
    question = row["question"]
    answer = row["answer"]

    prompt = PromptTemplate.from_template(FILTERING_PROMPT).format(
        question=question, answer=answer
    )

    result = llm.invoke(prompt)

    llm_fluency = get_score(result.content, "Answer Fluency")
    llm_conciseness = get_score(result.content, "Conciseness")
    llm_comprehensiveness = get_score(result.content, "Comprehensiveness")

    fluency = row["answer_fluency"]
    conciseness = row["conciseness"]
    comprehensiveness = row["comprehensiveness"]

    expert_annotation = min(fluency, conciseness, comprehensiveness)
    llm_annotation = min(llm_fluency, llm_conciseness, llm_comprehensiveness)

    print(f"Expert: {expert_annotation}, LLM: {llm_annotation}")

    df.loc[i, "expert_annotation_min"] = expert_annotation
    df.loc[i, "llm_annotation"] = llm_annotation
    df.loc[i, "llm_answer_fluency"] = get_score(result.content, "Answer Fluency")
    df.loc[i, "llm_conciseness"] = get_score(result.content, "Conciseness")
    df.loc[i, "llm_comprehensiveness"] = get_score(result.content, "Comprehensiveness")
    df.loc[i, "llm_response"] = result.content
    df.to_csv("data/annotated_qa_pairs_with_llm_annotations.csv", index=False)

    llm_annotations.append(llm_annotation)
    expert_annotations.append(expert_annotation)

    try:
        tp, fp, tn, fn, f1, tau, p_value = calculate_measures(expert_annotations, llm_annotations)

        print("EXPERT:", expert_annotations)
        print("LLM   :", llm_annotations)
        print(f"F1 Score: {f1}, True Positives: {tp}, False Positives: {fp}, True Negatives: {tn}, False Negatives: {fn}")
        print(f"Kendall's Tau: {tau}, p-value: {p_value}")
        print("Percentage good before filter:", (tp + fn) / (tp + fp + tn + fn))
        print("Percentage good after filter:", tp / (tp + fp))
    except Exception as e:
        print(str(e))

Expert: 2, LLM: 3
not enough values to unpack (expected 4, got 1)




Expert: 2, LLM: 2
not enough values to unpack (expected 4, got 1)




Expert: 2, LLM: 3
not enough values to unpack (expected 4, got 1)




Expert: 2, LLM: 5
EXPERT: [2, 2, 2, 2]
LLM   : [3, 2, 3, 5]
F1 Score: 0.0, True Positives: 0, False Positives: 1, True Negatives: 3, False Negatives: 0
Kendall's Tau: nan, p-value: nan
Percentage good before filter: 0.0
Percentage good after filter: 0.0
Expert: 2, LLM: 3
EXPERT: [2, 2, 2, 2, 2]
LLM   : [3, 2, 3, 5, 3]
F1 Score: 0.0, True Positives: 0, False Positives: 1, True Negatives: 4, False Negatives: 0
Kendall's Tau: nan, p-value: nan
Percentage good before filter: 0.0
Percentage good after filter: 0.0
Expert: 2, LLM: 1
EXPERT: [2, 2, 2, 2, 2, 2]
LLM   : [3, 2, 3, 5, 3, 1]
F1 Score: 0.0, True Positives: 0, False Positives: 1, True Negatives: 5, False Negatives: 0
Kendall's Tau: nan, p-value: nan
Percentage good before filter: 0.0
Percentage good after filter: 0.0
Expert: 2, LLM: 2
EXPERT: [2, 2, 2, 2, 2, 2, 2]
LLM   : [3, 2, 3, 5, 3, 1, 2]
F1 Score: 0.0, True Positives: 0, False Positives: 1, True Negatives: 6, False Negatives: 0
Kendall's Tau: nan, p-value: nan
Percentage good b

## Evaluation of results

In [26]:
import pandas as pd

df = pd.read_csv("data/annotated_qa_pairs_with_llm_annotations - annotated_qa_pairs_with_llm_annotations.csv")

df["expert_annotation_min"] = df[["answer_fluency", "conciseness", "comprehensiveness"]].min(axis=1)

tp, fp, tn, fn, f1, tau, p_value = calculate_measures(df["expert_annotation_min"], df["llm_annotation"], threshold_llm=4, threshold_expert=4)

print("MIN")
print(f"F1 Score: {f1}, True Positives: {tp}, False Positives: {fp}, True Negatives: {tn}, False Negatives: {fn}")
print(f"Kendall's Tau: {tau}, p-value: {p_value}")
print("Percentage good before filter:", (tp + fn) / (tp + fp + tn + fn))
print("Percentage good after filter:", tp / (tp + fp))
print()

tp, fp, tn, fn, f1, tau, p_value = calculate_measures(df["answer_fluency"], df["llm_answer_fluency"], threshold_llm=4, threshold_expert=4)

print("FLUENCY")
print(f"F1 Score: {f1}, True Positives: {tp}, False Positives: {fp}, True Negatives: {tn}, False Negatives: {fn}")
print(f"Kendall's Tau: {tau}, p-value: {p_value}")
print("Percentage good before filter:", (tp + fn) / (tp + fp + tn + fn))
print("Percentage good after filter:", tp / (tp + fp))
print()

tp, fp, tn, fn, f1, tau, p_value = calculate_measures(df["conciseness"], df["llm_conciseness"], threshold_llm=4, threshold_expert=4)

print("CONCISENESS")
print(f"F1 Score: {f1}, True Positives: {tp}, False Positives: {fp}, True Negatives: {tn}, False Negatives: {fn}")
print(f"Kendall's Tau: {tau}, p-value: {p_value}")
print("Percentage good before filter:", (tp + fn) / (tp + fp + tn + fn))
print("Percentage good after filter:", tp / (tp + fp))
print()

tp, fp, tn, fn, f1, tau, p_value = calculate_measures(df["comprehensiveness"], df["llm_comprehensiveness"], threshold_llm=4, threshold_expert=4)

print("COMPREHENSIVENESS")
print(f"F1 Score: {f1}, True Positives: {tp}, False Positives: {fp}, True Negatives: {tn}, False Negatives: {fn}")
print(f"Kendall's Tau: {tau}, p-value: {p_value}")
print("Percentage good before filter:", (tp + fn) / (tp + fp + tn + fn))
print("Percentage good after filter:", tp / (tp + fp))


MIN
F1 Score: 0.8364611260053619, True Positives: 156, False Positives: 35, True Negatives: 83, False Negatives: 26
Kendall's Tau: 0.600139370503958, p-value: 1.576726184121723e-31
Percentage good before filter: 0.6066666666666667
Percentage good after filter: 0.8167539267015707

FLUENCY
F1 Score: 0.9914821124361158, True Positives: 291, False Positives: 3, True Negatives: 4, False Negatives: 2
Kendall's Tau: 0.611003630633088, p-value: 4.2520713324491405e-27
Percentage good before filter: 0.9766666666666667
Percentage good after filter: 0.9897959183673469

CONCISENESS
F1 Score: 0.9354120267260579, True Positives: 210, False Positives: 5, True Negatives: 61, False Negatives: 24
Kendall's Tau: 0.7105821436416258, p-value: 5.891416699746008e-39
Percentage good before filter: 0.78
Percentage good after filter: 0.9767441860465116

COMPREHENSIVENESS
F1 Score: 0.9123505976095617, True Positives: 229, False Positives: 40, True Negatives: 27, False Negatives: 4
Kendall's Tau: 0.448607607202244