In [17]:
import ast
import os
import re

import openai
import pandas as pd

In [3]:
# Read the OpenAI key from a local file and publish it through the environment.
# The context manager closes the file deterministically, and strip() removes the
# trailing newline editors usually add, which would otherwise become part of the key.
with open('openai_key.txt', 'r') as key_file:
    key = key_file.read().strip()
os.environ["OPENAI_API_KEY"] = key

### GPT4 Question-Answer Pairs Generation (Ground Truth)

In [5]:
# Load summ_test.csv; later cells read its "name", "sentence" and "IRC_type" columns.
summ_test = pd.read_csv("summ_test.csv", sep=",")

In [6]:
def text_annotation(row):
    """Wrap a sentence in tags named after its IRC type.

    ``row`` is any mapping-like record exposing ``"IRC_type"`` and
    ``"sentence"``. A row typed ``"Non_IRC"`` yields its sentence unchanged;
    any other type ``T`` yields ``<T>sentence</T>``.
    """
    label = row["IRC_type"]
    if label == "Non_IRC":
        return row["sentence"]
    opening_tag = "<" + label + ">"
    closing_tag = "</" + label + ">"
    return opening_tag + row["sentence"] + closing_tag

In [7]:
# Tag every row's sentence (axis=1 hands each row to text_annotation) and store
# the result in a new column.
summ_test["annotated_sentence"] = summ_test.apply(text_annotation, axis=1)

In [8]:
# Collapse the sentence rows into one record per "name": both text columns are
# space-joined within each group and then renamed to their summary-level names.
summ_test_grouped = (
    summ_test
    .groupby("name")[["sentence", "annotated_sentence"]]
    .agg(lambda sentences: " ".join(sentences))
    .reset_index()
    .rename(columns={"sentence": "summary", "annotated_sentence": "annotated_summary"})
)

In [9]:
# Prompt template for question-answer generation. The single %s placeholder
# (between the triple backticks) is filled with the summary text by
# %-formatting inside qa_generation. The generation loop parses the model's
# reply with ast.literal_eval and sorts the entries by whether their key
# contains "Question" or "Answer" (anything else is treated as a type).
qa_prompt = """
    Act like a legal professional and read the following legal text that delimitered by three backticks. Use Issue, Reason and Conclusion sentences to generate question-answer pairs.

    List those generated questions and answers in the following format and put it in a json format:

    Question1:...
    Type1:...
    Answer1:...

    Question2:...
    Type2:...
    Answer2:...

    Question3:...
    Type3:...
    Answer3:...

    ```%s```
    """

In [14]:
def qa_generation(prompt_template, summary, model='gpt-4'):
    """Send the filled-in prompt to the chat model and return its reply text.

    ``summary`` is substituted into ``prompt_template`` with ``%`` formatting,
    sent as a single user message at temperature 0, and the content of the
    first returned choice is handed back.
    """
    filled_prompt = prompt_template % summary
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": filled_prompt}],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"]

In [18]:
# Generate ground-truth QA pairs for the first 15 grouped summaries and flatten
# them into parallel lists.
qs = []
ans = []
q_types = []
names = []
for idx, row in summ_test_grouped[:15].iterrows():
    summary = row["annotated_summary"]
    name = row["name"]
    response = qa_generation(qa_prompt, summary)
    # The prompt asks for a JSON object keyed Question<i>/Type<i>/Answer<i>;
    # the reply is parsed as a Python literal.
    literal_response = ast.literal_eval(response)
    # The loop variable is `field`, not `key`, so the `key` variable holding the
    # OpenAI API key is not overwritten.
    for field, value in literal_response.items():
        if "Question" in field:
            qs.append(value)
            # One name per question keeps `names` the same length as `qs`, which
            # the DataFrame built from the evaluation results requires.
            names.append(name)
        elif "Answer" in field:
            ans.append(value)
        else:
            q_types.append(value)

0
name                                                1987canlii2373.txt
summary              Warrant issued to search a dwelling house for ...
annotated_summary    <Issue>Warrant issued to search a dwelling hou...
Name: 0, dtype: object


### GPT4 Answer Prediction

In [20]:
from langchain.chains import LLMChain
from langchain.evaluation.qa import QAEvalChain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

In [21]:
# GPT-4 chat model at temperature 0; passed to both predict_answer and the
# answer-evaluation call further down.
llm = ChatOpenAI(model_name="gpt-4", temperature=0)

In [None]:
# Insert your own generated legal summary (csv format).
# The prediction loop reads these columns from every row:
# name, question, ground_truth_answer, IRC_type, model_summary.
generated_summaries = pd.read_csv("...csv")

In [None]:
def predict_answer(llm, context, q, ans, ttype, name):
    """Answer one question from the given context with the supplied LLM.

    Args:
        llm: language model handed to ``LLMChain``.
        context: text the answer should be based on.
        q: the question to answer.
        ans: not used by this function; kept so existing callers keep working.
        ttype: question type, interpolated into the prompt.
        name: not used by this function; kept so existing callers keep working.

    Returns:
        The ``'text'`` field of the chain's output for the single example.
    """
    QA_PROMPT = "Answer the {type} type of question based on the context\nContext:{context}\nQuestion:{question}\nAnswer:"
    template = PromptTemplate(input_variables=["type", "context", "question"], template=QA_PROMPT)
    qa_chain = LLMChain(llm=llm, prompt=template)
    # apply() takes a list of input dicts; exactly one is sent, so one result comes back.
    context_examples = [{"question": q, "context": context, "type": ttype}]
    pred = qa_chain.apply(context_examples)
    return pred[0]['text']

In [None]:
# Answer every question from its model-generated summary.
# Per-row values use a `row_` prefix so the loop does not overwrite the
# notebook-level `ans` list (the ground-truth answers), which the evaluation
# cell passes to the grader afterwards.
preds = []
for i, row in generated_summaries.iterrows():
    row_name = row['name']
    row_question = row['question']
    row_answer = row['ground_truth_answer']
    row_type = row['IRC_type']
    row_context = row['model_summary']
    pred = predict_answer(llm, row_context, row_question, row_answer, row_type, row_name)
    preds.append(pred)

### GPT4 Answer Evaluation

In [22]:
# Grading prompt for the evaluation chain. Placeholders: {query} is the
# question, {answer} the real answer and {result} the predicted answer being
# graded. The post-processing cell splits the model's reply on newlines and
# takes the first piece as the grade and the second as the explanation.
EVAL_PROMPT = """You are a legal expert to judge the answers to questions.
You are judging the following question:
{query}
Here is the real answer:
{answer}
You are grading the following predicted answer:
{result}
What grade do you give from 0 to 10, where 0 is the lowest (can't find the answer) and 10 is the highest (very close to the real answer)?

Finally, give then explain the reason of the grade in the following format:
Explanation: <explain>.

"""

In [23]:
def evaluate_answer(llm, prompt_template, qs, ans, types, names, preds):
    """Grade predicted answers against the real answers with an LLM judge.

    Args:
        llm: language model handed to ``QAEvalChain``.
        prompt_template: grading prompt containing the ``{query}``,
            ``{answer}`` (real answer) and ``{result}`` (predicted answer)
            placeholders.
        qs: questions.
        ans: real (ground-truth) answers, aligned with ``qs``.
        types: question types, passed through to the result unchanged.
        names: source names, passed through to the result unchanged.
        preds: predicted answers, aligned with ``qs``. Each item may be a plain
            string or a dict holding the answer under ``'text'``.

    Returns:
        A dict of column lists with keys ``name``, ``question``, ``IRC_type``,
        ``real_answer`` (ground truth), ``summary_answer`` (prediction) and
        ``grade_output`` (raw judge reply).

    Raises:
        ValueError: if ``qs``, ``ans`` and ``preds`` differ in length.
    """
    if not (len(qs) == len(ans) == len(preds)):
        raise ValueError(
            "qs, ans and preds must have the same length, got %d, %d and %d"
            % (len(qs), len(ans), len(preds))
        )

    # Accept both raw chain outputs ({'text': ...}) and plain answer strings.
    pred_texts = [p['text'] if isinstance(p, dict) else p for p in preds]

    # The real answer travels with the example (it fills {answer}); the predicted
    # answer goes in the predictions list (it fills {result}), matching the roles
    # the grading prompt assigns to the two placeholders.
    examples = [{"question": q, "answer": a} for q, a in zip(qs, ans)]
    predictions = [{"text": t} for t in pred_texts]

    PROMPT = PromptTemplate(input_variables=["query", "answer", "result"], template=prompt_template)
    eval_chain = QAEvalChain.from_llm(llm, prompt=PROMPT)
    graded_outputs = eval_chain.evaluate(
        examples,
        predictions,
        question_key="question",
        answer_key="answer",
        prediction_key="text",
    )

    returned_dict = {"name": names,
                     "question": [eg["question"] for eg in examples],
                     "IRC_type": types,
                     "real_answer": [eg["answer"] for eg in examples],
                     "summary_answer": pred_texts,
                     "grade_output": [g["text"] for g in graded_outputs]
                     }
    return returned_dict

In [None]:
# post-process the returned result
# (calls evaluate_answer with the `q_types` list built during QA generation)
res_dict = evaluate_answer(llm, EVAL_PROMPT, qs, ans, q_types, names, preds)
df = pd.DataFrame(res_dict)
# Split the judge's reply on single or double newlines: the first piece is the
# grade, the second the explanation. Surrounding whitespace is stripped first so
# a leading blank line does not become an empty "grade".
df['grade_output'] = df['grade_output'].apply(lambda x: re.split(r'\n{1,2}', x.strip()))
df['grade'] = df['grade_output'].apply(lambda x: x[0])
# A reply without a second line gets an empty explanation instead of an IndexError.
df['explanation'] = df['grade_output'].apply(lambda x: x[1] if len(x) > 1 else "")