SETUP

In [None]:
# %pip install trulens-eval==0.12.0 llama_index==0.8.4 pymilvus==2.3.0 nltk==3.8.1 html2text==2020.1.16 tenacity==8.2.3 --quiet
# %pip install wikipedia transformers sentence-transformers --quiet
# %pip install openai==0.28

In [None]:
import os
import pickle

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from llama_index import Document, VectorStoreIndex, ServiceContext
from llama_index.llms import OpenAI
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores import MilvusVectorStore
from tenacity import retry, stop_after_attempt, wait_exponential
from trulens_eval import Tru, TruLlama, Feedback
from trulens_eval import feedback
from trulens_eval.feedback import Groundedness

In [None]:
# Example prompts. Every active entry is run through the whole pipeline below.
# NOTE: the entries carry no trailing commas. Before enabling more than one
# prompt, add a comma after each string; otherwise Python joins adjacent string
# literals into a single prompt.
PROMPTS = [
    "I am arguing a case in front of Judge Callahan soon. My client believes his first and fifth amendment rights were violated. What arguments about this issue has Judge Callahan found compelling in other cases?"
    # "What are the primary issues and concerns that the court has dealt with when it comes to cases brought forth by tribal nations?"
    # "What are the prevailing opinions and thoughts regarding the possession of a firearm in the Missoula district court?"
    # "My client crossed state lines to perform an illegal action. Based on USA vs. Michael Pepe, what are some arguments I should make?"
    # "What do past cases in this district indicate about the prevailing belief/precedent in regards to the death penalty?"
    # "I plan to challenge the constitutionality of a current law under the Second Amendment. What arguments have been effective or convincing on this issue before?"
    # "I am arguing a case against a technology company. What do past cases indicate regarding key concerns when arguing against a company?"
    # "I am arguing a free speech/public fora case. My client tried distributing tokens in a Free Speech Zone. What have past cases said about this? Mention the names of these cases specifically."
    # "I am arguing in front of Judge Sanchez soon. How has he previously voted in immigration appeal cases?"
]

# Filled by the synthesis cell, which appends one final answer string per prompt.
generated_responses = []

# Instructions added to the user prompt for each per-case query.
# The "NO RESPONSE" sentinel requested here is the marker the synthesis cell
# checks for when it drops per-case answers.
PER_CASE_PROMPT_ADDENDUM = """Please mention the name and date of any relevant cases in your response. 
                If no relevant information is found in the context, set [response] below to "NO RESPONSE".
                    Format responses, as so:
                            RESPONSE: [response]
                            SOURCE: [case_name of the document the response was formulated from]
                    Also, the audience consists of lawyers, so use legal jargon and a formal tone. Your final answer should be under 3000 characters."""
# Instructions added to the user prompt for the final synthesis query.
SYNTHESIZED_PROMPT_ADDENDUM = "In your answer, focus on the reasoning and principles used in prior cases. Also mention how cases cited are relevant to the case mentioned in the user's query. Integrate the name of the sources used into your response."


# Pull variables from a local .env file into the process environment.
load_dotenv()

# This project stores the key under OPEN_AI_KEY; it is copied to OPENAI_API_KEY
# for the OpenAI-backed components used below. If only OPENAI_API_KEY is already
# set, it is used as-is. Fail early with a readable message instead of the
# TypeError that assigning None to os.environ would raise.
openai_api_key = os.getenv("OPEN_AI_KEY") or os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise EnvironmentError(
        "No OpenAI API key found: set OPEN_AI_KEY (or OPENAI_API_KEY) in the "
        "environment or in a .env file."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

CREATE DOCUMENTS FOR EACH CASE SUMMARY

In [None]:
# Create one Document per case: the case summary is the document text and the
# remaining columns are attached as metadata.

# NOTE(security): unpickling can execute arbitrary code, so only load a
# cases.pkl that comes from a trusted source.
cases_df = pd.read_pickle("./cases.pkl")


def _first_present(row, *columns):
    """Return the value of the first name in `columns` that exists in `row`.

    Args:
        row: a pandas Series (one row of cases_df).
        *columns: candidate column names, in order of preference.

    Raises:
        KeyError: if none of the candidate columns is present.
    """
    for column in columns:
        if column in row.index:
            return row[column]
    raise KeyError(f"cases_df has none of the expected columns: {columns}")


documents = []
for _, row in cases_df.iterrows():
    # Every other access in this notebook uses space-separated column names
    # ("case name", "authoring judge"), so the underscore spellings used for the
    # text template fall back to those instead of raising KeyError.
    # NOTE(review): "case summary" is the assumed spaced name of the summary
    # column — confirm against cases.pkl.
    doc_text = """
    CASE NAME: {}
    JUDGE: {}
    CASE SUMMARY: {}    
    """.format(
        _first_present(row, "case_name", "case name"),
        _first_present(row, "judge", "authoring judge"),
        _first_present(row, "case_summary", "case summary"),
    )
    doc = Document(
        text=doc_text,
        metadata={
            'case_name': row["case name"],
            'case_number': row["case number"],
            'case_origin': row["case origin"],
            'judge': row["authoring judge"],
            'case_type': row["case type"],
            'date': row["date filed"],
        },
    )
    documents.append(doc)

INITIALIZE VECTOR STORE

In [None]:
# Models shared by every index built in this notebook, both constructed with
# their default arguments.
embed_model = OpenAIEmbeddings()
llm = OpenAI()

In [None]:
# Milvus vector store that backs the case-summary index.
# NOTE(review): nprobe is normally a search-time setting for IVF indexes, yet it
# is passed with the index parameters while search_params stays empty — confirm
# this is intended.
summary_index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 1024, "nprobe": 100},
}

vector_store = MilvusVectorStore(
    index_params=summary_index_params,
    search_params={},
    overwrite=True,
)

In [None]:
# Build the index over the case-summary documents, backed by the Milvus store.
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    service_context=service_context,
)

In [None]:
# Initialize TruLens.
tru = Tru()

# Feedback provider built on gpt-3.5-turbo. `feedback` is the
# trulens_eval.feedback module, imported at the top of the notebook; the bare
# `OpenAI` name in this notebook is llama_index's LLM class, a different object.
openai_gpt35 = feedback.OpenAI(model_engine="gpt-3.5-turbo")

# Groundedness helper that uses the same provider for its judgements.
grounded = Groundedness(groundedness_provider=openai_gpt35)

In [None]:
# Answer relevance, evaluated on the app's input and output.
f_answer_relevance = Feedback(
    openai_gpt35.relevance_with_cot_reasons,
    name="Answer Relevance",
).on_input_output()

In [None]:
# Groundedness: evaluated on the text of the source nodes (the context supplied
# with the query) and on the app's output, then aggregated with the
# groundedness helper's own aggregator.
groundedness_context = TruLlama.select_source_nodes().node.text
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name="Groundedness")
    .on(groundedness_context)
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

In [None]:
# Context relevance: evaluated on the app's input and the text of each source
# node, with the per-node results averaged.
f_context_relevance = (
    Feedback(openai_gpt35.qs_relevance_with_cot_reasons, name="Context Relevance")
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

CHOOSE TOP K RELEVANT CASE SUMMARIES

In [None]:
# Two views over the case-summary index: the retriever is used below to pick the
# candidate cases, the query engine is the one wrapped by TruLens.
retriever = index.as_retriever(similarity_top_k=5)
query_engine = index.as_query_engine(similarity_top_k=5)

# Metadata passed to the TruLlama wrapper for this stage.
summary_stage_metadata = {
    'index_param': '',
    'embed_model': "top k cases",
    'top_k': 5,
    'chunk_size': 200,
}

# Initialize a TruLlama wrapper to connect the evaluation metrics with the query engine.
tru_query_engine = TruLlama(
    query_engine,
    feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance],
    metadata=summary_stage_metadata,
)
    
@retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(10))
def call_tru_retriever_engine(prompt):
    """Retrieve the scored nodes for `prompt` using the notebook-level `retriever`.

    Wrapped in a tenacity retry: at most 10 attempts, with an exponential wait
    configured as multiplier=1, min=4, max=10.
    """
    scored_nodes = retriever.retrieve(prompt)
    return scored_nodes

# Only cases whose retrieval score is below this value are kept.
# NOTE(review): this reads the score as a distance (smaller = closer), which
# matches the "L2" metric_type configured for the vector store; if the store
# reports a similarity instead, the comparison must be flipped — confirm.
MAX_RETRIEVAL_SCORE = 0.4

# Maps each prompt to the names of the cases selected for it.
prompt_to_cases_dict = {}
for prompt in PROMPTS:
    print(f"Prompt: {prompt}")
    selected_cases = []
    prompt_to_cases_dict[prompt] = selected_cases
    for scored_node in call_tru_retriever_engine(prompt):
        case_name = scored_node.node.metadata["case_name"]
        print("case name: {}, score: {}".format(case_name, str(scored_node.score)))
        if scored_node.score < MAX_RETRIEVAL_SCORE:
            selected_cases.append(case_name)

RETRIEVE FULL OPINION TEXT AND GENERATE RESPONSE FOR EACH TOP K CASE
- Place each case in a separate vector DB
- Use RAG to generate a response to the prompt from only this vector DB

In [None]:
@retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(10))
def call_tru_query_engine(tru_query_engine, prompt):
    """Send `prompt` through the given TruLlama-wrapped query engine and return its response.

    Wrapped in a tenacity retry: at most 10 attempts, with an exponential wait
    configured as multiplier=1, min=4, max=10.
    """
    response = tru_query_engine.query(prompt)
    return response

# For every prompt, answer it once per selected case, using an index that holds
# only that case's full opinion text.

# The service context does not depend on the case, so it is built once here.
# Case-specific names are used throughout so this cell does not rebind the
# notebook-level `index` / `query_engine` / `tru_query_engine` created earlier.
per_case_service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

# Maps each prompt to the list of per-case responses (one per selected case).
responses = {}
for prompt, cases in prompt_to_cases_dict.items():
    # Separate the user's question from the instructions; plain concatenation
    # would glue the two sentences together ("...cases?Please mention...").
    per_case_prompt = f"{prompt} {PER_CASE_PROMPT_ADDENDUM}"
    prompt_position = PROMPTS.index(prompt)
    responses[prompt] = []
    for case in cases:
        row = cases_df.loc[cases_df["case name"] == case].iloc[0]

        # Document holding the full opinion text of this single case.
        case_document = Document(
            text=row["opinion text"],
            metadata={
                'case_name': row["case name"],
                'case_number': row["case number"],
                'case_origin': row["case origin"],
                'judge': row["authoring judge"],
                'case_type': row["case type"],
                'date': row["date filed"],
            },
        )

        # NOTE(review): every MilvusVectorStore in this notebook is created
        # without a collection name and with overwrite=True, so presumably each
        # new store replaces the previous contents, which is what would keep
        # this index limited to one opinion — confirm.
        case_vector_store = MilvusVectorStore(
            index_params={
                "index_type": "IVF_FLAT",
                "metric_type": "L2",
                "params": {"nlist": 1024, "nprobe": 100},
            },
            search_params={},
            overwrite=True,
        )
        case_storage_context = StorageContext.from_defaults(vector_store=case_vector_store)
        case_index = VectorStoreIndex.from_documents(
            [case_document],
            service_context=per_case_service_context,
            storage_context=case_storage_context,
        )
        case_query_engine = case_index.as_query_engine(similarity_top_k=5)

        # Query RAG to get the response to the prompt for this case.
        case_tru_query_engine = TruLlama(
            case_query_engine,
            feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance],
            metadata={
                'index_param': f"Prompt index: {prompt_position}, case name: {row['case name']}",
                'embed_model': "per case response",
                'top_k': 5,
                'chunk_size': 1000,
            },
        )
        responses[prompt].append(
            call_tru_query_engine(case_tru_query_engine, per_case_prompt)
        )


USE RAG TO SYNTHESIZE THE PER-CASE RESPONSES (UP TO 5) INTO 1 FINAL RESPONSE

In [None]:
# Synthesize the per-case answers of each prompt into one final response.

# Start from an empty list so that re-running this cell does not append
# duplicates; the pickling cell relies on one entry per prompt, in order.
generated_responses.clear()

# The service context is the same for every prompt, so it is built once.
synthesis_service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

for prompt, per_case_responses in responses.items():
    # Keep only the per-case answers that found something relevant. A dedicated
    # name avoids overwriting the `documents` list of case summaries.
    synthesis_documents = [
        Document(text=str(response))
        for response in per_case_responses
        if "NO RESPONSE" not in str(response)
    ]
    if not synthesis_documents:
        # Nothing to synthesize from (no case passed the retrieval filter, or
        # every case answered "NO RESPONSE"). Record a placeholder so that
        # generated_responses still has exactly one entry per prompt.
        generated_responses.append("NO RESPONSE")
        continue

    synthesis_vector_store = MilvusVectorStore(
        index_params={
            "index_type": "IVF_FLAT",
            "metric_type": "L2",
            "params": {"nlist": 1024, "nprobe": 100},
        },
        search_params={},
        overwrite=True,
    )
    synthesis_storage_context = StorageContext.from_defaults(vector_store=synthesis_vector_store)
    synthesis_index = VectorStoreIndex.from_documents(
        synthesis_documents,
        service_context=synthesis_service_context,
        storage_context=synthesis_storage_context,
    )
    synthesis_query_engine = synthesis_index.as_query_engine(similarity_top_k=5)

    synthesis_tru_query_engine = TruLlama(
        synthesis_query_engine,
        feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance],
        metadata={
            'index_param': '',
            'embed_model': "synthesis",
            'top_k': 5,
            'chunk_size': 3000,
        },
    )
    # Separate the user's question from the instructions; plain concatenation
    # would glue the two sentences together.
    query_response = call_tru_query_engine(
        synthesis_tru_query_engine, f"{prompt} {SYNTHESIZED_PROMPT_ADDENDUM}"
    )
    generated_responses.append(str(query_response))

TRUERA EVALUATION

In [None]:
# Start the TruLens dashboard for the `tru` session created earlier.
tru.run_dashboard()

In [None]:
# Echo every prompt, each followed by a separator line.
for prompt_text in PROMPTS:
    print(prompt_text, "----", sep="\n")

PICKLE FOR BREADTH EVALUATION

In [None]:
# Map each synthesized response to the per-case responses it was built from.
# `responses` is keyed by prompt, so the prompt — not the response text — must be
# used for the lookup. The synthesis cell appends one entry to
# generated_responses per key of `responses`, in iteration order, so the two are
# paired by position.
response_to_summaries = {
    str(final_response): [str(case_response) for case_response in responses[prompt]]
    for prompt, final_response in zip(responses, generated_responses)
}

# Persist the mapping for the breadth evaluation.
with open("generated_responses.pickle", "wb") as handle:
    pickle.dump(response_to_summaries, handle, protocol=pickle.HIGHEST_PROTOCOL)