In [1]:
# imports 
from dotenv import load_dotenv
import streamlit as st
import os

from langchain_community.vectorstores import VectorStore, FAISS, Chroma
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, PyPDFDirectoryLoader
from langchain_community.llms import HuggingFaceEndpoint

from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import GPT4AllEmbeddings

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.chains import QAGenerationChain, ConversationalRetrievalChain
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain.memory import ConversationBufferMemory
from langchain.schema.runnable import RunnablePassthrough

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.output_parsers import StrOutputParser



In [9]:
# retriever - llm
def get_retriever(loader, chunk_size=100, chunk_overlap=10):
    """Build a FAISS-backed retriever from the documents a loader yields.

    Args:
        loader: Object exposing ``load()``; its result is passed to the splitter.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
            Defaults to 100, the value this function always used.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 10.

    Returns:
        Whatever ``FAISS.from_documents(...).as_retriever()`` returns.

    Raises:
        ValueError: If loading and splitting produce no chunks at all.
    """
    # get documents from data folder
    documents = loader.load()

    # split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    texts = text_splitter.split_documents(documents)
    if not texts:
        # Fail early with a readable message instead of an obscure error from
        # the vector store when there is nothing to index.
        raise ValueError("No text chunks to index: the loader returned no usable documents.")

    # create retriever
    embeddings = GPT4AllEmbeddings()
    vector_store = FAISS.from_documents(texts, embeddings)
    return vector_store.as_retriever()

# Build the module-level `retriever` from the PDFs in the relative 'data' directory.
# Both names are rebound by later cells (`loader`, `retriever`).
loader = PyPDFDirectoryLoader('data')
retriever = get_retriever(loader)



Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\moolhuijsenns\.cache\huggingface\token
Login successful


In [3]:
from langchain_core.output_parsers import ListOutputParser, CommaSeparatedListOutputParser, JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from operator import itemgetter
from typing import List
from langchain_core.runnables import RunnableLambda, RunnableParallel

# Module-level report dict; nothing in this cell populates it.
report = dict()


def get_report():
    """Return the module-level ``report`` dict itself (not a copy)."""
    return report

# Statement schema: the full statement text plus one list of strings per
# W-category (what, when, where, who, how). It is handed to JsonOutputParser
# as `pydantic_object`, so the Field descriptions below are runtime text.
# NOTE(review): documented with `#` comments on purpose — a class docstring
# would presumably be emitted into the model's schema and therefore into the
# format instructions sent to the LLM; verify before adding one.
class Statement(BaseModel):
    statement: str = Field(description="The whole statement")
    what: List[str] = Field(description="What happened")
    when: List[str] = Field(description="Date and time of the incident")
    where: List[str] = Field(description="Where it happened")
    who: List[str] = Field(description="Who was involved")
    how: List[str] = Field(description="How it happened")

# extract information into the w-categories
# `parser` is used twice: its format instructions are injected into the prompt,
# and it is the last step of the `summarizer` chain.
parser = JsonOutputParser(pydantic_object=Statement)
# Only {statement} is supplied at invoke time; {formatting_instructions} is
# pre-filled once from the parser through `partial_variables`.
prompt1 = PromptTemplate(
    template= """ Create a categorized statement following the format: {formatting_instructions}.
    'statement' contains the whole statement {statement}. All information should be categorized into 'what', 'when', 'where', 'who', and 'how'.
    Add all details to the categories. Categories can be empty and they may contain overlapping information.""",
    input_variables=["statement"],
    partial_variables={"formatting_instructions": parser.get_format_instructions()},
)

# Load variables from a .env file before creating the endpoint client.
# NOTE(review): presumably this supplies the Hugging Face API token — confirm.
load_dotenv()
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.2")

# prompt -> LLM -> JSON parser; the recorded output of this cell is a plain
# dict with the keys statement/what/when/where/who/how.
summarizer = prompt1 | llm | parser 
# Sample witness statement used to try the chain out.
partial = """
On the night of March 18th, 2024, around 8:30 PM, I saw a guy rob a girl outside the convenience store on Elm Street. He was tall, kinda skinny,
 looked like he was in his 30s, wearing a black hoodie and jeans with a backpack. The girl seemed young, maybe in her early 20s, wearing a red 
 jacket and jeans with long blonde hair. So, this guy walks up to her, pulls out what looked like a knife, and tells her to hand over her purse. 
 She looked scared, so she gave it to him, and he took off running down Elm Street.
"""
summarizer.invoke({"statement": partial})




Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\moolhuijsenns\.cache\huggingface\token
Login successful


{'statement': 'On the night of March 18th, 2024, around 8:30 PM, I saw a guy rob a girl outside the convenience store on Elm Street. He was tall, kinda skinny, looked like he was in his 30s, wearing a black hoodie and jeans with a backpack. The girl seemed young, maybe in her early 20s, wearing a red jacket and jeans with long blonde hair. So, this guy walks up to her, pulls out what looked like a knife, and tells her to hand over her purse. She looked scared, so she gave it to him, and he took off running down Elm Street.',
 'what': ['A guy robbed a girl',
  'The guy pulled out a knife',
  'The girl gave the guy her purse'],
 'when': ['March 18th, 2024', 'around 8:30 PM'],
 'where': ['outside the convenience store', 'Elm Street'],
 'who': ['a guy', 'the girl'],
 'how': ['the guy pulled out a knife and told the girl to hand over her purse']}

In [None]:
# asks questions for missing categories
# prompt = ChatPromptTemplate.from_template(""" 
# You are an interviewer for crime investigations. A witness has given a statement {statement}, if it is incomplete you will
# ask the witness one question for each missing category: {missing_categories}.

# If it is complete, say 'Thank you for your response'.       
# """)


# bot_response = full_chain.invoke({"statement": user_response})
# if "Thank you for your response." in bot_response:
#     all_parts = True
# return bot_response, all_parts, 
# print(answer1)

# details_dict = {
#     "what": itemgetter("what"),
#     "when": itemgetter("when"),
#     "where": itemgetter("where"),
#     "who": itemgetter("who"),
#     "how": itemgetter("how"),
# }
# Mapping step exposing the check_lists() result under the "missing" key.
# The lambda defers the name lookup until call time, which is why check_lists
# can be defined further down in this cell.
runnable = dict(missing=lambda details: check_lists(details))
# check if all categories are filled
def check_lists(details_dict):
    """Report which statement categories are still empty.

    Args:
        details_dict: Mapping of category name to collected details. The
            "statement" key is ignored. ``None`` is treated as an empty mapping.

    Returns:
        ``"the missing categories are: a, b"`` when at least one category is
        empty or absent, otherwise ``"All categories are filled."``.
        Categories present but empty are listed first, in mapping order;
        expected categories that are absent altogether follow.
    """
    expected = ("what", "when", "where", "who", "how")
    details = details_dict or {}

    # Present-but-empty entries, in the order the mapping provides them.
    missing = [
        key for key, value in details.items()
        if key != "statement" and not value
    ]
    # A category left out entirely is just as missing as an empty one.
    missing.extend(key for key in expected if key not in details)

    if missing:
        return "the missing categories are: " + ", ".join(missing)
    return "All categories are filled."
# Prompt that merges a freshly categorized statement with whatever
# get_report() returns; the output format is dictated by `parser`.
combine_prompt = PromptTemplate(
    template="""You will be given two categorized statements {categorized_statement} and {report}. Combine these into one complete statement.
Formatting Instructions: {formatting_instructions}.""",
    input_variables=["categorized_statement", "report"],
    partial_variables={"formatting_instructions": parser.get_format_instructions()},
)

# statement -> categorized dict -> merged with the report -> parsed dict
combined_statement = (
    {"categorized_statement": summarizer, "report": lambda x: get_report()}
    | combine_prompt
    | llm
    | parser
)

# Interviewer prompt for the follow-up step. It consumes exactly the two keys
# produced by the mapping in `full_chain`; `combine_prompt` expects
# {categorized_statement} and {report} and cannot be fed from that mapping.
missing_prompt = ChatPromptTemplate.from_template("""
You are an interviewer for crime investigations. A witness has given a statement {statement}, if it is incomplete you will
ask the witness one question for each missing category: {missing_categories}.

If it is complete, say 'Thank you for your response'.
""")

# statement -> categorized dict -> (missing-category summary, original text)
# -> interviewer prompt -> raw LLM text
full_chain = (
    summarizer
    | {
        # check_lists returns a plain sentence, which reads naturally in the prompt.
        "missing_categories": RunnableLambda(check_lists),
        "statement": itemgetter("statement"),
    }
    | missing_prompt
    | llm
)

In [None]:
# Second retriever, built from a single PDF rather than the 'data' directory.
# NOTE(review): the file name is capitalised here ('Information.pdf') while
# Interviewer.get_response loads 'information.pdf' — confirm which spelling
# matches the file on disk (matters on case-sensitive file systems).
loader = PyPDFLoader('Information.pdf')
information = get_retriever(loader)
# Prompt text used by chain_memory; it has the slots {statement} and {chat_history}.
template = "create a follow up question for the statement {statement}. Take the chat history {chat_history} into account."

def get_whole_conv(user_response, chat_history):
    """Flatten the chat history and append the newest user response.

    Every value of every dict in ``chat_history`` is concatenated in order,
    without a separator, and ``user_response`` is appended at the end.
    """
    pieces = []
    for entry in chat_history:
        pieces.extend(entry.values())
    return ''.join(pieces) + user_response

# Follow-up question chain: forwards the statement and chat history to the
# prompt and, in parallel, queries the `information` retriever with the whole
# conversation text.
# NOTE(review): `template` has no {information} slot, so the retrieved
# documents are computed but never shown to the model — confirm whether the
# prompt should include them.
chain_memory = (
    {
        "statement": itemgetter("statement"),
        "chat_history": itemgetter("chat_history"),
        # The lambda postpones get_whole_conv until invoke time, when the real
        # input dict is available; calling it while building the chain would
        # hand it an itemgetter and a function instead of actual values.
        "information": RunnableLambda(
            lambda inputs: get_whole_conv(inputs["statement"], inputs["chat_history"])
        ) | information,
    }
    | ChatPromptTemplate.from_template(template)  # "context": retriever,
    | llm
    | StrOutputParser()
)

# response = chain.invoke(user_response)
chain_memory.invoke({"statement": "hey my name is Peter", "chat_history": []})  # {"statement": user_response, "chat_history": chat_history}

Mendez check

In [3]:
# Stand-alone rebuild of the 'data' index: the same steps and settings as
# get_retriever(), but every intermediate object stays at module level.
loader = PyPDFDirectoryLoader('data')
documents = loader.load()

# Chunking settings match get_retriever(): chunk_size=100, chunk_overlap=10.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=10, chunk_size=100)
texts = text_splitter.split_documents(documents)

# Embed the chunks and expose the vector store as a retriever.
embeddings = GPT4AllEmbeddings()
vector_store = FAISS.from_documents(documents=texts, embedding=embeddings)
retriever = vector_store.as_retriever()


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\moolhuijsenns\.cache\huggingface\token
Login successful


In [60]:
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.2")
# llm = HuggingFaceEndpoint(repo_id="google/gemma-7b")

# The prompt carries a {context} slot so the retrieved passages actually reach
# the model; without it the "context" entry of the chain input is ignored and
# the answer comes from the model alone.
template = """Give an answer to a question regarding the PEACE investigative interview approach.
Base the answer on the context below.
Context: {context}
Question: {question}
Answer: 
"""
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template
)


def _format_docs(docs):
    """Join the page contents of the retrieved documents into one text block."""
    return "\n\n".join(doc.page_content for doc in docs)


# question -> {retrieved context as text, question} -> prompt -> LLM -> str
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

chain.invoke("What is the struncture of PEACE?")


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\moolhuijsenns\.cache\huggingface\token
Login successful


'PEACE is an acronym for the following elements of the investigative interviewing approach:\n1. Planning and Preparation: This stage involves gathering all relevant information about the case, the interviewee, and the interview environment to increase the chances of a successful interview.\n2. Engage and Explore: This stage is about building rapport and trust with the interviewee to create a safe and open environment for the interview. It involves active listening, empathy, and open-ended questions.\n3. Account: This stage is about obtaining a detailed and accurate account of the incident from the interviewee. It involves asking clear, direct, and specific questions to clarify any inconsistencies or gaps in the information provided.\n4. Closure: This stage is about bringing the interview to a close in a respectful and appropriate manner. It involves summarizing the key points of the interview, providing feedback, and ensuring that the interviewee understands the next steps in the inves

In [47]:
from langchain.chains import LLMChain, ConstitutionalChain
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
# Principles for ConstitutionalChain. Each one pairs a `critique_request` with
# a `revision_request`. In the cells visible here only plain_language,
# avoid_ambiguity and elaboration are passed to a chain; the others are
# defined but not referenced.

# --- wording of the question ---
plain_language = ConstitutionalPrinciple(
    name="Plain Language",
    critique_request="""Use clear and straightforward language that is easy for the witness to understand. Avoid technical jargon, complex terminology, 
or legalistic language that may be confusing or intimidating.""",
    revision_request="Rewrite the model's response in plain language",
)
avoid_ambiguity = ConstitutionalPrinciple(
    name="Avoid Ambiguity",
    critique_request="""Be precise and specific in your communication to minimize misunderstandings. Avoid ambiguous phrases or vague language that 
could be interpreted in different ways.""",
    revision_request="Rewrite the model's response to avoid ambiguity",
)
# --- what the question should elicit ---
elaboration = ConstitutionalPrinciple(
    name="Elaboration",
    critique_request="""Encourage the witness to provide detailed accounts by asking open-ended questions that invite them to elaborate on their experiences, 
thoughts, and feelings. Avoid interrupting or rushing the witness's responses.""",
    revision_request= "Rewrite the model's response to prompt further elaboration",
)
neutral_phrasing = ConstitutionalPrinciple(
    name="Neutral phrasing",
    critique_request="""Frame questions in a neutral and non-leading manner to avoid influencing the witness's responses.""",
    revision_request="Rewrite the model's response using neutral phrasing",
)
sequential_recall = ConstitutionalPrinciple(
    name="Sequential recall",
    critique_request="""Guide witnesses through a chronological sequence of events to ensure a comprehensive account.""",
    revision_request="Rewrite the model's response to prompt sequential recall",
)
specificity = ConstitutionalPrinciple(
    name="Specificity",
    critique_request="""Seek specific details and descriptions from the witness to enhance the accuracy and completeness of their testimony.""",
    revision_request="Rewrite the model's response to prompt for specific details",
)
clarification = ConstitutionalPrinciple(
    name="Clarification",
    critique_request="""If a witness's response is unclear or ambiguous, use probing questions to seek clarification and encourage further elaboration.""",
    revision_request="Rewrite the model's response to seek clarification",
)
# NOTE(review): avoid_compound and one_question (below) both target
# multi-question outputs — confirm whether both are needed.
avoid_compound = ConstitutionalPrinciple(
    name="Avoid compound",
    critique_request="""Avoid combining multiple questions into one, as it can confuse the witness and lead to incomplete or 
    inaccurate responses.""",
    revision_request="Rewrite the model's response to avoid compound questions",
)
empathetic = ConstitutionalPrinciple(
    name="Empathetic",
    critique_request="""Show empathy and sensitivity when asking about emotionally charged or traumatic experiences, allowing witnesses to share their 
feelings and perspectives.""",
    revision_request= "Rewrite the model's response to show empathy",
)

# NOTE(review): looks like a demo principle unrelated to interviewing —
# confirm whether it is still needed.
haiku = ConstitutionalPrinciple(
    name="haiku",
    critique_request="""The model should repond with a haiku""",
    revision_request="If the output is not a haiku, rewrite the model's response to be a haiku",
)
one_question = ConstitutionalPrinciple(
    name="One Question",
    critique_request="Ask only one question at a time to avoid overwhelming the witness with multiple inquiries.",
    revision_request="Rewrite the model's response to ask only one question. The output should contain a single question.",
)
# Catch-all principle that refers to the Mendez Principles by name only.
Mendez = ConstitutionalPrinciple(
    name="Mendez Principles",
    critique_request="Follow the Mendez Principles for investigative interviewing.",
    revision_request="Rewrite the model's response to follow the Mendez Principles",
)



In [58]:

# Interviewer prompt with the slots {statement} and {chat_history}.
# NOTE: this rebinds the module-level `template` name, which
# Interviewer.get_response also reads when it builds its prompt.
template = """
You are an interviewer for crime investigations.
You are interviewing a witness that has given the following statement:
{statement}

Given the statement and the chat history {chat_history}, ask a follow-up question to prompt further elaboration from the witness.
Your question must follow the Mendez Principles for investigative interviewing.
Ask one question. Do not provide anything else.
Answer:
""" 
qa_prompt = PromptTemplate(
    template=template,
    input_variables=["statement", "chat_history"],
)
# Base chain that drafts the follow-up question.
qa_chain = LLMChain(llm=llm, prompt=qa_prompt)

# Wraps the base chain with three of the principles defined in the previous
# cell; the same `llm` is used for drafting and for the constitutional pass.
constitutional_chain = ConstitutionalChain.from_llm(
    llm=llm,
    chain=qa_chain,
    constitutional_principles=[plain_language, avoid_ambiguity, elaboration],
)

# Smoke test. chat_history is passed as a plain string here; the recorded
# output is a dict holding both inputs plus an 'output' key.
constitutional_chain.invoke({"statement": "It was today at 9 at night.", "chat_history": "I saw a robbery. When was it?"})

{'statement': 'It was today at 9 at night.',
 'chat_history': 'I saw a robbery. When was it?',
 'output': 'Can you please provide a more detailed description of the events you observed at 9 PM tonight? What specifically did you see or hear that made you believe a robbery had occurred?'}

In [None]:
# old response

class Interviewer:
    """Two-stage responder built on a Hugging Face endpoint.

    ``get_response`` first runs a prompt chain over the user's statement and
    chat history (with a retrieval step over 'information.pdf'), then passes
    that draft through a ``ConstitutionalChain`` that rewrites it.
    """

    def __init__(self):
        self.llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.2")
        self.chat_history = []
        # Retriever over 'information.pdf'; built on the first get_response()
        # call and reused afterwards so the PDF is not re-embedded every turn.
        self._information = None

    def get_retriever(self, loader, chunk_size=400, chunk_overlap=50):
        """Build a FAISS-backed retriever from the documents a loader yields.

        Args:
            loader: Object exposing ``load()``; its result is passed to the splitter.
            chunk_size: ``chunk_size`` for ``RecursiveCharacterTextSplitter``.
            chunk_overlap: ``chunk_overlap`` for the splitter.

        Returns:
            Whatever ``FAISS.from_documents(...).as_retriever()`` returns.

        Raises:
            ValueError: If loading and splitting produce no chunks at all.
        """
        # get documents from data folder
        documents = loader.load()

        # split documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        texts = text_splitter.split_documents(documents)
        if not texts:
            raise ValueError("No text chunks to index: the loader returned no usable documents.")

        # create retriever
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        vector_store = FAISS.from_documents(texts, embeddings)
        return vector_store.as_retriever()

    def get_response(self, user_response, chat_history):
        """Draft a reply to ``user_response`` and rewrite it constitutionally.

        Args:
            user_response: Latest statement from the user.
            chat_history: Conversation so far; forwarded to the prompt and to
                the module-level ``get_whole_conv`` helper.

        Returns:
            The value returned by ``ConstitutionalChain.invoke`` for the draft.
        """
        # Only the 'information.pdf' index is needed: the chain below has no
        # "context" input, so no index over the 'data' directory is built.
        if getattr(self, "_information", None) is None:
            self._information = self.get_retriever(PyPDFLoader('information.pdf'))
        information = self._information

        # chain with memory
        # NOTE(review): `template` is the module-level name, which several
        # cells rebind; whichever assignment ran last is used — confirm it is
        # the interviewer prompt with {statement} and {chat_history}.
        chain_memory = (
            {
                "statement": itemgetter("statement"),
                "chat_history": itemgetter("chat_history"),
                # get_whole_conv receives the same input dict and reads its
                # "statement" and "chat_history" keys.
                "information": RunnableLambda(get_whole_conv) | information,
            }
            | ChatPromptTemplate.from_template(template)  # "context": retriever,
            | self.llm
            | StrOutputParser()
        )

        response1 = chain_memory.invoke({"statement": user_response, "chat_history": chat_history})

        template_mendez = """
        Your task is to rewrite a question so that it satisfies the Mendez principles.
        The input question to rewrite is: {input}.
        The output should only be the revised question. No other information should be included.
        Output:
        """
        qa_prompt = PromptTemplate(
            template=template_mendez,
            input_variables=["input"],
        )

        qa_chain = LLMChain(llm=self.llm, prompt=qa_prompt)

        constitutional_chain = ConstitutionalChain.from_llm(
            llm=self.llm,
            chain=qa_chain,
            constitutional_principles=[avoid_ambiguity, elaboration, plain_language],
        )

        response2 = constitutional_chain.invoke({"input": response1})
        print(f"response1: {response1} \n response2: {response2}")

        return response2
    
def get_whole_conv(dict):
    """Concatenate the chat history contents followed by the statement.

    Args:
        dict: Mapping with a "chat_history" list of messages (each a mapping
            with a "content" string) and a "statement" string. The parameter
            name shadows the builtin; it is kept so existing callers still work.

    Returns:
        All message contents joined in order without a separator, with the
        statement appended at the end.
    """
    # Build the text in one pass; str.join returns a new string and never
    # modifies the string it is called on.
    history_text = ''.join(message["content"] for message in dict["chat_history"])
    return history_text + dict["statement"]