In [2]:
from dotenv import load_dotenv
import streamlit as st
import os

from langchain_community.vectorstores import VectorStore, FAISS, Chroma
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, PyPDFDirectoryLoader
from langchain_community.llms import HuggingFaceEndpoint

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.chains import QAGenerationChain, ConversationalRetrievalChain
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain.memory import ConversationBufferMemory
from langchain.schema.runnable import RunnablePassthrough

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.output_parsers import StrOutputParser



In [5]:
# template
template = """
You are an interviewer for crime investigations. You are interviewing a witness to a crime regarding what they 
have witnessed. Which questions and the way you will ask these will be according to the following principles:
{context}

First you will give a brief overview of how the interview will go. Then you will ask the witness for their personal information:
1. What is your full name, date of birth, and place of birth?
2. What is your current address and city of residence?
3. What is your pasport number / ID number?

Only continue to the next question when a full answer in given to the previous question.

Before starting the interview, give the witness the instructions as specifified in {context}. Afterwards,
ask the witness to describe the events they witnessed:
1. What did you see and hear?
2. Where and when was this?
3. Ask the witness to describe the people involved.

For each question, the witness will respond with a response {input}. Ask the witness to elaborate if the response is not detailed enough.
At every stage, check for inconsistencies in the witness's story and ask for clarification if needed.
Use {chat_history} to keep track of the conversation.

"""

In [3]:
class Interviewer:
    def __init__(self):
        self.retriever = None

    def set_retriever(self):
        # get documents from data folder
        loader = PyPDFDirectoryLoader('data')
        documents = loader.load()

        # split documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
        texts = text_splitter.split_documents(documents)

        # create retriever
        embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
        vector_store = FAISS.from_documents(texts, embeddings)
        retriever = vector_store.as_retriever()
        self.retriever = retriever

    def get_conversation_chain(self):
        llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.2") # choose llm from huggingface
        prompt_template = ChatPromptTemplate.from_template(template) # create prompt template
        # create conversation chain with memory
        # memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)	
        conversation_chain = create_history_aware_retriever(
            llm=llm,
            retriever=self.retriever,
            prompt=prompt_template
        )
        return conversation_chain


In [6]:
bot = Interviewer()
bot.set_retriever()
chain = bot.get_conversation_chain()

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\moolhuijsenns\.cache\huggingface\token
Login successful


In [8]:
chain.invoke({"input": "Hello, I am a witness to a crime and I would like to report what I have seen."})

[Document(page_content='obtain and review as much evidence or information as is available – such as witness and \nvictim statements already taken, prior statements by a suspect, forensic reports, physical \nevidence, and electronic images and information. Interviewers should assess the relevance \nand reliability of the available information and identify information gaps that need to be', metadata={'source': 'data\\Mendez-Principles.pdf', 'page': 32}),
 Document(page_content='have the impression that their concern has not been met with the appropriate response.\n184. Anyone reporting a violation, such as a ‘whistle-blower’, should be provided adequate \nprotection from any form of reprisals or negative treatment.\n185. Other criminal justice professionals such as lawyers, prosecutors and judges who see, hear', metadata={'source': 'data\\Mendez-Principles.pdf', 'page': 47}),
 Document(page_content='• Have you covered the points needed to prove the \noffence(s) in question?\nNew Informat