# OPERATIONALIZE FRAMEWORK

## 1. Imports

In [64]:
import string

from files_to_database import main_to_database
from functools import partial
from operator import itemgetter

import chromadb
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.memory import ConversationBufferMemory
from langchain.storage import create_kv_docstore, LocalFileStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain_core.messages import get_buffer_string
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFaceHub
from langchain_core.prompts import ChatPromptTemplate, format_document, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from getpass import getpass
import os
import pandas as pd
from langchain_core.runnables import RunnableParallel
from langchain.retrievers import ParentDocumentRetriever
from tqdm import tqdm

from transformers import pipeline

## 2. Embedding models (local)

In [2]:
# # For Apple Silicon users: run the following code to make use of MPS (Apple's Metal Performance Shaders) for faster computation
# import torch
# 
# # set device to MPS
# device = torch.device("mps")
# 
# # empty cache and set memory fraction
# torch.mps.empty_cache()
# torch.mps.set_per_process_memory_fraction(0.9)
# 
# # choose embeddings model
# multilingual = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# 
# # local embedding model, download to cache folder
# embedding_model = SentenceTransformer(multilingual, cache_folder="../Data/sentence_transformers", device=device)
# 
# embeddings_retrieve = HuggingFaceEmbeddings(model_name=multilingual, cache_folder="../Data/sentence_transformers")
# 
# # move model to MPS
# embedding_model.to(device)

In [3]:
# Hugging Face id of the embedding checkpoint; it is downloaded to the cache folder below
multilingual = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# SentenceTransformer instance used for document embedding
embedding_model = SentenceTransformer(
    cache_folder="../Data/sentence_transformers",
    model_name_or_path=multilingual,
)

In [4]:
# LangChain embedding wrapper for the same checkpoint, used for query embedding
embeddings_retrieve = HuggingFaceEmbeddings(
    cache_folder="../Data/sentence_transformers",
    model_name=multilingual,
)

## 3. Chroma client setup

In [5]:
# initiate the chroma client, which is the interface to the database
database_path = "../Data/my_vectordb"
chroma_client = chromadb.PersistentClient(path=database_path)

In [6]:
# print the collections
chroma_client.list_collections()

[Collection(name=rijksoverheid),
 Collection(name=ibestuur),
 Collection(name=tenderned),
 Collection(name=binnenlands_bestuur)]

## 4. Retrievers for all databases

4.1 Parent / child splitters

In [7]:
# parent and child splitters for the retrievers.
# MAKE SURE TO CHANGE THESE IN files_to_database.py AS WELL.
parent_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
child_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=128)

4.2 Rijksoverheid retriever

In [8]:
# Chroma vector store bound to the "rijksoverheid" collection
rijksoverheid_db = Chroma(
    client=chroma_client,
    collection_name="rijksoverheid",
    embedding_function=embeddings_retrieve,
    persist_directory="../Data/my_vectordb",
)

In [9]:
columns_to_embed = ["content"]
columns_to_metadata = ["id", "type", "title", "canonical", "introduction", "lastmodified", "available", "initialdate"]

# rijksoverheid retriever
full_path = os.path.abspath("../Data/my_vectordb/full_documents/rijksoverheid")

# file-backed key/value store holding the full parent documents
fs = LocalFileStore(full_path)
store = create_kv_docstore(fs)

rijksoverheid_db_retriever = ParentDocumentRetriever(
    vectorstore=rijksoverheid_db,
    docstore=store,
    child_splitter=child_splitter,
    # The retriever field is `child_metadata_fields` (plural). The previous
    # singular spelling was not a declared field, so the list never took effect.
    child_metadata_fields=columns_to_metadata,
    parent_splitter=parent_splitter,
)

4.3 ibestuur retriever

In [10]:
# Chroma vector store bound to the "ibestuur" collection
ibestuur_db = Chroma(
    client=chroma_client,
    collection_name="ibestuur",
    embedding_function=embeddings_retrieve,
    persist_directory="../Data/my_vectordb",
)

In [11]:
# ibestuur retriever: k=1, so one document is requested per query
ibestuur_retriever = ibestuur_db.as_retriever(search_kwargs=dict(k=1))

4.4 binnenlandsbestuur retriever

In [12]:
# Chroma vector store bound to the "binnenlands_bestuur" collection
binnenlandsbestuur_db = Chroma(
    client=chroma_client,
    collection_name="binnenlands_bestuur",
    embedding_function=embeddings_retrieve,
    persist_directory="../Data/my_vectordb",
)

In [13]:
# binnenlandsbestuur retriever: k=1, so one document is requested per query
binnenlandsbestuur_retriever = binnenlandsbestuur_db.as_retriever(search_kwargs=dict(k=1))

4.5 tenderned retriever

In [14]:
# tenderned database

## 5. LLM from Inference Endpoints API

In [15]:
# Make sure to replace these values with your personal API URL and KEY
# API_URL = "https://oi6h8u843v8nt5qt.eu-west-1.aws.endpoints.huggingface.cloud"
# API_KEY = getpass("Enter your API KEY:")

In [16]:
# LLM model (Hugging Face)
# llm = HuggingFaceEndpoint(
#     endpoint_url=API_URL,
#     huggingfacehub_api_token=API_KEY,
#     temperature=0.1,
#     max_new_tokens=2048,
#     model_kwargs={"max_input_length": 2048, "max_length": 2048, "max_num_tokens": 2048}
# )

In [17]:
# prompt for the Hugging Face token and also expose it through the environment
HF_token = getpass("Enter your Hugging Face API Token:")
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_token

# LLM served from the Hugging Face Hub
llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-alpha",
    huggingfacehub_api_token=HF_token,
    model_kwargs=dict(temperature=0.5, max_new_tokens=512, max_length=64),
)
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# llm = HuggingFacePipeline.from_model_id(
#     model_id="gpt2",
#     task="text-generation",
#     pipeline_kwargs={"max_new_tokens": 128}
# )

  warn_deprecated(


## 7. Load Framework

In [18]:
# load only the three named columns so the extra unnamed column is not read
framework = pd.read_csv(
    "../Results/framework_questions_translated.csv",
    usecols=[
        ' #',
        '2022 GTMI Indicators & Sub-indicators NL',
        'Response options & Data format NL',
    ],
)

In [19]:
framework = framework.head()

In [20]:
framework

Unnamed: 0,#,2022 GTMI Indicators & Sub-indicators NL,Response options & Data format NL
0,I-1,Is er een gedeeld cloud platform beschikbaar v...,"0= Nee, 1= Alleen cloud strategie/beleid (nog ..."
1,I-1.1,Naam van het Overheids Cloud platform,Tekst
2,I-1.2,Cloud platform / strategie URL,URL
3,I-1.3,Overheids Cloud gelanceerd / zal worden gelanc...,YYYY
4,I-1.4,Type beschikbaar cloud platform,"0= Onbekend, 1= Publiek (Commercieel), 2= Priv..."


## 6. Define functions to operationalize framework

6.1 Generate prompt

In [61]:
# Chat prompt template (Dutch): the assistant instructions followed by the
# {context}, {data_format} and {question} placeholders.
# A stray double quote that followed the instructions has been removed so it
# no longer ends up in the prompt text.
template = """
Je bent 'GovTech-GPT', een geavanceerde AI-assistent met uitgebreide expertise in digitale technologieën specifiek gericht op toepassingen binnen de Nederlandse overheid. Je belangrijkste taak is het ondersteunen bij het operationaliseren van e-gov benchmarking frameworks. Je antwoordt altijd op basis van de meest recente gegevens en inzichten, en houdt rekening met de specifieke context van de Nederlandse overheid. Antwoorden geef je alleen volgens het gespecificeerde dataformat, waarbij je, indien mogelijk, het cijfer gebruikt en niet de tekst. Voeg verder geen enkele tekst, toelichting of uitleg meer toe. Als je het antwoord niet weet, geef je geen fictieve informatie of uitleg, maar antwoord enkel en alleen met: 'Geen antwoord.'  \n\n

CONTEXT: {context}

DATA FORMAT: {data_format}

VRAAG: {question}

ANTWOORD: 
"""

In [22]:
# chat prompt template
prompt = ChatPromptTemplate.from_template(template)

In [23]:
# output parser
output_parser = StrOutputParser()

6.2 Format context

In [24]:
# format the context for input
def format_context(context):
    """Concatenate retrieved passages into a single newline-terminated string.

    Only the ``context_ibestuur`` and ``context_rijksoverheid`` entries of
    ``context`` are read, in that order. Each item is converted with
    ``dict()`` and its ``page_content`` value is appended, followed by a
    newline.

    Args:
        context: mapping holding a sequence of documents under each of the
            two keys above.

    Returns:
        str: all page contents, each terminated by ``"\\n"``.
    """
    passages = []
    for source_key in ("context_ibestuur", "context_rijksoverheid"):
        for document in context[source_key]:
            passages.append(f"{dict(document)['page_content']}\n")
    return "".join(passages)

6.3 Retrieval setup

6.4 Chain setup

### with context

In [79]:
question = framework['2022 GTMI Indicators & Sub-indicators NL'].iloc[0]
data_format1 = framework['Response options & Data format NL'].iloc[0]

In [83]:
# mapping used to fill in the template only partially
class FormatDict(dict):
    """dict that renders an unknown key back as its ``{key}`` placeholder."""

    def __missing__(self, key):
        # re-emit the placeholder text instead of raising KeyError
        return "".join(("{", key, "}"))

In [89]:
def return_prompt(template, data_format):
    """Build a chat prompt with the ``{data_format}`` placeholder pre-filled.

    The template is formatted with a ``FormatDict`` that only knows
    ``data_format``; every other placeholder is written back unchanged and
    stays a variable of the resulting prompt.

    Args:
        template: format string containing ``{...}`` placeholders.
        data_format: value substituted for ``{data_format}``.

    Returns:
        The result of ``ChatPromptTemplate.from_template`` on the partially
        filled string.
    """
    partially_filled = string.Formatter().vformat(
        template, (), FormatDict(data_format=data_format)
    )
    return ChatPromptTemplate.from_template(partially_filled)
    

def format_docs(docs):
    """Join the page contents of ``docs`` into one context string.

    Document metadata is not read, so documents without a ``canonical``
    entry no longer raise ``KeyError`` (the removed URL list was never used).

    Args:
        docs: iterable of objects exposing a ``page_content`` string.

    Returns:
        str: the page contents separated by ``"\\n\\n ------------"``;
        an empty string when ``docs`` is empty.
    """
    return "\n\n ------------".join(doc.page_content for doc in docs)

def retrieve_answer(output):
    """Return the text of a model response.

    Message-like objects expose their text as ``.content``, while plain LLMs
    such as the ``HuggingFaceHub`` model used in this notebook return a bare
    string. Both shapes are accepted so the chain does not fail with
    ``AttributeError`` on string output.

    Args:
        output: a message-like object with a ``content`` attribute, or the
            response itself.

    Returns:
        ``output.content`` when that attribute exists, otherwise ``output``
        unchanged.
    """
    return getattr(output, "content", output)

In [None]:
prompt = return_prompt(template, data_format1)

In [98]:
# Variation: retrieve and format the context first, then answer while
# carrying the question and the context through to the output
rag_chain = (
    RunnableParallel(
        context=rijksoverheid_db_retriever | format_docs,
        question=RunnablePassthrough(),
    )
    | RunnableParallel(
        answer=prompt | llm | retrieve_answer,
        question=itemgetter("question"),
        context=itemgetter("context"),
    )
)

In [99]:
from langchain.globals import set_debug

set_debug(True)

rag_chain.invoke("When was SVM invented?")

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "When was SVM invented?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<context,question>] Entering Chain run with input:
[0m{
  "input": "When was SVM invented?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<context,question> > 3:chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "When was SVM invented?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<context,question> > 3:chain:RunnablePassthrough] [0ms] Exiting Chain run with output:
[0m{
  "output": "When was SVM invented?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<context,question> > 4:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "When was SVM invented?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain

HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-alpha (Request ID: mlGEdV62qSGawZ-T8v0jv)

Rate limit reached. You reached free usage limit (reset hourly). Please subscribe to a plan at https://huggingface.co/pricing to use the API at this rate

### test 4

In [100]:
from langchain.chains import RetrievalQA

# "stuff"-type RetrievalQA chain over the rijksoverheid retriever that also
# returns the source documents
qa_chain = RetrievalQA.from_chain_type(
    retriever=rijksoverheid_db_retriever,
    llm=llm,
    return_source_documents=True,
    chain_type="stuff",
)
     

## Cite sources
def process_llm_response(llm_response):
    """Print the answer, a 'Sources:' header and one source per line.

    Args:
        llm_response: mapping with a ``'result'`` entry and a
            ``'source_documents'`` iterable whose items expose
            ``metadata['source']``.
    """
    print(llm_response['result'], '\n\nSources:', sep='\n')
    for document in llm_response["source_documents"]:
        print(document.metadata['source'])

In [103]:
print(qa_chain.combine_documents_chain.llm_chain.prompt)

input_variables=['context', 'question'] template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"


In [101]:
# full example
query = "How much money did Pando raise?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

  warn_deprecated(


[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "How much money did Pando raise?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "How much money did Pando raise?",
  "context": ".000 Generiek Kantoor  & Toezicht Het opleveren van de generieke voorzieningen die  benodigd zijn voor de implementatie van wetgeving  welke zich richt op het verbod voor handelaren om  transacties boven €3.000 in contanten te verrichten. Q1 2023 t/m   Q4 2024 Uitbreiding artikel  19 Invorderingswet Inning &  Betalingsverkeer Verbeteren en verder automatiseren koppelingen met  banken voor beslagleggen op banktegoeden, uitvloeisel  uitbreiding art 19 invorderingswet ntb Wetgeving (III/III) 3 Geprioriteerde  projecten 

HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-alpha (Request ID: B4hYM-yM0udUYnHoNsCrd)

Rate limit reached. You reached free usage limit (reset hourly). Please subscribe to a plan at https://huggingface.co/pricing to use the API at this rate

## 8. Operationalize framework functions

### Test

In [None]:
question = framework['2022 GTMI Indicators & Sub-indicators NL'].iloc[0]
data_format = framework['Response options & Data format NL'].iloc[0]

### without context:

In [None]:
from langchain.globals import set_verbose, set_debug

set_debug(True)
# NOTE(review): `simple_chain` is not defined in any visible cell of this
# notebook, so this line raises NameError on a fresh kernel unless it is
# defined elsewhere -- confirm or remove this cell.
response = simple_chain.invoke({"data_format":data_format, "question":question})

### with context:

### Test 3

# Definitive implementation

In [None]:
def get_main_indicator(index):
    """Return the part of an indicator id that precedes its first dot.

    Ids without a dot are returned unchanged, e.g. ``'1.2'`` gives ``'1'``
    and ``'1'`` gives ``'1'``.
    """
    return index.split('.', 1)[0] if '.' in index else index

In [None]:
def generate_question(indicator, sub_indicator):
    """Compose the Dutch question text for an indicator.

    Args:
        indicator: text of the main indicator.
        sub_indicator: text of the sub-indicator; any falsy value means the
            question is about the main indicator only.

    Returns:
        str: ``"<indicator> ?"`` without a sub-indicator, otherwise the
        main indicator followed by a follow-up asking for the sub-indicator.
    """
    if not sub_indicator:
        return f"{indicator} ?"
    return f"{indicator}, indien ja, wat is de {sub_indicator} ?"

In [None]:
def operationalize_framework(framework):
    """Ask the RAG chain every framework question and collect the answers.

    Args:
        framework: DataFrame with the columns ' #' (indicator id such as
            'I-1' or 'I-1.2'), '2022 GTMI Indicators & Sub-indicators NL'
            and 'Response options & Data format NL'.

    Returns:
        A copy of ``framework`` whose ids have 'I-' removed, extended with
        three columns: 'Operationalisatie' (answer text after the last
        'Answer: ' marker), 'Prompt' (answer text before the first marker)
        and 'Context' (the chain's ``context`` value).

    Raises:
        IndexError: if the main indicator of a sub-indicator row is not
            present in ``framework``.
    """
    id_column = ' #'
    question_column = '2022 GTMI Indicators & Sub-indicators NL'
    format_column = 'Response options & Data format NL'

    framework_operationalized = framework.copy()
    # Create every result column up front (object dtype) so each one exists
    # even for an empty framework and can hold arbitrary Python objects.
    framework_operationalized["Operationalisatie"] = None
    framework_operationalized['Prompt'] = None
    framework_operationalized['Context'] = None
    framework_operationalized[id_column] = framework_operationalized[id_column].str.replace('I-', '')

    for index, row in tqdm(framework_operationalized.iterrows(), total=framework_operationalized.shape[0]):
        idx = row[id_column]
        main_indicator_idx = get_main_indicator(idx)
        sub_indicator_idx = idx.split('.')[1] if '.' in idx else None
        indicator_info = row[question_column]
        data_format = row[format_column]

        if sub_indicator_idx:
            # look up the text of the main indicator this row belongs to
            is_main = framework_operationalized[id_column] == main_indicator_idx
            main_indicator = framework_operationalized.loc[is_main, question_column].iloc[0]
            question = generate_question(main_indicator, indicator_info)
        else:
            question = generate_question(indicator_info, None)

        # NOTE(review): the second positional argument of Runnable.invoke is
        # `config`, so `data_format` most likely never reaches the prompt
        # here -- confirm and pass it as part of the chain input instead.
        output = rag_chain_with_source.invoke(question, {"data_format": data_format})

        # Write per row with .at so a dict/list context is stored as a single
        # cell value. The earlier store of the whole `output` dict was
        # overwritten immediately and has been dropped.
        answer_text = output["answer"]
        framework_operationalized.at[index, 'Operationalisatie'] = answer_text.split("Answer: ")[-1].strip()
        framework_operationalized.at[index, 'Prompt'] = answer_text.split("Answer: ")[0].strip()
        framework_operationalized.at[index, 'Context'] = output["context"]

    return framework_operationalized

## 9. Run the operationalizer

In [None]:
from langchain.globals import set_debug

set_debug(True)

In [None]:
df = operationalize_framework(framework)

In [None]:
df

In [None]:
for index, row in df.iterrows():
	print(f"PROMPT: {row['Prompt']}")
	print(f"{row['Operationalisatie']}")
	print("\n")

In [None]:
# questions list to ask the model
questions_list = [
    "What is the role of the government in the Netherlands?",
    "What is the role of AI in the Netherlands?",
    "How many AI startups are there in the Netherlands?"
]

In [None]:
# fill the dataframe with answers and context
def fill_framework(questions):
    """Ask the RAG chain each question and tabulate the results.

    Args:
        questions: iterable of question strings.

    Returns:
        pandas.DataFrame with one row per question and the columns
        'Question', 'Context' (the chain's ``context`` value) and 'Answer'
        (text after the last 'Answer: ' marker, stripped).
    """
    rows = []
    for question in questions:
        response = rag_chain_with_source.invoke(question)
        # Append the record itself. A stray trailing comma used to wrap each
        # record in a 1-tuple that then had to be flattened again.
        rows.append(
            {
                "Question": question,
                "Context": response["context"],
                "Answer": response["answer"].split("Answer: ")[-1].strip(),
            }
        )
    return pd.DataFrame(rows)

In [None]:
df = fill_framework(questions_list)

In [None]:
df

In [None]:
# print context
print(df["Context"][0])

# NOTES

### test retrieval parent/child

In [None]:
sub_docs = rijksoverheid_db.similarity_search("Informatiebeveiliging")
print("Child Splits:\n\n", sub_docs[0], "\n\n", sub_docs[1])

In [None]:
# Fetch documents through the parent-document retriever for the same query
# and show the first two (previously index 1 was printed twice).
retrieved_docs = rijksoverheid_db_retriever.invoke("Informatiebeveiliging")
print("Parent Splits:\n\n", retrieved_docs[0], "\n\n", retrieved_docs[1])

### CHAIN for single retriever test

In [None]:
# Define the chain. The input is passed through under the key 'question'
# because both prompt templates in this notebook use the {question}
# placeholder; the former 'query' key left that variable unfilled.
chain = (
    {'context': rijksoverheid_db_retriever, 'question': RunnablePassthrough()}
    | prompt
    | llm
    | output_parser
)

In [None]:
# print the chain
chain.get_graph().print_ascii()

In [None]:
query = "What is the definition of Artificial Intelligence"

response = chain.invoke(query)

In [None]:
print(response)

### Template

In [None]:
# chat prompt template (English) with {context} and {question} placeholders.
# It ends with the "Answer: " marker that operationalize_framework and
# fill_framework split the model output on.
prompt_str = """Answer the question below using the context:

Context:
{context}

Question: {question}

Answer: """

# chat prompt
# NOTE(review): this rebinds the module-level `prompt`, replacing any prompt
# built in an earlier cell -- confirm that this is intended.
prompt = ChatPromptTemplate.from_template(prompt_str)

## Local LLM (for testing)

In [None]:
# retrieval step: feed the same input to the three retrievers and pass the
# input itself through unchanged under "question"
retrieval = RunnableParallel(
    {
        "context_ibestuur": ibestuur_retriever, 
        "context_rijksoverheid": rijksoverheid_db_retriever, 
        "context_binnenlandsbestuur": binnenlandsbestuur_retriever,
        "question": RunnablePassthrough()
    }
)

In [None]:
# Chain to generate answer: replace "context" with the string built by
# format_context (which reads only the ibestuur and rijksoverheid entries),
# then run prompt -> llm -> output parser
chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_context(x["context"])))
        | prompt 
        | llm 
        | output_parser
)

# Chain to include used sources + answer: the output keeps "context",
# "question" and "data_format" and adds "answer" from chain_from_docs.
# NOTE(review): "question" and "data_format" are both RunnablePassthrough, so
# they receive the same input value -- confirm how data_format is meant to be
# supplied.
rag_chain_with_source = RunnableParallel(
    {"context": retrieval, 
     "question": RunnablePassthrough(), 
     "data_format": RunnablePassthrough()}
).assign(answer=chain_from_docs)