In [None]:
!pip install python-dotenv
!pip install llama-index transformers torch accelerate
!pip install llama-index-llms-huggingface
!pip install chromadb
!pip install python-dotenv

In [97]:
import os
from dotenv import load_dotenv
load_dotenv()
monday_api_token = os.getenv('MONDAY_API_KEY')

## Set up local embedding LLM

In [None]:
%pip install llama-index-embeddings-huggingface
%pip install llama-index-embeddings-instructor
%pip install sentence-transformers

In [101]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

## Web reader

In [None]:
!pip install llama-index-readers-web

In [None]:
from llama_index.readers.web import WholeSiteReader

# Initialize the scraper with a prefix URL and maximum depth
scraper = WholeSiteReader(
    prefix="https://www.highspeedtraining.co.uk", max_depth=10
)

# Start scraping from a base URL
documents = scraper.load_data(
    base_url="https://www.highspeedtraining.co.uk/hub/"
)  # Example base URL

## Monday reader

In [94]:
# Custom Monday API reader

from typing import Dict, List
import requests
import json
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

class MondayReader(BaseReader):
    """monday.com reader. Reads board's data by a GraphQL query."""
    def __init__(self, api_key: str) -> None:
        """Initialize monday.com reader."""
        self.api_key = api_key
        self.api_url = "https://api.monday.com/v2"

    def _parse_item_values(self, cv) -> Dict[str, str]:
        return {"title": cv.get("column", {}).get("title", ""), "value": cv.get("text", "")}

    def _parse_assets(self, assets) -> List[Dict[str, str]]:
        return [{"name": asset.get("name", ""), "url": asset.get("url", "")} for asset in assets]

    def _parse_updates(self, updates) -> List[Dict[str, str]]:
        return [{"body": update.get("body", ""), "created_at": update.get("created_at", "")} for update in updates]

    def _parse_data(self, item) -> Dict[str, any]:
        return {
            "id": item["id"],
            "name": item["name"],
            "values": [self._parse_item_values(cv) for cv in item["column_values"]],
            "assets": self._parse_assets(item.get("assets", [])),
            "updates": self._parse_updates(item.get("updates", [])),
            "subitems": [self._parse_data(subitem) for subitem in item.get("subitems", [])]
        }

    def _perform_request(self, board_id: int, cursor: str = None) -> Dict[str, any]:
        headers = {"Authorization": self.api_key}
        query = """
            query($boardId: ID!, $cursor: String) {
                boards(ids: [$boardId]) {
                    name,
                    items_page(limit: 100, cursor: $cursor) {
                        cursor
                        items {
                            id,
                            name,
                            column_values {
                                column { title }
                                text
                            }
                            assets {
                                name
                                url
                            }
                            updates {
                                body
                                created_at
                            }
                            subitems {
                                id,
                                name,
                                column_values {
                                    column { title }
                                    text
                                }
                                assets {
                                    name
                                    url
                                }
                                updates {
                                    body
                                    created_at
                                }
                            }
                        }
                    }
                }
            }
        """
        variables = {"boardId": board_id, "cursor": cursor}
        data = {"query": query, "variables": variables}
        response = requests.post(url=self.api_url, json=data, headers=headers)
        return response.json()

    def _subitem_to_dict(self, subitem):
        return {
            "id": subitem["id"],
            "name": subitem["name"],
            "values": [{"title": v["title"], "value": v["value"]} for v in subitem["values"]],
            "assets": [{"name": a["name"], "url": a["url"]} for a in subitem["assets"]],
            "updates": [{"body": u["body"], "created_at": u["created_at"]} for u in subitem["updates"]]
        }

    def load_data(self, board_id: int) -> List[Document]:
        """Load board data by board_id."""
        all_items = []
        cursor = None
        while True:
            json_response = self._perform_request(board_id, cursor)
            
            if "errors" in json_response:
                raise Exception(f"API Error: {json_response['errors']}")
            
            if "data" not in json_response or "boards" not in json_response["data"]:
                raise Exception("Unexpected API response structure")
            board_data = json_response["data"]["boards"][0]
            items_page = board_data["items_page"]
            
            all_items.extend(items_page["items"])
            
            if not items_page["cursor"]:
                break
            
            cursor = items_page["cursor"]
        parsed_items = [self._parse_data(item) for item in all_items]
        
        result = []
        for item in parsed_items:
            text = f"name: {item['name']}"
            for item_value in item["values"]:
                if item_value["value"]:
                    text += f", {item_value['title']}: {item_value['value']}"
            
            if item["assets"]:
                text += "\nAssets:"
                for asset in item["assets"]:
                    text += f"\n- {asset['name']}: {asset['url']}"
            
            if item["updates"]:
                text += "\nUpdates:"
                for update in item["updates"]:
                    text += f"\n- {update['created_at']}: {update['body']}"
            
            if item["subitems"]:
                text += "\nSubitems:"
                for subitem in item["subitems"]:
                    text += f"\n- {subitem['name']}"
                    for subitem_value in subitem["values"]:
                        if subitem_value["value"]:
                            text += f", {subitem_value['title']}: {subitem_value['value']}"

            result.append(
                Document(
                    text=text,
                    extra_info={
                        "board_id": board_id,
                        "item_id": item["id"],
                        "subitems_count": len(item["subitems"]),
                        "subitems_json": json.dumps([self._subitem_to_dict(subitem) for subitem in item["subitems"]])
                    }
                )
            )
        
        return result

In [98]:
# Monday API connection
board_id = 1564566045
reader = MondayReader(monday_api_token)
raw_response = reader._perform_request(board_id)
documents = reader.load_data(board_id)

## Local vector store

In [None]:
!pip install llama-index-llms-openai

In [23]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'False'

In [20]:
# Setup database
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("site")

In [28]:
# Create index 
from llama_index.llms.openai import OpenAI
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Set up OpenAI LLM
llm = OpenAI(model="gpt-3.5-turbo")

index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model,
    storage_context=storage_context,
    llm=llm
)

## Supabase vector store

In [None]:
!pip install llama-index-vector-stores-supabase

In [104]:
from llama_index.vector_stores.supabase import SupabaseVectorStore

# Substitute your connection string here
DB_CONNECTION = os.getenv('DB_CONNECTION')

vector_store = SupabaseVectorStore(
    postgres_connection_string=DB_CONNECTION, 
    collection_name='knowledgebase'
)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build index from documents

index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

## Select collection

In [156]:
DB_CONNECTION = os.getenv('DB_CONNECTION')

vector_store = SupabaseVectorStore(
    postgres_connection_string=DB_CONNECTION, 
    collection_name='knowledgebase'
)

index = VectorStoreIndex.from_vector_store(vector_store)

**********
Trace: index_construction
**********


## Basic query engine

In [151]:
query_engine = index.as_query_engine()

In [158]:
response = query_engine.query("What is HAACP?")
print(response)



**********
Trace: query
    |_CBEventType.QUERY -> 1.495382 seconds
      |_CBEventType.RETRIEVE -> 0.607878 seconds
        |_CBEventType.EMBEDDING -> 0.322019 seconds
      |_CBEventType.SYNTHESIZE -> 0.886264 seconds
        |_CBEventType.TEMPLATING -> 3.7e-05 seconds
        |_CBEventType.LLM -> 0.876407 seconds
**********
HACCP is a system that helps food business operators look at how they handle food and introduces procedures to make sure the food produced is safe to eat.


## Sub-question query engine

In [160]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.core import Settings
import nest_asyncio

nest_asyncio.apply()

llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

Settings.callback_manager = callback_manager

# setup base query engine as tool
query_engine_tools = [
    QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name="knowledgebase",
            description="HST knowledgebase",
        ),
    ),
]

query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    use_async=True,
)

In [161]:
response = query_engine.query(
    "What are the differences between the levels 1,2 and 3 food hygiene courses. Is there a level 4 course? Please outline relevant audiences, course titles and prices"
)
print(response)

Generated 5 sub questions.
[1;3;38;2;237;90;200m[knowledgebase] Q: What are the differences between the levels 1, 2, and 3 food hygiene courses?
[0m[1;3;38;2;90;149;237m[knowledgebase] Q: Is there a level 4 food hygiene course available?
[0m[1;3;38;2;11;159;203m[knowledgebase] Q: What are the relevant audiences for the food hygiene courses?
[0m[1;3;38;2;155;135;227m[knowledgebase] Q: What are the course titles for the food hygiene courses?
[0m[1;3;38;2;237;90;200m[knowledgebase] Q: What are the prices of the food hygiene courses?
[0m[1;3;38;2;237;90;200m[knowledgebase] A: £20+VAT
[0m[1;3;38;2;155;135;227m[knowledgebase] A: Catering, Retail, and Manufacturing.
[0m[1;3;38;2;11;159;203m[knowledgebase] A: The relevant audiences for the food hygiene courses are food handlers working in catering, retail, or manufacturing environments.
[0m[1;3;38;2;90;149;237m[knowledgebase] A: There is no mention of a level 4 food hygiene course in the provided context information.
[0m[1;3

In [126]:
print("Full response object:", response)
print("\nResponse content:", response.response)
print("\nSource nodes:")
for node in response.source_nodes:
    print(f"- Node content: {node.node.text}")
    print(f"  Node score: {node.score}")
    #print(f"  Node document ID: {node.node.doc_id}")

# Metadata and other attributes
print("\nResponse metadata:", response.metadata)
print("Extra info:", response.extra_info)

Full response object: The main things to be aware of when handling abrasive wheels are ensuring the wheel is suitable for the machine and material, inspecting the wheel for damage, properly mounting and securing the wheel, using appropriate personal protective equipment, and following correct procedures for starting and stopping the machine. Regular training on safe handling practices and maintenance is also important.

Response content: The main things to be aware of when handling abrasive wheels are ensuring the wheel is suitable for the machine and material, inspecting the wheel for damage, properly mounting and securing the wheel, using appropriate personal protective equipment, and following correct procedures for starting and stopping the machine. Regular training on safe handling practices and maintenance is also important.

Source nodes:
- Node content: Sub question: What are the safety guidelines for handling abrasive wheels?
Response: The safety guidelines for handling abrasi

AttributeError: 'Response' object has no attribute 'extra_info'

## Citations

In [86]:
from llama_index.core.query_engine import CitationQueryEngine

In [162]:


citation_query_engine = CitationQueryEngine.from_args(
    index,
    similarity_top_k=3,
    # here we can control how granular citation sources are, the default is 512
    citation_chunk_size=512,
)

In [163]:
response = citation_query_engine.query("What is Natasha's Law?")
print(response)

**********
Trace: query
    |_CBEventType.QUERY -> 1.838546 seconds
      |_CBEventType.RETRIEVE -> 0.585612 seconds
      |_CBEventType.SYNTHESIZE -> 1.252355 seconds
        |_CBEventType.TEMPLATING -> 1.2e-05 seconds
        |_CBEventType.LLM -> 1.242375 seconds
**********
Natasha's Law is a new food labelling legislation that requires all foods produced and packed for sale at the same premises to be labeled with a full list of ingredients. This law was created after Natasha Ednan-Laperouse had a fatal allergic reaction, and it aims to provide greater transparency around food labelling requirements [1].


In [136]:
for i in range(len(response.source_nodes)):
    print(response.source_nodes[i].node.get_text())

Source 1:
Like all modern websites, we are using cookies to ensure our website works, provide you with a personalised experience, and help us make our website better for everyone.
Learn more
Let me decide I'm okay with this
(0)
New accreditation - View our range of City & Guilds Assured courses
Level 3 Food Hygiene Course in Catering
Duration 8-10 hours
Last audited 5th June 2023
For me
For teams
100% online training
Start when you like
Learn on any device (desktop, mobile or tablet)
Instant assessment and result
£130 +VAT
INCLUDE VAT
Add To Basket
This Level 3 Food Hygiene Training Course is designed for managers and supervisors in the catering industry to help them understand their essential day-to-day responsibilities, including how to implement the basics of a HACCP food safety management system.
The course provides knowledge of food hygiene practices and legal responsibilities and gives further detail on the controls that can be implemented to ensure that the food handling process

In [79]:
chat_engine = index.as_chat_engine()

In [89]:
chat_engine.chat_repl()

===== Entering Chat REPL =====
Type "exit" to exit.



Human:  What is allergen awareness?


Assistant: Allergen awareness involves understanding the potential severe outcomes of hypersensitivities and the importance of addressing allergies properly. It is crucial for businesses to be transparent, inclusive, and proactive in managing allergens to ensure the safety of customers. This includes effective communication, staff training on food allergens, and providing accurate allergen information to customers to create a safe dining experience for individuals with food hypersensitivities.



Human:  please explain in more detail


Assistant: Allergen awareness is the understanding and acknowledgment of the potential severe reactions that can occur in individuals with allergies. It involves recognizing the importance of addressing allergies properly to ensure the safety and well-being of individuals with food hypersensitivities. 

Businesses play a significant role in allergen awareness by being transparent, inclusive, and proactive in managing allergens. This includes:

1. Effective Communication: Businesses need to communicate clearly with customers about the presence of allergens in their products or dishes. This can be done through menu labeling, signage, and verbal communication with customers.

2. Staff Training: It is essential for businesses to provide comprehensive training to their staff on food allergens. Staff members should be knowledgeable about common allergens, cross-contamination risks, and how to handle allergen-related inquiries from customers.

3. Providing Accurate Allergen Information: Busin

KeyboardInterrupt: Interrupted by user