In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

from llama_index import download_loader
from llama_index import VectorStoreIndex
from llama_index import ServiceContext

from llama_index import get_response_synthesizer
from llama_index.indices.vector_store.retrievers import VectorIndexRetriever
from llama_index.query_engine.retriever_query_engine import (
RetrieverQueryEngine,
)

from langchain.llms import OpenAI
from langchain.embeddings import SentenceTransformerEmbeddings

In [2]:
# Model configuration: embeddings are computed with a sentence-transformers
# model, text generation goes through the endpoint given by LLM_API_BASE.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
LLM_API_BASE = "http://192.168.48.33:1234/v1"

local_embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
llm = OpenAI(
    openai_api_base=LLM_API_BASE,
    openai_api_key="NULL",  # literal string "NULL"; presumably ignored by the endpoint - confirm
    temperature=0,
)

# Bundle both models so the index and the response synthesizer share them.
service_context = ServiceContext.from_defaults(embed_model=local_embeddings, llm=llm)

  from .autonotebook import tqdm as notebook_tqdm


# Data 

In [3]:
# Entry page of the site; the links found on it are collected and categorized below.
index_path = "http://192.168.48.22:8082/repository/qa/QMS.html"

# All links

In [4]:
def get_all_links(base_url, timeout=30):
    """Return every hyperlink on the page at ``base_url`` as an absolute URL.

    Args:
        base_url: URL of the page to fetch. It is also the base against which
            relative ``href`` values are resolved.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on an unresponsive
            host, which would hang the notebook.

    Returns:
        A list with one absolute URL per ``<a>`` tag that has an ``href``
        attribute, in document order. Duplicates are kept as-is.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(base_url, timeout=timeout)
    response.raise_for_status()  # Surface HTTP errors instead of parsing an error page

    soup = BeautifulSoup(response.content, 'html.parser')

    # Only anchors that actually carry an href are considered; urljoin turns
    # relative targets into absolute URLs based on base_url.
    return [urljoin(base_url, anchor['href']) for anchor in soup.find_all('a', href=True)]

In [5]:
# Collect every link found on the entry page (absolute URLs).
linked_urls = get_all_links(base_url=index_path)

In [9]:
links = linked_urls

# One bucket per recognised file suffix; links with any other ending are ignored.
links_by_suffix = {
    '.html': [],
    '.pdf': [],
    '.xlsx': [],
    '.xls': [],
    '.docx': [],
    '.doc': [],
    '.pptx': [],
    '.zip': [],
}

# Drop each link into the bucket whose suffix it ends with.
for link in links:
    for suffix, bucket in links_by_suffix.items():
        if link.endswith(suffix):
            bucket.append(link)
            break  # the suffixes are mutually exclusive, so stop at the first match

# Expose each bucket under the name the later cells use.
html_links = links_by_suffix['.html']
pdf_links = links_by_suffix['.pdf']
xlsx_links = links_by_suffix['.xlsx']
xls_links = links_by_suffix['.xls']
docx_links = links_by_suffix['.docx']
doc_links = links_by_suffix['.doc']
pptx_links = links_by_suffix['.pptx']
zip_links = links_by_suffix['.zip']

In [10]:
# Print the header followed by one HTML link per line.
print("HTML:", *html_links, sep="\n")

HTML:
http://192.168.48.22:8082/repository/qa/QMS.html
http://192.168.48.22:8082/repository/qa/Quality_Manual.html
http://192.168.48.22:8082/repository/qa/Kratos_Organization_Note.html
http://192.168.48.22:8082/repository/qa/Kratos_Diagram.html
http://qms-toulouse.kratos.us/QMS.html
http://qms-toulouse.kratos.us/internal/internal-process.html
http://192.168.48.22:8082/repository/qa/Document_Management.html
http://192.168.48.22:8082/repository/qa/Document_Publication.html
http://192.168.48.22:8082/repository/qa/Human_Resources.html
http://192.168.48.22:8082/repository/qa/Account_Management.html
http://192.168.48.22:8082/repository/qa/Infrastructure.html
http://192.168.48.22:8082/repository/qa/Project_Folder_Organization.html
http://192.168.48.22:8082/repository/qa/Configuration_Management.html
http://192.168.48.22:8082/repository/qa/Work%20Instructions/WI_Project_Library.html
http://192.168.48.22:8082/repository/qa/Configuration_Management_for_Projects.html
http://192.168.48.22:8082/rep

# Scrape

In [11]:
# Obtain the BeautifulSoupWebReader loader class by name via download_loader.
BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")

In [12]:
# Single reader instance, reused for every page loaded below.
loader = BeautifulSoupWebReader()

In [13]:
# Results of load_data, one entry per page that loaded successfully.
docs = []

# Pages are loaded one at a time so that a single bad URL does not abort the run.
for url in html_links:
    try:
        # load_data takes a list of URLs, hence the one-element list.
        page_docs = loader.load_data(urls=[url])
    except Exception as e:
        # Report the failing URL and keep going with the remaining pages.
        print(f"Error loading {url}: {e}")
    else:
        docs.append(page_docs)

Error loading http://qms-toulouse.kratos.us/QMS.html: One of the inputs is not a valid url: http://qms-toulouse.kratos.us/QMS.html
Error loading http://qms-toulouse.kratos.us/internal/internal-process.html: One of the inputs is not a valid url: http://qms-toulouse.kratos.us/internal/internal-process.html


In [14]:
# Sanity check: container type and number of pages that loaded successfully.
type(docs), len(docs)

(list, 40)

In [40]:
# Inspect the first entry; each entry is whatever load_data returned for one URL.
docs[0]

[Document(id_='e1d8c258-b609-495a-bc77-7d651e85f87e', embedding=None, metadata={'URL': 'http://192.168.48.22:8082/repository/qa/QMS.html'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='b475d0b49b905f17589e1d85d52e86f4f851905bbf49cd00de4c5abb1622554e', text='\n\n\n\n\n\nQuality Management System\n\n\n\n\n\n\nQuality Management System\n\n\nKratos Logo\n\n\nHomeGitlabPDF\n\n\n\n\nTable of Contents\n\n\nIntroduction\nQuality Management System\nReference Documents\nList of Processes\n\nDocuments\nHuman Resources\nInfrastructure\nOperations\nProposals and Contracts\nPurchasing\nQuality\nShipping and Receiving\nSupport\n\nTraining Materials\nHealth, Safety and Environment\nKratos Communications SAS Process Identification\nTemplate Version\n\n\n\n1 Introduction\nWelcome to Kratos QMS\nQuality Policy\n"Our goal at Kratos Communications SAS is to be the leading provider of products, systems, and services, tailored to our customers’ specific needs.\nOur 

# Test 

In [45]:
# Each entry of docs is itself a sequence of documents; merge them all into
# one flat list, preserving order.
flattened_docs = []
for page_docs in docs:
    flattened_docs.extend(page_docs)

In [46]:
# Preview at most the first five documents: position, type, and printed form.
for i in range(min(5, len(flattened_docs))):
    doc = flattened_docs[i]
    print(f"Document {i}: Type - {type(doc)}")
    print(doc)  # Or print specific attributes if needed


Document 0: Type - <class 'llama_index.schema.Document'>
Doc ID: e1d8c258-b609-495a-bc77-7d651e85f87e
Text: Quality Management System       Quality Management System
Kratos Logo   HomeGitlabPDF     Table of Contents   Introduction
Quality Management System Reference Documents List of Processes
Documents Human Resources Infrastructure Operations Proposals and
Contracts Purchasing Quality Shipping and Receiving Support  Training
Materials Health, Saf...
Document 1: Type - <class 'llama_index.schema.Document'>
Doc ID: 18c0c904-d91c-4813-8c45-7646256b882c
Text: Quality Manual       Quality Manual   Kratos Logo
HomeGitlabPDF     Table of Contents   Introduction References Terms
and Definitions Context of the Organization  Understanding the
organization and its contexts Understanding the needs and expectation
of interested Parties Determining the scope of the quality management
system Quality Managemen...
Document 2: Type - <class 'llama_index.schema.Document'>
Doc ID: 9a09c69f-4487-4dde-ba9

# Default vector store 

In [47]:
# Build the vector index over all loaded documents, using the embedding model
# and LLM configured in service_context.
index = VectorStoreIndex.from_documents(flattened_docs, service_context=service_context)

In [48]:
# Per-document bookkeeping kept by the index, keyed by source-document ID.
doc_info = index.ref_doc_info

In [49]:
# Count the source documents tracked by the index; this should match the
# number of documents that were passed in.
num_documents = len(doc_info)
print(f"Number of documents in the index: {num_documents}")

Number of documents in the index: 40


In [50]:
# Optional: dump the index's bookkeeping entry for every source document.
for doc_id in doc_info:
    info = doc_info[doc_id]
    print(f"Document ID: {doc_id}, Info: {info}")

Document ID: e1d8c258-b609-495a-bc77-7d651e85f87e, Info: RefDocInfo(node_ids=['22864390-08bb-4452-bd9f-f255c3ffa4fd', '245f4c97-5f4a-4a4d-a786-ac793a5b1de8'], metadata={'URL': 'http://192.168.48.22:8082/repository/qa/QMS.html'})
Document ID: 18c0c904-d91c-4813-8c45-7646256b882c, Info: RefDocInfo(node_ids=['375d07b1-86ae-4063-9f31-b99895086203', 'dfa5709b-dc92-44ea-bfa4-edd430777056', '6b8fa202-e8a3-459a-b373-43ffd67132f9', '2de06e12-b52b-4325-880c-8cf4dc2a394f', '2769e71a-5345-4461-9309-666086394255', '0db276d7-e3e6-417b-8266-2d3ae7b1d500', '93b4edb8-6b84-4c9a-b1a2-5dd549f014fa', 'aa4af12d-229a-437c-bf32-c64e5c5903d2', '51e53839-2ca5-40dc-864e-88db514ac6e6', 'b9345f84-5284-42cd-8864-1cc2d00ac932', 'e0fe2aed-a444-41f4-895b-486fe9333c96'], metadata={'URL': 'http://192.168.48.22:8082/repository/qa/Quality_Manual.html'})
Document ID: 9a09c69f-4487-4dde-ba9a-3f3652d25904, Info: RefDocInfo(node_ids=['c976abbb-3d20-434c-8532-3560cc0f7ef1', 'd42cbc96-849a-4d30-8242-d16117d03c55', 'dc9f42a8-25c

# Query 

In [51]:
# Query engine with the index's default settings.
query_engine = index.as_query_engine()

In [52]:
# Smoke test: ask about an acronym and show only the answer text.
response = query_engine.query("what is the wif ?")
response.response

'\n\nThe Warranty Inspection Form (WIF) is a document used to collect key information for the project. It is used in the receiving process to verify that the goods & services are compliant to the purpose.'

# Auto retriever

In [57]:
# Retriever settings: return the 3 most similar nodes, default query mode,
# and no extra filtering.
retriever_options = dict(
    similarity_top_k=3,
    vector_store_query_mode="default",
    filters=[],
    alpha=None,
    doc_ids=None,
)
retriever = VectorIndexRetriever(index=index, **retriever_options)

# Query engine that feeds the retrieved nodes to a response synthesizer built
# from the shared service context. This replaces the earlier query_engine.
response_synthesizer = get_response_synthesizer(service_context)
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

In [58]:
# Same question as asked of the default engine, now through the explicitly
# configured engine, so the two answers can be compared.
response = query_engine.query("what is the wif ?")
response.response

'\n\nThe Warranty Inspection Form (WIF) is a document used to collect key information for the project. It is used to verify that the goods & services are compliant to the purpose.'

In [59]:
# Broader, open-ended question.
response = query_engine.query("Tell me more about manager role ?")
response.response

'\n\nThe manager role is a critical one in any organization as they are responsible for leading and guiding their teams towards achieving specific goals and objectives. Managers play a key role in ensuring that projects are completed on time, within budget, and to the required quality standards. They also have to manage resources effectively, including personnel, equipment, and materials, to ensure that projects are completed successfully.\n\nIn addition to these responsibilities, managers must also be able to communicate effectively with their teams, stakeholders, and customers. This includes providing clear instructions, feedback, and guidance, as well as being able to listen actively to concerns and issues raised by team members and stakeholders.\n\nManagers must also be able to adapt to changing circumstances and be flexible in their approach to problem-solving. They must be able to think critically and make informed decisions that take into account the needs of the organization, a

In [60]:
# Practical "who do I ask" question.
response = query_engine.query("Who should I contact for a computer issue")
response.response

'\n\nYou can contact Loeki Vautrin, IT Technician at Loeki.Vautrin@kratosdefense.com or #153 for computer issues.'

In [61]:
# Show the full Response object, including the source nodes behind the answer.
response

Response(response='\n\nYou can contact Loeki Vautrin, IT Technician at Loeki.Vautrin@kratosdefense.com or #153 for computer issues.', source_nodes=[NodeWithScore(node=TextNode(id_='b285f5a8-5b29-457b-97fa-7b4cdbc3ad8a', embedding=None, metadata={'URL': 'http://192.168.48.22:8082/repository/qa/Receiving_Process.html'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='ba0a97ff-11e8-4aec-a94f-7651f3e50e77', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'URL': 'http://192.168.48.22:8082/repository/qa/Receiving_Process.html'}, hash='b052caf574856562e1518eb1161cfbd6bcc72185b3552ebcd2728c4219800d07'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='f46ae8c3-e5df-4e29-b640-31296bb0246d', node_type=<ObjectType.TEXT: '1'>, metadata={'URL': 'http://192.168.48.22:8082/repository/qa/Receiving_Process.html'}, hash='4b6bab6f1ce840018120af37b43b59e131d4ff0247ef6821ac76e0d734a23516'), <NodeRelationship.N