In [21]:
# Import dependencies and initate model
import os
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
import openai
from getpass import getpass
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv('.env')
openai.api_key = os.environ['OPENAI_API_KEY']

import logging
# Add logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

llm_gpt35 = OpenAI(
    model="gpt-35-turbo",
)

llm_gpt4 = OpenAI(
    model="gpt-4",
)

# You need to deploy your own embedding model as well as your own chat completion model
embed_model = OpenAIEmbedding(
    model="text-embedding-ada-002",
)

from llama_index.core import Settings

Settings.llm = llm_gpt4
Settings.embed_model = embed_model

In [10]:
# bring in our LLAMA_CLOUD_API_KEY
from dotenv import load_dotenv
import os
_ = load_dotenv('.env')
llamaparse_api = os.environ['LLAMA_CLOUD_API_KEY']
import nest_asyncio

nest_asyncio.apply()

# bring in deps
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader

# set up parser
parser = LlamaParse(
    api_key=llamaparse_api,
    result_type="markdown",  # "markdown" and "text" are available
    verbose=True
)

file = "/Users/luchaojin/Library/CloudStorage/OneDrive-Personal/RAG Product/voice-rag/data/CA-1039.pdf"

# use SimpleDirectoryReader to parse our file
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(input_files=[file], file_extractor=file_extractor).load_data()
print(documents)

2024-09-25 22:22:45,401 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 760b4943-4029-46b1-966d-07733aaa83a2


2024-09-25 22:22:46,623 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/760b4943-4029-46b1-966d-07733aaa83a2 "HTTP/1.1 200 OK"
2024-09-25 22:22:46,865 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/760b4943-4029-46b1-966d-07733aaa83a2/result/markdown "HTTP/1.1 200 OK"


[Document(id_='4c55994b-e1e1-4bb4-9e7a-d431f73d9bf2', embedding=None, metadata={'file_path': '/Users/luchaojin/Library/CloudStorage/OneDrive-Personal/RAG Product/voice-rag/data/CA-1039.pdf', 'file_name': 'CA-1039.pdf', 'file_type': 'application/pdf', 'file_size': 986437, 'creation_date': '2024-09-25', 'last_modified_date': '2024-09-25'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='# CHERRY MAXIBOLT® PROCESS MANUAL', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), Document(id_='0ae3f540-b3a9-4218-bb7c-eee0935d8f11', embedding=None, metadata={'file_path': '/Users/luchaojin/Library/CloudStorage/OneDrive-Personal/RAG Product/voice-rag

In [None]:
documents_update = []
for index, document in enumerate(documents):
    document.metadata.update({"page_number": index+1})
    #print(document.metadata)
    documents_update.append(document)

In [30]:
documents_update[5].metadata

{'file_path': '/Users/luchaojin/Library/CloudStorage/OneDrive-Personal/RAG Product/voice-rag/data/CA-1039.pdf',
 'file_name': 'CA-1039.pdf',
 'file_type': 'application/pdf',
 'file_size': 986437,
 'creation_date': '2024-09-25',
 'last_modified_date': '2024-09-25',
 'page_number': 6}

In [16]:
from copy import deepcopy
from llama_index.core.schema import TextNode
from llama_index.core import VectorStoreIndex


def get_page_nodes(docs, separator="\n---\n"):
    """Split each document into page node, by separator."""
    nodes = []
    for doc in docs:
        doc_chunks = doc.text.split(separator)
        for doc_chunk in doc_chunks:
            node = TextNode(
                text=doc_chunk,
                metadata=deepcopy(doc.metadata),
            )
            nodes.append(node)

    return nodes

In [31]:
page_nodes = get_page_nodes(documents_update)

In [79]:
page_nodes[6].metadata

{'file_path': '/Users/luchaojin/Library/CloudStorage/OneDrive-Personal/RAG Product/voice-rag/data/CA-1039.pdf',
 'file_name': 'CA-1039.pdf',
 'file_type': 'application/pdf',
 'file_size': 986437,
 'creation_date': '2024-09-25',
 'last_modified_date': '2024-09-25',
 'page_number': 7}

In [91]:
print(page_nodes[6].get_content())

# FASTENER IDENTIFICATION

# HEAD MARKINGS

|STANDARD MAXIBOLT|OVERSIZE MAXIBOLT|MAXIBOLT PLUS|
|---|---|---|
|Sleeve Material Code|Sleeve Material Code| |
|Manufacturer’s Identification|Manufacturer’s Identification| |
| |Designates “Oversize”| |
|Grip Identification|Grip Identification| |

1. Single digit marking is permissible for grip dash numbers less than 10
2. Sleeve Material

# MAXIBOLT — STANDARD AND OVERSIZE

|Material Code|Sleeve Material|
|---|---|
|C|Stainless Steel A-286 CRES, AMS 5737|
|MV or 7774|CP Titanium per ASTM - B348, Gr. 1|
|none|4037 Alloy Steel ASTM-A-331|

# MAXIBOLT PLUS

|Product Number|Material|Head Style|
|---|---|---|
|CR7680S|Stainless Steel|100° Flush Head (AN509)|
|CR7683S|Stainless Steel|Protruding Head|
|CR7684S|Stainless Steel|130° Flush Head|
|CR7686S|Stainless Steel|100° Flush Head (MS20426)|
|CR7784S|Titanium|130° Flush Head|


In [22]:
from llama_index.core.node_parser import MarkdownElementNodeParser

node_parser = MarkdownElementNodeParser(
    llm=OpenAI(model="gpt-3.5-turbo-0125"), num_workers=4
)

In [33]:
nodes = node_parser.get_nodes_from_documents(documents_update)

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
2it [00:00, 39945.75it/s]
2024-09-25 22:46:51,452 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-25 22:46:51,661 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
3it [00:00, 35746.91it/s]
2024-09-25 22:46:52,906 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-25 22:46:52,964 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-25 22:46:53,495 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
1it [00:00, 10330.80it/s]
2024-09-25 22:46:54,927 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
3it [00:00, 36157.79it/s]
2024-09-25 22:46:56,227 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-25

In [67]:
nodes[20].metadata['table_df']

"{'Product Number': {0: 'CR7680S', 1: 'CR7683S', 2: 'CR7684S', 3: 'CR7686S', 4: 'CR7784S'}, 'Material': {0: 'Stainless Steel', 1: 'Stainless Steel', 2: 'Stainless Steel', 3: 'Stainless Steel', 4: 'Titanium'}, 'Head Style': {0: '100° Flush Head (AN509)', 1: 'Protruding Head', 2: '130° Flush Head', 3: '100° Flush Head (MS20426)', 4: '130° Flush Head'}}"

In [35]:
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

In [87]:
base_nodes[10].get_content()

'MAXIBOLT PLUS'

In [90]:
objects[4].metadata

{'file_path': '/Users/luchaojin/Library/CloudStorage/OneDrive-Personal/RAG Product/voice-rag/data/CA-1039.pdf',
 'file_name': 'CA-1039.pdf',
 'file_type': 'application/pdf',
 'file_size': 986437,
 'creation_date': '2024-09-25',
 'last_modified_date': '2024-09-25',
 'page_number': 7,
 'col_schema': 'Column: Product Number\nType: string\nSummary: None\n\nColumn: Material\nType: string\nSummary: None\n\nColumn: Head Style\nType: string\nSummary: None'}

In [39]:
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

In [42]:
# initialize client, setting path to save data
db = chromadb.PersistentClient(path="../swi_test")

# create collection
chroma_collection = db.get_or_create_collection("rag_swi_CA1039")

# assign chroma as the vector_store to the context
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

2024-09-25 22:52:09,249 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [43]:
# create index
index = VectorStoreIndex(nodes=base_nodes + objects + page_nodes,
                          storage_context=storage_context)

2024-09-25 22:52:14,524 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-09-25 22:52:15,398 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [71]:
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

reranker = FlagEmbeddingReranker(
    top_n=5,
    model="BAAI/bge-reranker-large",
)

  from .autonotebook import tqdm as notebook_tqdm
2024-09-25 23:06:45,400 - INFO - PyTorch version 2.2.2 available.


In [103]:
# create a query engine and query
query_engine = index.as_query_engine(
    similarity_top_k=5, verbose=True
)

In [104]:
response = query_engine.query("head style of CR7680S")
print(response)

2024-09-26 07:54:09,190 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-09-26 07:54:10,577 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The context does not provide specific information about the head style of CR7680S.


In [122]:
response.response

'The "MS21140U05*" is a part number. The "U" in the part number indicates a special nose. The "05" is likely related to the grip length dash number. However, the exact details of "05" are not provided in the context. The "*" could potentially represent a suffix. For instance, a "P" suffix to MS21140 indicates a cadmium plate, and a "D" suffix indicates an aluminum IVD coating.'

In [125]:
response.source_nodes

[NodeWithScore(node=TextNode(id_='df9fd999-2761-4654-94ec-e1a9676c8869', embedding=None, metadata={'file_path': '/Users/luchaojin/Library/CloudStorage/OneDrive-Personal/RAG Product/voice-rag/data/CA-1039.pdf', 'file_name': 'CA-1039.pdf', 'file_type': 'application/pdf', 'file_size': 986437, 'creation_date': '2024-09-25', 'last_modified_date': '2024-09-25', 'page_number': 6}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='1c863ccb-c28a-4ce7-8c87-59f6bf89fb3b', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/Users/luchaojin/Library/CloudStorage/OneDrive-Personal/RAG Product/voice-rag/data/CA-1039.pdf', 'file_name': 'CA-1039.pdf', 'file_type': 'application/pdf', 'file_size': 986437, 'creation_da

In [105]:
response = query_engine.query("Head Style of CR7680S")
print(response)

2024-09-26 07:54:10,843 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-09-26 07:54:14,788 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The head style of CR7680S is 100° Flush Head.


In [106]:
response = query_engine.query("Nose Piece Type of MS21140U05")
print(response)

2024-09-26 07:54:18,908 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-09-26 07:54:20,977 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The Nose Piece Type of MS21140U05 is U, which stands for Special Nose.


In [107]:
response = query_engine.query("nose piece type of MS21140U05")
print(response)

2024-09-26 07:54:35,839 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-09-26 07:54:37,611 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The nose piece type for MS21140U05 is 'U', which stands for Special Nose.


In [118]:
response = query_engine.query("MS21140U05*")
print(response)

2024-09-26 07:59:47,741 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-09-26 07:59:53,924 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The "MS21140U05*" is a part number. The "U" in the part number indicates a special nose. The "05" is likely related to the grip length dash number. However, the exact details of "05" are not provided in the context. The "*" could potentially represent a suffix. For instance, a "P" suffix to MS21140 indicates a cadmium plate, and a "D" suffix indicates an aluminum IVD coating.


In [135]:
for node in response.source_nodes:
    print(node.id_, node.get_content(), node.metadata.get('file_name'), node.metadata.get('page_number'))


df9fd999-2761-4654-94ec-e1a9676c8869 Basic Part Number (materials and head style)

Note: Refer to Cherry MAXIBOLT catalogue for complete list of finish alternates and codes. See page 6 for cross reference of Cherry MAXIBOLT and MS part numbers.

 MILITARY PART NUMBER IDENTIFICATION

MS90353 S 06 04 ( ) CA-1039.pdf 6
f1285d53-2aca-4c32-bd56-be8e4994b03b (1) Per NASM81177

(2) Per NASM8975 and Cherry Aerospace PS-CMB-7000 CA-1039.pdf 20
eb3d704b-bbb5-40d2-880b-9b7626be8a7a MAXIBOLT PLUS CA-1039.pdf 7
a3795d2b-7598-47f9-acef-74b5b3ef5eb1 * Grip Length dash number — see page XXXX

** Installation Riveter Codes: DA=Double Action, S=Single Action,

***B - Blunt type; U - Special Nose

P- Suffix to MS21140 and MS21141 indicates cadmium plate

D- Suffix to MS90353 and MS90354 indicates aluminum IVD coating

P- Suffix to Cherry Part No.for A286 CRES parts indicate cadmium plate

D- Suffix to Cherry indicates aluminum IVD coating;

Note: There is no cross-reference between the Cherry Titanium MA

In [78]:
response.source_nodes[4]

NodeWithScore(node=TextNode(id_='d0b236ca-e7da-4ae2-8726-916dda6cfdb4', embedding=None, metadata={'file_path': '/Users/luchaojin/Library/CloudStorage/OneDrive-Personal/RAG Product/voice-rag/data/CA-1039.pdf', 'file_name': 'CA-1039.pdf', 'file_type': 'application/pdf', 'file_size': 986437, 'creation_date': '2024-09-25', 'last_modified_date': '2024-09-25', 'page_number': 7}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='6764f192-4014-4a95-9af9-6e2e4b328a8f', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/Users/luchaojin/Library/CloudStorage/OneDrive-Personal/RAG Product/voice-rag/data/CA-1039.pdf', 'file_name': 'CA-1039.pdf', 'file_type': 'application/pdf', 'file_size': 986437, 'creation_dat