In [None]:
! pip install llama-index llama-index-vector-stores-vertexaivectorsearch llama-index-llms-vertex llama-index-storage-docstore-firestore
! pip install --upgrade google-cloud-documentai

In [None]:
# https://cloud.google.com/vertex-ai/docs/vector-search/quickstart#enable-apis
! gcloud services enable compute.googleapis.com aiplatform.googleapis.com storage.googleapis.com --project ai-sandbox-company-73

In [1]:
import logging
import os
import sys
import yaml
from google.cloud import aiplatform
from vector_search_utils import get_or_create_existing_index
from docai_parser import DocAIParser
from google.oauth2 import service_account
from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex
from llama_index.vector_stores.vertexaivectorsearch import VertexAIVectorStore
from llama_index.storage.docstore.firestore import FirestoreDocumentStore
from llama_index.embeddings.vertex import VertexTextEmbedding
from llama_index.llms.vertex import Vertex
# Add the common directory to the system path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../')))
from common.utils import (
    create_pdf_blob_list,
    download_bucket_with_transfer_manager,
    link_nodes,
)

In [2]:
# Route INFO-and-above records through the root logger's default handler.
logging.basicConfig(level="INFO")
# Logger for messages emitted by the notebook cells themselves.
logger = logging.getLogger(name=__name__)

In [3]:
# Load configuration from config.yaml
def load_config(config_path=None):
    """Load the notebook settings from a YAML file.

    Args:
        config_path: Optional path to the YAML file. When omitted, the file
            is read from ``../../common/config.yaml`` relative to the current
            working directory, which is the location the function has always
            used.

    Returns:
        The object produced by ``yaml.safe_load`` for the file contents.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file does not contain valid YAML.
    """
    if config_path is None:
        # A notebook has no __file__, and os.path.dirname('') is always '',
        # so the default location is simply relative to the working directory.
        config_path = os.path.join("..", "..", "common", "config.yaml")
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(config_path, encoding="utf-8") as config_file:
        return yaml.safe_load(config_file)

In [4]:
# Load configuration
config = load_config()


# Initialize parameters
# Keys read with config[...] are mandatory (a missing key raises KeyError).
# Keys read with config.get(...) are optional and fall back to the given
# default, or to None when no default is given.
PROJECT_ID = config["project_id"]
LOCATION = config["location"]
INPUT_BUCKET_NAME = config["input_bucket_name"]
DOCSTORE_BUCKET_NAME = config["docstore_bucket_name"]
INDEX_ID = config["index_id"]
VECTOR_INDEX_NAME = config["vector_index_name"]
INDEX_ENDPOINT_NAME = config["index_endpoint_name"]
INDEXING_METHOD = config["indexing_method"]
CHUNK_SIZES = config["chunk_sizes"]
EMBEDDINGS_MODEL_NAME = config["embeddings_model_name"]
LLM_MODEL_NAME = config["llm_model_name"]
APPROXIMATE_NEIGHBORS_COUNT = config["approximate_neighbors_count"]
BUCKET_PREFIX = config["bucket_prefix"]
VECTOR_DATA_PREFIX = config["vector_data_prefix"]
CHUNK_SIZE = config.get("chunk_size", 512)
CHUNK_OVERLAP = config.get("chunk_overlap", 50)
DOCAI_LOCATION = config["docai_location"]
DOCAI_PROCESSOR_DISPLAY_NAME = config["document_ai_processor_display_name"]
DOCAI_PROCESSOR_ID = config.get("docai_processor_id")
CREATE_DOCAI_PROCESSOR = config.get("create_docai_processor", False)
FIRESTORE_DB_NAME = config.get("firestore_db_name")
FIRESTORE_NAMESPACE = config.get("firestore_namespace")
QA_INDEX_NAME = config.get("qa_index_name")
QA_ENDPOINT_NAME = config.get("qa_endpoint_name")
# Document AI output location inside the docstore bucket.
GCS_OUTPUT_PATH = f"gs://{DOCSTORE_BUCKET_NAME}/{VECTOR_DATA_PREFIX}/docai_output/"
GOOGLE_CREDENTIAL_PATH = config.get("credential")

# Google Service Account credentials
# Fail early with a readable message: without this check a missing
# "credential" entry only surfaces as a TypeError raised inside os.path.join.
if not GOOGLE_CREDENTIAL_PATH:
    raise ValueError(
        "config.yaml must define 'credential': the path of the service "
        "account key file, relative to two directories above the notebook's "
        "working directory"
    )
# The key file path is resolved two directories above the working directory.
google_credential_path = os.path.join("..", "..", GOOGLE_CREDENTIAL_PATH)
google_credential = service_account.Credentials.from_service_account_file(google_credential_path)

# Main

In [5]:
# Point the Vertex AI SDK at the configured project and region.
aiplatform.init(location=LOCATION, project=PROJECT_ID)

# Resolve the Vector Search index and its endpoint. The recorded output shows
# an existing deployed pair being reused; the helper's name suggests it
# creates them when they are absent (confirm in vector_search_utils).
vs_index, vs_endpoint = get_or_create_existing_index(
    VECTOR_INDEX_NAME,
    INDEX_ENDPOINT_NAME,
    APPROXIMATE_NEIGHBORS_COUNT,
    google_credential,
)

Found existing index: index_hierarchical
Found existing endpoint: index_endpoint
Using existing deployed index and endpoint


In [6]:
# Vertex AI Vector Search Vector DB and Firestore Docstore
vector_store = VertexAIVectorStore(
    project_id=PROJECT_ID,
    region=LOCATION,
    index_id=vs_index.name,  # Use .name instead of .resource_name as it contains the full path
    endpoint_id=vs_endpoint.name,  # Use .name instead of .resource_name
    gcs_bucket_name=DOCSTORE_BUCKET_NAME,
    credentials_path=google_credential_path
)
# TODO: Add service account credentials for Firestore
docstore = FirestoreDocumentStore.from_database(
    project=PROJECT_ID, database=FIRESTORE_DB_NAME, namespace=FIRESTORE_NAMESPACE
)

# Setup embedding model and LLM
embed_model = VertexTextEmbedding(
    model_name=EMBEDDINGS_MODEL_NAME, project=PROJECT_ID, location=LOCATION, credentials=google_credential
)
# The LLM receives the same project, region and service-account credentials as
# the embedding model, so both Vertex AI clients authenticate the same way
# instead of the LLM silently falling back to ambient default credentials.
llm = Vertex(
    model=LLM_MODEL_NAME,
    temperature=0.0,
    project=PROJECT_ID,
    location=LOCATION,
    credentials=google_credential,
)
# Register both models as the llama_index-wide defaults.
Settings.llm = llm
Settings.embed_model = embed_model

# Initialise Document AI parser
# NOTE(review): DOCAI_PROCESSOR_ID is read with config.get(), so it can be
# None, in which case the processor name below ends in "/None" - confirm the
# config always provides it.
parser = DocAIParser(
    project_id=PROJECT_ID,
    location=DOCAI_LOCATION,
    processor_name=f"projects/{PROJECT_ID}/locations/{DOCAI_LOCATION}/processors/{DOCAI_PROCESSOR_ID}",  # noqa: E501
    gcs_output_path=GCS_OUTPUT_PATH
)

# Prepare a local mirror directory named after the bucket prefix.
local_data_path = os.path.join("/tmp", BUCKET_PREFIX)
os.makedirs(local_data_path, exist_ok=True)

# List the input objects first; this list is what the Document AI parser
# consumes later.
blobs = create_pdf_blob_list(INPUT_BUCKET_NAME, BUCKET_PREFIX)

# Copy the objects under the prefix to the local directory.
# NOTE(review): the recorded output shows files landing in
# /tmp/<prefix>/<prefix>/..., i.e. the prefix appears twice - confirm whether
# destination_directory should already contain BUCKET_PREFIX.
logger.info("downloading data")
download_bucket_with_transfer_manager(
    INPUT_BUCKET_NAME,
    prefix=BUCKET_PREFIX,
    destination_directory=local_data_path,
)

# Parse documents using Document AI
try:
    parsed_docs, raw_results = parser.batch_parse(
        blobs, chunk_size=CHUNK_SIZE, include_ancestor_headings=True
    )
    print(f"Number of documents parsed by Document AI: {len(parsed_docs)}")
    if parsed_docs:
        print(
            f"First parsed document text (first 100 chars): {parsed_docs[0].text[:100]}..."
        )
    else:
        print("No documents were parsed by Document AI")

    # Print raw results for debugging
    print("Raw results from Document AI:")
    for result in raw_results:
        print(f"  Source: {result.source_path}")
        print(f"  Parsed: {result.parsed_path}")

except Exception as e:
    # Best-effort: the notebook carries on with no documents, but the full
    # traceback is logged so the failure can still be diagnosed (printing
    # only str(e) discarded it).
    logger.exception("Error parsing documents: %s", e)
    parsed_docs = []
    raw_results = []

# Turn each parsed document into llama_index Document
# (an empty list when parsing failed above).
li_docs = [Document(text=doc.text, metadata=doc.metadata) for doc in parsed_docs]


INFO:common.utils:<google.api_core.page_iterator.HTTPIterator object at 0x17964f510>
INFO:__main__:downloading data
INFO:common.utils:Failed to download raw_dataset/ due to exception: [Errno 21] Is a directory: '/tmp/raw_dataset/raw_dataset/'
INFO:common.utils:Downloaded raw_dataset/CP 83-1-2004 (2015) CoP for construction CAD - Organisation n naming of CAD layers.pdf to /tmp/raw_dataset/raw_dataset/CP 83-1-2004 (2015) CoP for construction CAD - Organisation n naming of CAD layers.pdf.
INFO:common.utils:Downloaded raw_dataset/CP 83-2-2000 (2015)+Amd1 CoP for CAD - CAD symbols.pdf to /tmp/raw_dataset/raw_dataset/CP 83-2-2000 (2015)+Amd1 CoP for CAD - CAD symbols.pdf.
INFO:common.utils:Downloaded raw_dataset/CP 83-3-2001 (2015) CoP for CAD - Organising n naming of CAD files.pdf to /tmp/raw_dataset/raw_dataset/CP 83-3-2001 (2015) CoP for CAD - Organising n naming of CAD files.pdf.
INFO:common.utils:Downloaded raw_dataset/CP 83-4-2001 (2015) CoP for construction CAD - CAD drafting conventi

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'mimetype', 'path']
Batch process started. Operation: <google.api_core.operation.Operation object at 0x1795ff310>
Number of operations started: 1
Operations completed successfully.
Operation 1 metadata: state: SUCCEEDED
create_time {
  seconds: 1730792668
  nanos: 672146000
}
update_time {
  seconds: 1730793390
  nanos: 441078000
}
individual_process_statuses {
  input_gcs_source: "gs://ai-sandbox-jacobs_dataset/raw_dataset/CP 83-1-2004 (2015) CoP for construction CAD - Organisation n naming of CAD layers.pdf"
  status {
  }
  output_gcs_destination: "gs://ai-sandbox-jacobs_rag-db/vector_data/docai_output/7071713667348882830/0"
  h