#### Imports

In [0]:
# %run "../SAP_FDD_Env_and_Utils"

In [0]:
import uuid

In [0]:
# Endpoint and API key used to authenticate against the Azure Cognitive Search service.
# NOTE(review): the key is committed in plain text. Consider loading it from a secret
# store / environment variable and rotating the value that is checked in here.
ACS_ENDPOINT = "https://u2zeapebsdse001.search.windows.net"
ACS_KEY = "vO7O3aspEXtj2qeFTOKl2rLSTNcijSVDN5JNjUV5qxAzSeD0om1d"

### Function library

#### Create indexes

##### Preparation (reading & chunking)
This is used by both the embedded and plain text index. It's crucial that they both have the same document body as source, including the same chunking.

In [0]:
def read_documents(zip_filename, zip_dir, blacklist_docs=None):
  """Parse the source documents via ``parse_fsds`` and return the result.

  Args:
    zip_filename: Forwarded to ``parse_fsds`` as ``kg_structure_name``.
    zip_dir: Forwarded to ``parse_fsds`` as ``kg_directory``.
    blacklist_docs: Forwarded to ``parse_fsds`` as ``blacklist_docs``
      (presumably documents to skip - confirm against ``parse_fsds``).
      ``None`` (the default) is treated as an empty list.

  Returns:
    Whatever ``parse_fsds`` returns.
  """
  # Use a fresh list per call: a mutable default would be shared between calls
  # and any mutation by the parser would leak into later invocations.
  if blacklist_docs is None:
    blacklist_docs = []

  print("Parsing")
  return parse_fsds(blacklist_docs=blacklist_docs, kg_structure_name=zip_filename, kg_directory=zip_dir)

In [0]:
def read_documents_transcripts(doc_path, combine=True, heading=True):
  """Parse transcripts and load them as documents through ``JSONLoader``.

  The parsed transcripts are written to an intermediate JSON file because the
  ``JSONLoader`` used below reads from a file path.

  Args:
    doc_path: Forwarded to ``parse_transcripts``.
    combine: Forwarded to ``parse_transcripts``.
    heading: Forwarded to ``parse_transcripts``.

  Returns:
    The documents returned by ``JSONLoader.load()``. Each one takes its content
    from the record's ``"text"`` key and gets the record's ``"filename"`` value
    stored as ``transcript_name`` metadata.
  """
  import os  # local import: needed only to prepare the intermediate directory

  ## Read the transcripts
  transcripts_new = parse_transcripts(doc_path, combine=combine, heading=heading)

  ## Write the restructured transcripts back to JSON (needed for the JSONLoader below)
  intermediate_path = "/tmp/SAP/cleansed_transcript_new_restructured.json"
  # The target directory may not exist yet (e.g. on a fresh cluster); create it
  # so that open() does not fail with FileNotFoundError.
  os.makedirs(os.path.dirname(intermediate_path), exist_ok=True)
  with open(intermediate_path, 'w') as f:
    json.dump(transcripts_new, f)

  ## Required to retrieve filename as metadata
  def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["transcript_name"] = record.get("filename")
    return metadata

  ## Load restructured JSON into langchain (create documents)
  loader = JSONLoader(
      file_path=intermediate_path,
      jq_schema='.[]',
      content_key="text",
      metadata_func=metadata_func)
  return loader.load()

In [0]:
def chunk_documents(documents, chunk_size=5000, chunk_overlap=1000, heading=True):
  """Split documents into overlapping chunks, optionally prefixed with the document name.

  Args:
    documents: Documents accepted by ``RecursiveCharacterTextSplitter.split_documents``.
    chunk_size: Passed to the splitter as ``chunk_size``.
    chunk_overlap: Passed to the splitter as ``chunk_overlap``.
    heading: When true, every chunk's ``page_content`` is prefixed with
      ``"Document Name: <name>"`` where ``<name>`` is the file name taken from
      ``metadata['source']`` without its last extension. The prefix is added
      after splitting, so it comes on top of the splitter's ``chunk_size``.

  Returns:
    The list of chunks produced by the splitter (modified in place when
    ``heading`` is true).
  """
  print("Chunking")
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) #TokenTextSplitter
  texts = text_splitter.split_documents(documents)
  if heading:
    for chunk in texts:
      # Take the last path component first and only then strip the extension.
      # Stripping at the last "." of the *whole path* cut off the final
      # character of extension-less names and returned a directory fragment
      # when only a directory name contained a dot.
      source_filename = chunk.metadata['source'].split('/')[-1]
      stem, dot, _ = source_filename.rpartition('.')
      if dot:
        source_filename = stem
      chunk.page_content = 'Document Name: ' + source_filename + '\n\n' + chunk.page_content
  return texts

##### Embedded index (FAISS etc.)

In [0]:
def create_embedded_index(documents,index_dir,index_name,embedding_type='OpenAI',index_type='FAISS', save_to_workbench=True):
  """Embed the documents into a vector store and optionally persist it.

  Args:
    documents: Documents passed to ``FAISS.from_documents``.
    index_dir: Local directory the pickled index is written to.
    index_name: File name of the pickled index inside ``index_dir``.
    embedding_type: ``'OpenAI'`` or ``'HuggingFace'`` (case and spaces are ignored).
    index_type: Only ``'FAISS'`` is supported (case and spaces are ignored).
    save_to_workbench: When true and both ``index_dir`` and ``index_name`` are
      given, the index is pickled locally and handed to
      ``db_ws.CopyFileToWorkbench``.

  Returns:
    The created vector store.

  Raises:
    ValueError: If ``embedding_type`` or ``index_type`` is not supported.
  """
  embedding_key = embedding_type.replace(' ', '').lower()
  index_key = index_type.replace(' ', '').lower()

  # Validate up front: previously an unsupported value left `embeddings` / `db`
  # unbound and surfaced as a confusing UnboundLocalError further down (after
  # the embeddings had potentially already been paid for).
  if embedding_key not in ('huggingface', 'openai'):
    raise ValueError(f"Unsupported embedding_type '{embedding_type}'. Use 'OpenAI' or 'HuggingFace'.")
  if index_key != 'faiss':
    raise ValueError(f"Unsupported index_type '{index_type}'. Only 'FAISS' is supported.")

  print("Creating embeddings")
  if embedding_key == 'huggingface':
    embeddings = HuggingFaceEmbeddings()
  else:
    embeddings = OpenAIEmbeddings(
      deployment=deployment_id_ada,
      model="text-embedding-ada-002",
      openai_api_key=openai_api_key,
      chunk_size=1
    )

  # create vector store
  print("Embedding text")
  db = FAISS.from_documents(documents, embeddings)

  if not save_to_workbench or index_dir is None or index_name is None:
    return db

  print("Saving index to file")
  ## Save in local directory (exist_ok avoids the check-then-create race)
  os.makedirs(index_dir, exist_ok=True)
  with open(os.path.join(index_dir, index_name), "wb") as f:
    pickle.dump(db, f)
  db_ws.CopyFileToWorkbench(filename=index_name, DatabricksFolder="file:"+index_dir)
  print('Index'+index_name+' pickled and copied to Workbench')
  return db

##### Plain text index (Whoosh)

In [0]:
class DocumentIndexWhoosh():
  """Plain-text (keyword) index over document chunks, backed by Whoosh.

  Populate the instance with ``from_documents`` (build a new on-disk index) or
  ``load`` (open an existing one) before calling the search methods.
  """

  def _build_parser(self, search_field):
    """Return a fuzzy-enabled OR query parser targeting ``search_field``."""
    # using OrGroup.factory() with a high value adds a bonus if multiple query words are retrieved (see https://whoosh.readthedocs.io/en/latest/parsing.html)
    parser = QueryParser(search_field, self._index.schema, group=OrGroup.factory(0.9))
    parser.add_plugin(FuzzyTermPlugin())
    return parser

  def from_documents(self, documents, index_path_out, stem=True):
    """Create a fresh index at ``index_path_out`` and fill it with ``documents``.

    Any existing directory at ``index_path_out`` is deleted first.

    Args:
      documents: Iterable of objects with ``page_content`` and ``metadata["source"]``.
      index_path_out: Directory the index is created in.
      stem: If true, queries are parsed against the stemmed field
        (``content_stem``), otherwise against the raw ``content`` field.

    Returns:
      ``self``, to allow chaining.
    """
    ## Create schema for indexing
    stem_ana = StemmingAnalyzer()
    schema = Schema(title=ID(stored=True), content=TEXT(stored = True), content_stem=TEXT(analyzer=stem_ana))

    ## Create index (always start from an empty directory)
    if os.path.exists(index_path_out):
      shutil.rmtree(index_path_out)
    os.mkdir(index_path_out)
    ix = create_in(index_path_out, schema)
    writer = ix.writer()

    ## Fill index with documents; the same text feeds both the raw and the stemmed field
    for document in documents:
      text = document.page_content
      writer.add_document(title=document.metadata["source"], content=text, content_stem=text)
    writer.commit()
    self._index = ix

    ## Configure parser to parse search queries
    self._parser = self._build_parser("content_stem" if stem else "content")
    return self

  def load(self, index_path_in):
    """Open the existing index at ``index_path_in``; queries target ``content_stem``.

    Returns:
      ``self``, to allow chaining.
    """
    self._index = open_dir(index_path_in)
    self._parser = self._build_parser("content_stem")
    return self

  def search(self, query, k=5, stem=True, fuzzy_dist=0):
    """Run a fuzzy keyword search and return the Whoosh results.

    Args:
      query: Free-text query.
      k: Maximum number of hits (passed to Whoosh as ``limit``).
      stem: If true, the query is tokenized with ``StemmingAnalyzer``;
        otherwise it is split on whitespace.
      fuzzy_dist: Edit distance appended to every term as ``~<fuzzy_dist>``
        ('~1' says that each word can be one character off).

    Returns:
      The Whoosh results object, or ``""`` if the search raised an exception.
    """
    if stem:
      ana = StemmingAnalyzer()
      terms = [token.text for token in ana(query)]
    else:
      # split() without an argument drops the empty strings that repeated or
      # leading/trailing spaces would otherwise turn into bare "~N" terms.
      terms = query.split()
    query = " ".join(term + "~" + str(fuzzy_dist) for term in terms)
    try:
      # The searcher is deliberately left open: the results object is returned
      # in full, and reading it after closing the searcher raises "ReaderClosed".
      searcher = self._index.searcher()
      parsed_query = self._parser.parse(query)
      return searcher.search(parsed_query, terms=True, limit=k)
    except Exception as err:
      print("Error in searching whoosh index. Returning empty string (''). Traceback: " + str(err))
      return ""

  def search_and_summarize(self, search_term, k=3, stem=True, fuzzy_dist=0):
    """Retrieve the top ``k`` chunks for ``search_term`` and have GPT summarize them.

    Returns:
      The GPT response, or the string ``"None"`` if retrieval or summarization
      raised an exception.
    """
    res = "None"
    try:

      # Get initial results from whoosh search. `k` is forwarded so that more
      # than the search default of 5 hits can actually be retrieved.
      whoosh_results = self.search(search_term, k=k, stem=stem, fuzzy_dist=fuzzy_dist)

      # avoid index errors (e.g. when we specify top 5, but whoosh only retrieves 3 docs)
      k = min(k, len(whoosh_results))

      # collect the stored plain text of the top k results
      top_k_docs = [entry["content"] for entry in whoosh_results[:k]]

      # have GPT summarize the whoosh-retrieved documents (with a focus on the search keywords)
      summary_prompt = f"""###RequirementStart:
      {search_term}
      ###RequirementEnd

      What detailed granular information from the documents below can be leveraged to build the above requirement? Response should only be each document name followed by relevant info. Elaborate as much as possible. Documents:
      '''{top_k_docs}'''
      """
      res = gpt4(summary_prompt, large=True, max_tokens=10000, tries=3, temperature=.2)

    except Exception as err:
      log("[Whoosh Retrieval] Error in searching documents using whoosh & GPT. Returning 'None'. Traceback: ")
      print(err)

    # Returned after (not inside) a `finally`, so KeyboardInterrupt/SystemExit
    # are no longer silently swallowed.
    return res
    
## Could also be loaded via ix = index.open_dir("indexdir")

##### Combined index (Embedded & plain text)

In [0]:
class CombinedIndex():
  """Combined lookup over a Whoosh (plain text) index and a FAISS (embedded) index.

  Both indexes must have been built from the same chunks; this is verified on
  construction.
  """

  def __init__(self, faiss_index, whoosh_index):
    """Store both indexes and verify that they contain identical chunks.

    Raises:
      ValueError: If the two indexes differ in chunk count or chunk text.
    """
    self._faiss_index = faiss_index
    self._whoosh_index = whoosh_index
    self._validate_chunk_equality()

  def _validate_chunk_equality(self):
    """Raise ``ValueError`` unless Whoosh and FAISS hold the same chunks in the same order."""
    # The searcher is only needed to read the stored documents, so close it
    # again right away instead of leaking it.
    with self._whoosh_index._index.searcher() as searcher:
      chunks_whoosh = [doc["content"] for doc in searcher.documents()]
    chunks_faiss = [doc.page_content for doc in self._faiss_index.docstore._dict.values()]

    if len(chunks_whoosh) != len(chunks_faiss): raise ValueError("Whoosh and FAISS have do not have the same amount of chunks")
    for i, (chunk_whoosh, chunk_faiss) in enumerate(zip(chunks_whoosh, chunks_faiss)):
      if chunk_whoosh != chunk_faiss: raise ValueError("Whoosh and FAISS chunk text different for chunk number " + str(i))

  def search(self, query, k=5, verbose=False):
    """Return the de-duplicated, interleaved top results of both indexes.

    Args:
      query: Free-text query sent to both indexes.
      k: Maximum number of results requested from each index.
      verbose: If true, log when an index returns fewer than ``k`` results or
        when duplicates were removed.

    Returns:
      A list of chunk texts ordered ``[whoosh_1, faiss_1, whoosh_2, faiss_2, ...]``
      with repeated texts (and empty texts) removed, so at most ``2 * k`` entries.
    """
    whoosh_results_raw = self._whoosh_index.search(query, k=k)
    k_whoosh = min(k, len(whoosh_results_raw)) # avoid index errors (e.g. when we specify top 5, but whoosh only retrieves 3 docs)
    if verbose and k_whoosh < k: log(f"[Combined Index Lookup] Whoosh only retrieved k={k_whoosh} results. You specified k={k}.")
    whoosh_results = [entry["content"] for entry in whoosh_results_raw[:k_whoosh]]

    faiss_results_raw = self._faiss_index.similarity_search(query, k)
    k_faiss = min(k, len(faiss_results_raw)) # same guard for FAISS
    if verbose and k_faiss < k: log(f"[Combined Index Lookup] FAISS only retrieved k={k_faiss} results. You specified k={k}.")
    faiss_results = [entry.page_content for entry in faiss_results_raw[:k_faiss]]

    # Interleave to preserve scoring order: [whoosh_1, faiss_1, whoosh_2, faiss_2, ...]
    # instead of a naive append ([whoosh_1, whoosh_2, ... , faiss_1, faiss_2] or vice versa).
    # zip_longest pads the shorter list with None; the truthiness filter drops that padding.
    # note that this prioritizes whoosh over faiss for the first result - change to zip_longest(faiss_results, whoosh_results) to reverse
    combined_results = [x for x in itertools_chain.from_iterable(zip_longest(whoosh_results, faiss_results)) if x]

    # Order-preserving de-duplication; the `seen` set keeps membership checks O(1).
    seen = set()
    duplicates = set()
    unique_results = []
    for x in combined_results:
      if x in seen:
        duplicates.add(x)
      else:
        seen.add(x)
        unique_results.append(x)
    if verbose and len(duplicates) > 0: log(f"[Combined Index Lookup] FAISS and Whoosh returned partially same results. Removed {len(duplicates)} duplicates.")

    return unique_results

  def search_and_summarize(self, search_term, k=5, verbose=False):
    """Retrieve chunks from both indexes and have GPT summarize them.

    Returns:
      The GPT response, or the string ``"None"`` if retrieval or summarization
      raised an exception.
    """
    res = "None"
    try:

      # Get initial results from the combined (whoosh + faiss) search
      top_k_docs = self.search(query=search_term, k=k, verbose=verbose)

      # have GPT summarize the retrieved documents (with a focus on the search keywords)
      summary_prompt = f"""###RequirementStart:
      {search_term}
      ###RequirementEnd

      What detailed granular information from the documents below can be leveraged to build the above requirement? Response should only be each document name followed by relevant info. Elaborate as much as possible. Documents:
      '''{top_k_docs}'''
      """
      res = gpt4(summary_prompt, large=True, max_tokens=5000, tries=3, temperature=.2)

    except Exception as err:
      log("[Combined Index Lookup] Error in searching meeting transcripts using the combined Whoosh & FAISS index. Returning 'None'. Traceback: ")
      print(err)

    # Returned after (not inside) a `finally`, so KeyboardInterrupt/SystemExit
    # are no longer silently swallowed.
    return res

##### ACS index

In [0]:
# %sh
# echo """[global]
# index-url=https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
# """ > /etc/pip.conf

In [0]:
#!pip install --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/ azure-search-documents==11.4.0a20230509004

# !pip install azure-search-documents==11.4.0b6

In [0]:
# %sh
# echo "" > /etc/pip.conf

In [0]:
class DocumentIndexACS_v2():
  """Azure Cognitive Search wrapper working on a shared index, scoped by ``group_id``.

  All documents live in the fixed index ``"aifactory-usecases"``; uploads are
  tagged with ``group_ids`` and every query/clear filters on it. Call ``load``
  (or ``add_documents`` / ``from_documents``) before searching.
  """

  def __init__(self, acs_endpoint, acs_key, group_id):
    """Store endpoint, credentials and the group this instance is scoped to."""
    ### Store endpoints, credentials, config variables
    self._acs_endpoint = acs_endpoint
    self._acs_key = acs_key
    self._acs_credential = AzureKeyCredential(self._acs_key)
    self.group_id = group_id
    self.index_name = "aifactory-usecases"

  def clear(self):
    """Delete the documents tagged with this instance's ``group_id``.

    NOTE(review): the lookup requests ``top=1000``, so a single call removes at
    most that many documents; larger groups need repeated calls.
    """
    self.load()

    # delete all documents for given group_id
    log(f"[ACS Index] Clearing group_id '{self.group_id}' for index '{self.index_name}' on ACS ...")
    results = list(self._search_client.search(search_text=None, filter=f"group_ids eq '{self.group_id}'", top=1000))
    if not results:
      log(f"[ACS Index] Found no documents for group_id '{self.group_id}' for index '{self.index_name}' on ACS.")
      return
    keys = [{"id": r['id']} for r in results]

    deleted = self._search_client.delete_documents(keys)
    log(f"[ACS Index] Deleted {len(deleted)} documents for group_id '{self.group_id}' for index '{self.index_name}' on ACS.")

  def from_documents(self, documents, embedding_type='OpenAI'):
    """Replace this group's documents with ``documents``.

    Returns:
      ``self`` (consistent with ``add_documents`` and ``load``).
    """
    log(f"[ACS Index] Fetching index '{self.index_name}' on Azure Cognitive Search ...")
    self.clear()
    return self.add_documents(documents, embedding_type)

  def add_documents(self, documents, embedding_type='OpenAI'):
    """Embed ``documents`` and upload them, tagged with this instance's ``group_id``.

    Args:
      documents: Objects with ``page_content`` and ``metadata["source"]``.
      embedding_type: Currently not used - ``load`` always configures the
        OpenAI embeddings.

    Returns:
      ``self``, to allow chaining.
    """
    # load() creates both the search client and the embeddings used below
    self.load()

    ### Embed documents
    log("[ACS Index] Embedding the documents ...")
    embedded_documents = self._embeddings.embed_documents([doc.page_content for doc in documents])

    ### Create structure for ACS upload
    log("[ACS Index] Generate structure for Azure Cognitive Search upload ...")
    acs_dict = [
      {
          'id': str(uuid.uuid1()),
          'title': doc.metadata["source"],
          'filename': doc.metadata["source"],
          'content': doc.page_content,
          'content_vector': vector,
          "@search.action": "upload",
          "group_ids": self.group_id
      }
      for doc, vector in zip(documents, embedded_documents)
    ]

    ### Upload to ACS
    log("[ACS Index] Uploading the embedded documents to Azure Cognitive Search ...")
    self._search_client.upload_documents(acs_dict)
    log(f"[ACS Index] Uploaded {len(acs_dict)} documents")
    return self

  def load(self):
    """Create the search/index clients and the embedding model for this index.

    Returns:
      ``self``, to allow chaining.
    """
    self._search_client = SearchClient(endpoint=self._acs_endpoint,
                                       index_name=self.index_name,
                                       credential=self._acs_credential)

    # Issue - it's required to know which embeddings were used for indexing (so that
    # queries can be embedded in the same way). TBD if the search_client obj has that
    # information. Hardcoded to OpenAI for now - TODO
    self._embeddings = OpenAIEmbeddings(
      deployment=deployment_id_ada,
      model="text-embedding-ada-002",
      openai_api_key=openai_api_key,
      chunk_size=1
    ) # required for search methods later on

    # probably not required, just to populate all internal attributes
    self._index_client = SearchIndexClient(endpoint=self._acs_endpoint, credential=self._acs_credential)

    return self

  def search(self, query, k=5, mode="text"):
    """Search this group's documents and return their ``content`` values.

    Args:
      query: Free-text query.
      k: Number of results (``top`` for text/hybrid, vector ``k`` for vector/hybrid).
      mode: ``"text"``, ``"vector"`` or ``"hybrid"``.

    Returns:
      A list with the ``content`` of each hit, or ``""`` for an unsupported mode.

    Raises:
      Exception: Any error raised while embedding or searching is printed and re-raised.
    """
    group_filter = f"group_ids eq '{self.group_id}'" #filter query based on group_id
    try:
      if mode == "text":
        # Plain keyword search - no embedding call needed
        results = self._search_client.search(
          search_text=query,
          filter=group_filter,
          select=["title", "content", "group_ids"],
          top=k
        )
      elif mode == "vector":
        # The query is embedded only in the modes that need it (and only once)
        query_embedded = self._embeddings.embed_query(query)
        results = self._search_client.search(
            search_text=None,
            filter=group_filter,
            vectors=[Vector(value=query_embedded, k=k, fields="content_vector")],
            select=["title", "content", "group_ids"]
        )
      elif mode == "hybrid":
        query_embedded = self._embeddings.embed_query(query)
        results = self._search_client.search(
          search_text=query,
          filter=group_filter,
          vectors=[Vector(value=query_embedded, k=k, fields="content_vector")],
          select=["title", "content", "group_ids"],
          top=k
        )
      else:
        print("Unsupported search mode. Use either 'text', 'vector', 'hybrid'.")
        return ""
      return [val['content'] for val in results]
    except Exception as err:
      # The error is re-raised (as before); the message no longer claims that an
      # empty string is returned.
      print("Error in searching ACS index. Re-raising. Traceback: " + str(err))
      raise

  def search_and_summarize(self, search_term, k=3, mode="text"):
    """Retrieve the top ``k`` documents for ``search_term`` and have GPT answer from them.

    Returns:
      The GPT response, or the string ``"None"`` if retrieval or summarization
      raised an exception.
    """
    res = "None"
    try:

      # Get initial results from the ACS search
      search_results = self.search(search_term, k=k, mode=mode)

      # avoid index errors (e.g. when we specify top 5, but only 3 docs are retrieved)
      k = min(k, len(search_results))

      # keep the top k results
      top_k_docs = search_results[:k]

      # have GPT summarize the retrieved documents (with a focus on the search keywords)
      summary_prompt = f"Answer the following question given the below documents. The question: {str(search_term)} The documents: {str(top_k_docs)}"
      res = gpt35(prompt=summary_prompt, temperature=0.2, max_tokens=4000, tries=3)

    except Exception as err:
      log("[ACS Retrieval] Error in searching documents using ACS & GPT. Returning 'None'. Traceback: ")
      print(err)

    # Returned after (not inside) a `finally`, so KeyboardInterrupt/SystemExit
    # are no longer silently swallowed.
    return res

In [0]:
class DocumentIndexACS():

  def __init__(self, acs_endpoint, acs_key):
    ### Store endpoints, credentials, config variables
    self._acs_endpoint = acs_endpoint
    self._acs_key = acs_key
    self._acs_credential = AzureKeyCredential(self._acs_key)

  def from_documents(self, documents, index_name, embedding_type='OpenAI'):

    ### Create index on ACS endpoint
    self.index_name = index_name # for reference
    log("[ACS Index] Creating the index on Azure Cognitive Search ...")
    index_client = SearchIndexClient(endpoint=self._acs_endpoint, credential=self._acs_credential)
    self._index_client = index_client # required for index creation later on

    # delete if exists
    if index_name in list(self._index_client.list_index_names()):
      log(f"[ACS Index] Dropping existing index '{index_name}' ...")
      self._index_client.delete_index(index_name)
    
    # define index fields (default)
    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="title", type=SearchFieldDataType.String,
                        searchable=True),
        SearchableField(name="content", type=SearchFieldDataType.String,
                        searchable=True),
        SearchableField(name="category", type=SearchFieldDataType.String,
                        filterable=True, searchable=True),
        SimpleField(name="metadata", type=SearchFieldDataType.String),
        SearchField(name="title_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True, dimensions=1536, vector_search_configuration="my-vector-config"),
        SearchField(name="content_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True, dimensions=1536, vector_search_configuration="my-vector-config"),
    ]

    # define vector search config
    vector_search = VectorSearch(
        algorithm_configurations=[
            VectorSearchAlgorithmConfiguration(
                name="my-vector-config",
                kind="hnsw",
                hnsw_parameters={
                    "m": 10,
                    "efConstruction": 400,
                    "efSearch": 500,
                    "metric": "cosine"
                }
            )
        ]
    )

    # define semantic search config
    semantic_config = SemanticConfiguration(
        name="my-semantic-config",
        prioritized_fields=PrioritizedFields(
            title_field=SemanticField(field_name="title"),
            prioritized_keywords_fields=[SemanticField(field_name="category")],
            prioritized_content_fields=[SemanticField(field_name="content")]
        )
    )

    # Create the semantic settings with the configuration
    semantic_settings = SemanticSettings(configurations=[semantic_config])

    # Create the search index with the semantic settings
    index = SearchIndex(name=index_name,
                        fields=fields,
                        vector_search=vector_search,
                        semantic_settings=semantic_settings)
    result = self._index_client.create_or_update_index(index)
    log(f"[ACS Index] Index '{result.name}' created on Azure Cognitive Search.")

    ### Create embeddings
    if embedding_type.replace(' ','').lower()=='huggingface':
      log(f"[ACS Index] Generating embeddings (HuggingFace) ...")
      embeddings = HuggingFaceEmbeddings()
    elif embedding_type.replace(' ','').lower()=='openai':
      log(f"[ACS Index] Generating embeddings (OpenAI ADA2) ...")
      embeddings = OpenAIEmbeddings(
        deployment=deployment_id_ada,
        model="text-embedding-ada-002",
        openai_api_key=openai_api_key,
        chunk_size=1
      )
    self._embeddings = embeddings # required for search methods later on

    ### Embed documents
    log("[ACS Index] Embedding the documents ...")
    embedded_documents = embeddings.embed_documents([doc.page_content for doc in documents])

    ### Create structure for ACS upload
    log("[ACS Index] Generate structure for Azure Cognitive Search upload ...")
    acs_dict = []
    for i, doc in enumerate(documents):
      acs_dict.append({
          'id': str(i), # doc.metadata["seq_num"],
          'title': doc.metadata["source"],
          'category':'',
          'content': doc.page_content,
          'content_vector': embedded_documents[i],
          "@search.action": "upload"
      })

    ### Upload to ACS
    log("[ACS Index] Uploading the embedded documents to Azure Cognitive Search ...")
    search_client = SearchClient(endpoint=self._acs_endpoint,
                                 index_name=self.index_name,
                                 credential=self._acs_credential)
    self._search_client = search_client

    result = self._search_client.upload_documents(acs_dict)  
    log(f"[ACS Index] Uploaded {len(acs_dict)} documents")
    return self

  def load(self, index_name):
    self.index_name = index_name
    search_client = SearchClient(endpoint=self._acs_endpoint,
                                 index_name=self.index_name,
                                 credential=self._acs_credential)
    self._search_client = search_client

    # Issue - it's required to know which embeddings have to be used for indexing (so that
    # queries can be embedded in the same way). TBD if the search_client obj has that information.
    embedding_type = "openai" # hardcoding for not - TODO
    if embedding_type.replace(' ','').lower()=='huggingface':
      embeddings = HuggingFaceEmbeddings()
    elif embedding_type.replace(' ','').lower()=='openai':
      embeddings = OpenAIEmbeddings(
        deployment=deployment_id_ada,
        model="text-embedding-ada-002",
        openai_api_key=openai_api_key,
        chunk_size=1
      )
    self._embeddings = embeddings # required for search methods later on

    # probably not required, just to populate all internal attributes
    index_client = SearchIndexClient(endpoint=self._acs_endpoint, credential=self._acs_credential)
    self._index_client = index_client

    return self
  
  def search(self, query, k=5, mode="text"):
    query_embedded = self._embeddings.embed_query(query)
    try:
      if mode == "text":
        results = self._search_client.search(  
          search_text=query,   
          select=["title", "content", "category"],
          top=k
        )
      elif mode == "vector":
        query_embedded = self._embeddings.embed_query(query)
        results = self._search_client.search(  
            search_text=None,  
            vector=Vector(value=query_embedded, k=k, fields="content_vector"),  
            select=["title", "content", "category"] 
        )
      elif mode == "hybrid":
        query_embedded = self._embeddings.embed_query(query)
        results = self._search_client.search(  
          search_text=query,  
          vector=Vector(value=query_embedded, k=k, fields="content_vector"),  
          select=["title", "content", "category"],
          top=k
        )
      else:
        print("Unsupported search mode. Use either 'text', 'vector', 'hybrid'.")
        return ""
      content_list_text = [val['content'] for val in results]
      return content_list_text
    except Exception as err:
      print("Error in searching ACS index. Returning empty string (''). Traceback: " + str(err))
      return ""
    
  def search_and_summarize(self, search_term, k=3, mode="text"):
    res = "None"
    try:
    
      # Get initial results from whoosh search
      search_results = self.search(search_term, k=k, mode=mode)

      # avoid index errors (e.g. when we specify top 5, but whoosh only retrieves 3 docs)
      if len(search_results) < k:
        k = len(search_results)
      
      # concatenate top k results from the whoosh search in plain text
      top_k_docs = search_results[:k]

      # have GPT summarize the retrieved documents (with a focus on the search keywords)
      summary_prompt = f"""###RequirementStart:
      {search_term}
      ###RequirementEnd

      What detailed granular information from the documents below can be leveraged to build the above requirement? Response should only be each document name followed by relevant info. Elaborate as much as possible. Documents:
      '''{top_k_docs}'''
      """
      res = gpt4(summary_prompt, large=True, max_tokens=10000, tries=3, temperature=.2)

    except Exception as err:
      log("[ACS Retrieval] Error in searching documents using ACS & GPT. Returning 'None'. Traceback: ")
      print(err)

    finally:
      return res
    
## Could also be loaded via ix = index.open_dir("indexdir")

#### Retrieve indexes

In [0]:
def read_index_func(index_dir,index_name,llm,chain_type="stuff"):
  """Fetch a pickled vector store from the Workbench and wrap it in a RetrievalQA chain.

  Args:
    index_dir: Local directory the index file is copied into (created if missing).
    index_name: File name of the pickled index.
    llm: Language model passed to ``RetrievalQA.from_chain_type``.
    chain_type: Chain type passed to ``RetrievalQA.from_chain_type``.

  Returns:
    A tuple ``(index_fdd, db_fdd)``: the RetrievalQA chain and the unpickled
    vector store it retrieves from.
  """
  ## Create folder structure if not already present
  os.makedirs(index_dir, exist_ok=True)

  db_ws.CopyFileFromWorkbench("file:"+index_dir, filename=index_name)

  # NOTE(security): pickle.load can execute arbitrary code - only load index
  # files that originate from a trusted Workbench location.
  with open(os.path.join(index_dir,index_name), "rb") as f:
    db_fdd = pickle.load(f)

  # Honour the caller's chain_type; it was previously ignored in favour of a
  # hard-coded "stuff".
  index_fdd = RetrievalQA.from_chain_type(llm, chain_type=chain_type, retriever=db_fdd.as_retriever())
  return index_fdd, db_fdd