#### Imports

In [0]:
%run "./SAP_FDD_Env_and_Utils"

In [0]:
%run "./SAP_FDD_Indexing"

### Generate indexes for FSDs

In [0]:
# Read the FSD documents (zip_filename='FSD' under '/SAP_KG/').
# presumably `blacklist_docs` names files to skip — verify against read_documents
fsds = read_documents(
    zip_filename='FSD',
    zip_dir='/SAP_KG/',
    blacklist_docs=['PM_PM_TSD_I552_Notification Created Message.docx'],
)

In [0]:
# Display the first loaded FSD entry as a quick sanity check.
fsds[0]

In [0]:
# Chunk the FSD documents with overlap between consecutive chunks.
# NOTE(review): the unit of chunk_size/chunk_overlap (characters vs. tokens) is
# defined by chunk_documents — TODO confirm.
fsds_chunked = chunk_documents(
    fsds,
    chunk_size=5000,
    chunk_overlap=1000,
    heading=True,
)

In [0]:
# index_fsd_faiss = create_embedded_index(fsds_chunked,
#                                         index_dir = '/tmp/SAP/',
#                                         index_name = "indexstore_FSD_OpenAI_v1_chunked.pkl",
#                                         embedding_type = 'OpenAI',
#                                         index_type = 'FAISS',
#                                         save_to_workbench = True)

In [0]:
# index_fsd_whoosh = DocumentIndexWhoosh().from_documents(fsds_chunked, index_path_out="/tmp/SAP/index_fsd_whoosh")

In [0]:
# index_fsd_combined = CombinedIndex(index_fsd_faiss, index_fsd_whoosh)

### Generate indexes for BPDs

In [0]:
# Read the BPD documents (zip_filename='BPD' under '/SAP_KG/'); no blacklist here.
bpds = read_documents(
    zip_filename='BPD',
    zip_dir='/SAP_KG/',
)

In [0]:
# Chunk the BPD documents with the same settings used for the FSDs.
bpds_chunked = chunk_documents(
    bpds,
    chunk_size=5000,
    chunk_overlap=1000,
    heading=True,
)

In [0]:
# index_bpd_faiss = create_embedded_index(bpds_chunked,
#                                         index_dir = '/tmp/SAP/',
#                                         index_name = 'indexstore_BPD_OpenAI_v3_chunked.pkl',
#                                         embedding_type = 'OpenAI',
#                                         index_type = 'FAISS',
#                                         save_to_workbench = True)

In [0]:
# index_bpd_whoosh = DocumentIndexWhoosh().from_documents(bpds_chunked, index_path_out="/tmp/SAP/index_bpd_whoosh")

In [0]:
# index_bpd_combined = CombinedIndex(index_bpd_faiss, index_bpd_whoosh)

### Generate indexes for Transcripts

In [0]:
# Read the cleansed transcript text file.
# NOTE(review): the effect of combine/heading is defined by
# read_documents_transcripts — TODO confirm.
transcripts = read_documents_transcripts(
    doc_path="/tmp/Input_SAP/cleansed_transcript_new1.txt",
    combine=True,
    heading=True,
)

In [0]:
# Chunk the transcripts; unlike the FSD/BPD cells, no `heading` argument is passed,
# so chunk_documents uses its default for it.
transcripts_chunked = chunk_documents(
    transcripts,
    chunk_size=5000,
    chunk_overlap=1000,
)

In [0]:
# index_transcripts_faiss = create_embedded_index(transcripts_chunked,
#                                         index_dir = '/tmp/SAP/',
#                                         index_name = 'indexstore_Transcripts_OpenAI_v6_chunked.pkl',
#                                         embedding_type = 'OpenAI',
#                                         index_type = 'FAISS',
#                                         save_to_workbench = False)

In [0]:
# Option 1 (disabled): build the Whoosh index from the transcript chunks
# (presumably persisted at index_path_out — TODO confirm).
#index_transcripts_whoosh = DocumentIndexWhoosh().from_documents(transcripts_chunked, index_path_out="/tmp/SAP/index_transcripts_whoosh")
# Option 2 (active): load the index from the same path; this presumably requires
# that option 1 was run at least once on this machine.
index_transcripts_whoosh = DocumentIndexWhoosh().load("/tmp/SAP/index_transcripts_whoosh")

In [0]:
# Combine the FAISS and Whoosh transcript indexes into one CombinedIndex.
# NOTE(review): `index_transcripts_faiss` is only assigned in a commented-out cell
# of this notebook; unless a %run notebook defines it, this raises NameError on a
# fresh kernel — TODO restore the FAISS build/load step or confirm its origin.
index_transcripts_combined = CombinedIndex(index_transcripts_faiss, index_transcripts_whoosh)

### Azure Cognitive Search Test

In [0]:
# Populate the "uc-sap-index-transcripts" ACS index from the transcript chunks.
index_transcripts_acs = (
    DocumentIndexACS_v2(ACS_ENDPOINT, ACS_KEY, "uc-sap-index-transcripts")
    .from_documents(transcripts_chunked)
)

In [0]:
# Populate the "uc-sap-index-fsd" ACS index.
# Only the first 1000 FSD chunks are sent (slice below); the rest are not indexed.
index_fsd_acs = (
    DocumentIndexACS_v2(ACS_ENDPOINT, ACS_KEY, "uc-sap-index-fsd")
    .from_documents(fsds_chunked[:1000])
)

In [0]:
# Populate the "uc-sap-index-bpd" ACS index from all BPD chunks.
index_bpd_acs = (
    DocumentIndexACS_v2(ACS_ENDPOINT, ACS_KEY, "uc-sap-index-bpd")
    .from_documents(bpds_chunked)
)

### Test indexes

In [0]:
# Smoke test: query the ACS transcript index for "ArcGIS" with mode="hybrid"
# and print whatever search_and_summarize returns.
res = index_transcripts_acs.search_and_summarize("ArcGIS", mode="hybrid")
print(res)

In [0]:
# Smoke test against the combined FSD index.
# NOTE(review): `index_fsd_combined` is only assigned in a commented-out cell of
# this notebook; unless a %run notebook defines it, this raises NameError on a
# fresh kernel — TODO confirm.
res = index_fsd_combined.search_and_summarize("ArcGIS", verbose=True)
print(res)

In [0]:
# Smoke test against the combined BPD index.
# NOTE(review): `index_bpd_combined` is only assigned in a commented-out cell of
# this notebook; unless a %run notebook defines it, this raises NameError on a
# fresh kernel — TODO confirm.
res = index_bpd_combined.search_and_summarize("ArcGIS", verbose=True)
print(res)

In [0]:
# Smoke test against the combined (FAISS + Whoosh) transcript index.
res = index_transcripts_combined.search_and_summarize("ArcGIS", verbose=True)
print(res)

In [0]:
# Show the raw Whoosh search result for the same query (no summarisation step).
index_transcripts_whoosh.search("ArcGIS")

In [0]:
# Open "uc-sap-index-fsd" through DocumentIndexACS and list the index names the
# underlying index client reports.
# NOTE(review): this uses DocumentIndexACS while the indexes above were created with
# DocumentIndexACS_v2, and it reaches into the private `_index_client` attribute —
# confirm both are intentional.
index = DocumentIndexACS(ACS_ENDPOINT, ACS_KEY).load("uc-sap-index-fsd")
list(index._index_client.list_index_names())

In [0]:
# Exploration aid: print the API help for the private index client.
# Produces long output; consider removing before sharing the notebook.
help(index._index_client)

In [0]:
# Exploration aid: print the API help for the private search client.
# Produces long output; consider removing before sharing the notebook.
help(index._search_client)

In [0]:
# Fetch the document stored under key "1" to inspect its fields.
index._search_client.get_document("1")

In [0]:
# Probe document keys "1".."99" and print "x" for each one whose title is the
# placeholder "Title" (the delete call is left disabled).
# NOTE(review): the key range is hard-coded, and get_document presumably raises if a
# key does not exist — TODO confirm the index holds all of these keys.
# NOTE(review): `doc` stays defined after this loop and may be what the playground
# cells further down rely on.
for i in range(1, 100):
  doc = index._search_client.get_document(str(i))
  if doc["title"] == "Title":
    # index._search_client.delete_documents([{"title": "Title"}])
    print("x")

In [0]:
# Number of documents currently reported by the index.
index._search_client.get_document_count()

In [0]:
# Attempt to delete documents whose title is the placeholder "Title".
# NOTE(review): assuming `_search_client` is an azure.search.documents SearchClient,
# delete_documents identifies documents by the index KEY field and ignores other
# fields; a payload that carries only "title" will not delete by title unless
# "title" is the key — TODO confirm and pass the document keys instead.
index._search_client.delete_documents([{"title": "Title"}])

In [0]:
# Count the results returned for an empty search string by materialising them.
# NOTE(review): presumably a cross-check for get_document_count(); whether an empty
# query returns every document depends on the service/client — TODO confirm.
len(list(index._search_client.search("")))

### Generate index for images

In [0]:
# import zipfile
# import os
# from databricks_sdk_v2.databricks_ws import *
# import requests
# import json
# import time
# import random
# from IPython.display import Image 
# from IPython.core.display import HTML
# import pandas as pd
# import re
# import shutil
# from langchain.embeddings import OpenAIEmbeddings
# from langchain.document_loaders import DirectoryLoader
# from langchain.text_splitter import TokenTextSplitter

import docx2txt
import azure.ai.vision as sdk

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from langchain.retrievers import AzureCognitiveSearchRetriever
from langchain.document_loaders import TextLoader
from langchain.vectorstores import AzureSearch

In [0]:
# Locations used by the image-extraction cells in this section.
filename_zip = 'BPD.zip'                      # archive copied from the workbench
Folder1 = '/tmp/bpd/'                         # local folder the archive is copied into
zip_file_path = Folder1 + filename_zip        # full local path of the archive
output_directory = '/tmp/bpd_docs/'           # .docx files are extracted here
parent_directory_name = '/tmp/bpd_images'     # per-document image folders are created here, then flattened

In [0]:
def extract_docx_files(zip_file, output_directory):
    """Extract every ``.docx`` member of a zip archive.

    Args:
        zip_file: Path of the zip archive to read.
        output_directory: Folder to extract into; created when it does not exist.

    Returns:
        A list with one path per extracted member, in archive order. Each path
        is ``output_directory`` joined with the member name as stored in the
        archive, so any sub-folders in the archive are kept.
    """
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    extracted_paths = []
    with zipfile.ZipFile(zip_file, 'r') as archive:
        # The extension check is case-insensitive, so ".DOCX" members are included.
        docx_members = [
            member_name
            for member_name in archive.namelist()
            if member_name.lower().endswith(".docx")
        ]
        for member_name in docx_members:
            archive.extract(member_name, output_directory)
            extracted_paths.append(os.path.join(output_directory, member_name))
    return extracted_paths
  
def flatten(directory):
    """Move every file below ``directory`` into ``directory`` and remove the emptied sub-folders.

    The tree is walked bottom-up so each sub-folder is empty by the time it is
    removed. If a file name is already taken in ``directory``, the moved file
    gets a numeric suffix before its extension (``name_1.ext``, ``name_2.ext``, ...).
    Files that already sit directly in ``directory`` are left untouched.

    Args:
        directory: Root folder to flatten (with or without a trailing slash).

    Side effects:
        Moves files, deletes sub-folders and prints one line per move/delete.
    """
    root = os.path.abspath(directory)
    for dirpath, _, filenames in os.walk(directory, topdown=False):
        in_root = os.path.abspath(dirpath) == root
        if in_root:
            # Files here are already in place. Without this guard each one would
            # "collide" with itself and be renamed with a numeric suffix.
            continue
        for filename in filenames:
            source = os.path.join(dirpath, filename)
            target = os.path.join(directory, filename)
            stem, extension = os.path.splitext(filename)
            i = 0
            # Find the first free name in the root folder.
            while os.path.exists(target):
                i += 1
                target = os.path.join(directory, stem + "_" + str(i) + extension)
            shutil.move(source, target)
            print("Moved ", source, " to ", target)
        os.rmdir(dirpath)
        print("Deleted ", dirpath)

def create_directories(parent_directory, num_subdirs=None):
    """Create ``parent_directory`` and numbered sub-folders ``dir_0`` .. ``dir_<n-1>`` inside it.

    Args:
        parent_directory: Folder to create. It must not exist yet: if creating it
            fails (including because it already exists), the error is printed and
            no sub-folders are created.
        num_subdirs: Number of sub-folders to create. When omitted, the length of
            the notebook-level list ``docx_files_underscore`` is used, which is
            what this function always did before the parameter existed.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    try:
        os.makedirs(parent_directory)
        print(f"Parent directory '{parent_directory}' created successfully.")
    except OSError as e:
        print(f"Error creating parent directory '{parent_directory}': {e}")
        return

    if num_subdirs is None:
        # Backward-compatible default: one sub-folder per extracted document.
        num_subdirs = len(docx_files_underscore)

    for i in range(num_subdirs):
        dir_name = os.path.join(parent_directory, f"dir_{i}")
        try:
            os.makedirs(dir_name)
            print(f"Subdirectory '{dir_name}' created successfully.")
        except OSError as e:
            # Keep going: one failed sub-folder should not block the others.
            print(f"Error creating subdirectory '{dir_name}': {e}")

In [0]:
# Make sure the local download folder exists before copying the archive into it.
if not os.path.exists(Folder1):
    os.makedirs(Folder1)
# presumably copies `filename_zip` from the workbench into the "file:"-prefixed
# local folder — verify against db_ws.CopyFileFromWorkbench
db_ws.CopyFileFromWorkbench(DatabricksFolder="file:"+Folder1, filename=filename_zip)

In [0]:
# Extract all .docx members of the archive; `docx_files` holds their extracted paths.
docx_files = extract_docx_files(zip_file_path, output_directory)

In [0]:
# File names of the extracted documents with the directory part stripped.
docx_files_base = [os.path.basename(docx_path) for docx_path in docx_files]

# The same names with spaces replaced by underscores; used later as the prefix
# of each extracted image file.
docx_files_underscore = [base_name.replace(" ", "_") for base_name in docx_files_base]

In [0]:
# Start from a clean image tree: drop any previous output (errors ignored), then
# create the parent folder plus one dir_<i> sub-folder per extracted document.
shutil.rmtree(parent_directory_name,ignore_errors=True)
create_directories(parent_directory_name)

In [0]:
# Extract the embedded images of each document into its own dir_<i> folder, prefix
# every image with the (underscored) document file name so the origin stays visible
# in the file name, then collapse all dir_<i> folders into the parent folder.
for i in range(len(docx_files)):
  path = parent_directory_name + '/' + f"dir_{i}" + '/'
  docx2txt.process(docx_files[i], path)
  for f in os.listdir(path):
    # The file name is used as-is: it is not split on '.', which would raise
    # ValueError for any name that does not contain exactly one dot.
    os.rename(os.path.join(path, f), os.path.join(path, f'{docx_files_underscore[i]}_{f}'))
flatten(parent_directory_name + "/")

In [0]:
# Run image analysis on every extracted image and collect the results in `df_images`
# with columns: 'image' (file name), 'polygon' (stringified detected-text result)
# and 'file_name' (source document name derived from the image file name).
service_options = sdk.VisionServiceOptions(VISION_ENDPOINT, VISION_KEY)
analysis_options = sdk.ImageAnalysisOptions()
# Request caption, text and object features in a single call.
analysis_options.features = (
        sdk.ImageAnalysisFeature.CAPTION |
        sdk.ImageAnalysisFeature.TEXT |
        sdk.ImageAnalysisFeature.OBJECTS
    )

analysis_options.language = "en"
analysis_options.gender_neutral_caption = True
image_dir = parent_directory_name # "/tmp/bpd_images_1/"
# NOTE(review): the extension filter is case-sensitive and skips ".jpeg"/".PNG" etc.
images = [file for file in os.listdir(image_dir) if file.endswith('.jpg') or file.endswith('.png')]
result_list= []
files_list = []
# Not filled in this cell; a later cell re-creates and fills it.
description_list= []
df_images=pd.DataFrame()
df_images['image']= pd.Series(images)
# Loop through images and analyze each one
for image in images:
  # Recover the source document name: everything before the FIRST dot + '.docx'.
  # NOTE(review): a document name that itself contains a dot gets truncated here.
  result_string = image.split('.', 1)[0] + '.docx'
  files_list.append(result_string)
  # Construct image path
  img_path = os.path.join(image_dir, image)
  vision_source = sdk.VisionSource(filename=img_path)
  image_analyzer = sdk.ImageAnalyzer(service_options, vision_source, analysis_options)
  result = image_analyzer.analyze()
  # Only the text part of the result is kept, as a string.
  result_list.append(str(result.text))
  # Print the names of images for which no text result came back.
  if str(result.text)=='None':
    print(image)
  # Pause between calls — presumably to stay under the service rate limit.
  time.sleep(1)
df_images['polygon']=pd.Series(result_list)
df_images['file_name'] = pd.Series(files_list)

In [0]:
#Function for removing nested words

def extract_detected_text_word(text):
    """Reduce a stringified text-detection result to its line-level entries.

    Despite the name, this keeps the ``DetectedTextLine(...)`` entries: for each
    one, the ``content`` and ``bounding_polygon`` parts are retained and anything
    that follows the polygon (such as nested word entries) is dropped.

    Args:
        text: String form of a detection result.

    Returns:
        ``'DetectedText(lines=[...])'`` containing the rebuilt line entries joined
        by ``', '``; the list is empty when no line entry is found.

    Raises:
        ValueError: If the rebuilt string contains more than 10000 ``\\w+`` tokens.
    """
    line_pattern = r'DetectedTextLine\(content=(.*?), bounding_polygon=\[(.*?)\]'
    rebuilt_lines = [
        f"""DetectedTextLine(content={line_content}, bounding_polygon=[{polygon}])"""
        for line_content, polygon in re.findall(line_pattern, text)
    ]
    formatted_text = 'DetectedText(lines=[' + ', '.join(rebuilt_lines) + '])'

    # Rough size guard: count word-like tokens in the rebuilt string.
    if len(re.findall(r'\w+', formatted_text)) > 10000:
        raise ValueError("Description exceeds the token limit of 10000")

    return formatted_text


In [0]:
# Output locations for the image-analysis results.
# NOTE(review): `index_name` and `directory_path_txt` are both reassigned in later
# cells before they are used, and the path below is relative (no leading '/') —
# these two initial values look unused; TODO confirm and remove.
index_name = 'image_analysis.xlsx'
folder_path = '/tmp/SAP/'
directory_path_txt = 'tmp/txt_files'

In [0]:
# Derive two columns from the raw 'polygon' strings held in `df_images`:
#   'formatted_polygons' — the line-level entries only (see extract_detected_text_word)
#   'token count'        — count_tokens() of the raw, unformatted polygon string
# The source frame is `df_images`; the notebook defines no frame called `df`.
metadata = df_images['polygon'].tolist()

df_images['formatted_polygons'] = df_images['polygon'].map(lambda polygon: extract_detected_text_word(str(polygon)))
df_images['token count'] = df_images['polygon'].map(lambda polygon: count_tokens(str(polygon)))

display(df_images)


In [0]:
# Ask the chat model for a textual description of each image, based on its
# formatted polygon string, and store the answers in 'cleansed_descriptions'.
description_list=[]

# NOTE(review): `.loc[i, ...]` with i from range(len(df_images)) assumes the frame
# still has its default 0..n-1 index.
for i in range(len(df_images)):
  # gptoutout= (gpt4(prompt= "Provide detailed granular description of the following image polygon, capturing every single detail in the image, return description only in response: " + str(df_images.loc[i,'formatted_polygons']), context= "You are an expert at providing detailed descriptions based on image polygons.", large = True, max_tokens= 5000))
  gptoutout= (gpt35(prompt= "Provide detailed granular description of the following image polygon, capturing every single detail in the image, return description only in response: " + str(df_images.loc[i,'formatted_polygons']), context= "You are an expert at providing detailed descriptions based on image polygons.", large = True, max_tokens= 2000))
  description_list.append(gptoutout)
  # Pause between requests — presumably to respect the model endpoint's rate limit.
  time.sleep(5)
  # Progress indicator: index of the row just processed.
  print(i)

  
df_images['cleansed_descriptions']=pd.Series(description_list)

In [0]:
# Display the full results frame for inspection (consider df_images.head() for large runs).
df_images

In [0]:
#Save the DataFrame to an Excel file within the folder in Databricks
# Write the results frame to an Excel workbook in `folder_path`, then hand the file
# to the workbench.
index_name = "image_analysis_bpd.xlsx"
output_file = os.path.join(folder_path, index_name)
# The context manager saves and closes the workbook on exit.
with pd.ExcelWriter(output_file, engine="openpyxl",) as writer:
    df_images.to_excel(writer, sheet_name="Sheet1", index=True)
# presumably uploads `index_name` from the "file:"-prefixed local folder — verify
# against db_ws.CopyFileToWorkbench
db_ws.CopyFileToWorkbench(filename=index_name, DatabricksFolder="file:"+folder_path)

In [0]:
# Write one text file per image, containing "<image name>: <description>", into a
# freshly re-created folder.
directory_path_txt = "/tmp/SAP/images_txts/"

cleansed_descrips = df_images['cleansed_descriptions']
image_names = df_images['image']
cleansed_list = cleansed_descrips.tolist()
image_names_list = image_names.tolist()
image_and_description = [
    f'{image_name}: {description}'
    for image_name, description in zip(image_names_list, cleansed_list)
]

# Remove leftovers from a previous run, then start with an empty folder.
shutil.rmtree(directory_path_txt,ignore_errors=True)
os.mkdir(directory_path_txt)
directory_path = directory_path_txt
for image_name, string_content in zip(image_names_list, image_and_description):
    file_path = os.path.join(directory_path, f'{image_name}.txt')
    with open(file_path, 'w') as file:
        file.write(str(string_content))

# Paths of the files just written, in the same order as the images.
docs = [f'{directory_path_txt}/{image_name}.txt' for image_name in image_names_list]

In [0]:
## Create ACS index
images_docs = DirectoryLoader(directory_path_txt).load()
# images_chunked = chunk_documents(images_docs, chunk_size=5000, chunk_overlap=1000) # probably not needed, .txt files are small enough
index_images_acs = DocumentIndexACS_v2(ACS_ENDPOINT, ACS_KEY, "uc-sap-index-images").from_documents(images_docs)

### Playground

In [0]:
%run "./SAP_FDD_Indexing"

In [0]:
# Scratch index handle used by the playground cells below ("uc-sap-test").
d = DocumentIndexACS_v2(ACS_ENDPOINT, ACS_KEY, "uc-sap-test")

In [0]:
# Add a single document to the scratch index.
# NOTE(review): `doc` is not defined in this section; it is presumably a leftover
# from an earlier cell or a %run notebook — TODO define it explicitly here.
d.add_documents([doc])

In [0]:
# Clear the scratch index — presumably removes its contents; verify against
# DocumentIndexACS_v2.clear before running it on a shared index.
d.clear()

In [0]:
# Build the scratch index from the single document `doc` (see the note on `doc`'s
# origin: it is not defined in this section).
d.from_documents([doc])

In [0]:
# Query the scratch index for "text1" with mode="hybrid" and display the raw result.
d.search("text1", mode="hybrid")

In [0]:
# Display the configured ACS endpoint.
# NOTE(review): the value ends up in the saved cell output; clear it before sharing
# if the endpoint is considered sensitive.
ACS_ENDPOINT

In [0]:
# List all index names on the search service, using an index client built from
# DocumentIndexACS's private endpoint/credential attributes.
# NOTE(review): `SearchIndexClient` has no visible import in this notebook —
# presumably azure.search.documents.indexes.SearchIndexClient provided by a %run
# notebook; TODO confirm.
x = DocumentIndexACS(ACS_ENDPOINT, ACS_KEY)
y = SearchIndexClient(endpoint=x._acs_endpoint, credential=x._acs_credential)
list(y.list_index_names())