##### 1. Imports

In [0]:
import json
import requests
import os
import pathlib
from pathlib import Path
import re
import time
import uuid
import pickle
import ast
import tiktoken
import openai
import numpy as np
import pandas as pd
from math import ceil
from copy import deepcopy
from itertools import zip_longest
from itertools import chain as itertools_chain
from docx import Document 
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.text.paragraph import Paragraph
from docx.oxml.text.paragraph import CT_P
import unstructured
from unstructured import *
from unstructured.partition.auto import partition
from zipfile import ZipFile
import shutil
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# import beautifulsoup4 as bs4

import whoosh
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser, OrGroup, FuzzyTermPlugin
from whoosh.analysis import StemmingAnalyzer

from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage,
    BaseMessage,
    ChatGeneration,
    ChatResult,
)
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader, CSVLoader, JSONLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import LLMChain, ConversationChain, SimpleSequentialChain, SequentialChain # import LangChain libraries
from langchain.llms import OpenAI # import OpenAI model
from langchain.prompts import PromptTemplate # import PromptTemplate
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain import VectorDBQA
from langchain.vectorstores import Chroma

import logging
from azure.storage.blob import BlobServiceClient
import io
logger = spark._jvm.org.apache.log4j
logging.getLogger("py4j.java_gateway").setLevel(logging.ERROR)

from typing import List, Optional, Tuple, Dict, Any

from datetime import date
from datetime import datetime

import sentence_transformers 
from sentence_transformers import SentenceTransformer, util

from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    # VectorSearchAlgorithmConfiguration,  
) 

##### 2. Databricks & OpenAI Contexts

In [0]:
# Workspace helper object used by the file-handling helpers in this notebook (e.g. `unzip`).
# `dbutils`, `spark` and `display` are not defined in this file — presumably they are
# injected by the Databricks notebook runtime; confirm before running elsewhere.
# from databricks_sdk_v2.databricks_ws import *
from databricks_sdk.databricks_ws import *
# NOTE(review): `DatabricksWS` is presumably provided by the star import above — confirm.
db_ws = DatabricksWS(dbutils, spark, display)

In [0]:
#########################################
#           OpenAI API Setup            #
#########################################

# Get Cluster name and set it to scope variable
full_cluster_name = spark.conf.get("spark.databricks.clusterUsageTags.clusterName")

# The secret scope is the cluster name up to (not including) its last underscore.
# NOTE(review): if the name contains no underscore, rfind() returns -1 and this
# slice silently drops the last character instead.
cluster_scope = full_cluster_name[:full_cluster_name.rfind("_")]

# The OpenAI API key and endpoint URL is currently hosted in an Azure Key Vault
api_secret = dbutils.secrets.get(scope = cluster_scope, key = "OPENAI_API_KEY")
api_url = dbutils.secrets.get(scope = cluster_scope, key = "OPENAI_URL")
# Header sent with every REST call made through `requests` in this notebook.
api_header = {"api-key": api_secret}

### Get deployment
# Structuring the Deployment API endpoint
deployments_url = f"{api_url}/openai/deployments?api-version=2023-03-15-preview"

# Call GET List of Deployments
# NOTE(review): the HTTP status is not checked here; later cells call
# deployment_response.json() and index into its 'data' key directly.
deployment_response = requests.get(deployments_url, headers=api_header)

# Helper that scans a deployments listing for the deployment serving a given model
def getSpecificModel(deployments, model):
    """Return the ``id`` of the first entry in ``deployments['data']`` whose
    ``model`` field equals ``model``.

    Returns ``None`` when no entry matches.
    """
    matching_ids = (entry['id'] for entry in deployments['data'] if entry['model'] == model)
    return next(matching_ids, None)

### Get Parameters for different models
# Parse the deployments listing once and reuse it for every lookup.
# Each id is None when the listing contains no deployment for that model.
deployments_listing = deployment_response.json()
deployment_id_gpt4 = getSpecificModel(deployments_listing, 'gpt-4')
deployment_id_gpt35 = getSpecificModel(deployments_listing, 'gpt-35-turbo')
deployment_id_gpt4_32k = getSpecificModel(deployments_listing, 'gpt-4-32k')
deployment_id_ada = getSpecificModel(deployments_listing, "text-embedding-ada-002")
deployment_id_gpt35_16k = getSpecificModel(deployments_listing, 'gpt-35-turbo-16k')

### Structuring the Deployment API endpoint
url_gpt4 = f"{api_url}/openai/deployments/{deployment_id_gpt4}/chat/completions?api-version=2023-03-15-preview"
url_gpt35 = f"{api_url}/openai/deployments/{deployment_id_gpt35}/chat/completions?api-version=2023-03-15-preview"
url_gpt4_32k = f"{api_url}/openai/deployments/{deployment_id_gpt4_32k}/chat/completions?api-version=2023-03-15-preview"
# The ada deployment is an embedding model, so it is served by the embeddings
# operation rather than chat/completions.
url_ada = f"{api_url}/openai/deployments/{deployment_id_ada}/embeddings?api-version=2023-03-15-preview"
# Fixed: this URL previously reused deployment_id_gpt35, so callers of the
# "16k" URL were silently routed to the regular gpt-35-turbo deployment.
url_gpt35_16k = f"{api_url}/openai/deployments/{deployment_id_gpt35_16k}/chat/completions?api-version=2023-03-15-preview"

In [0]:
class PwCAzureChatOpenAI(AzureChatOpenAI):
    """AzureChatOpenAI subclass that supplies its own ``_create_prompt``."""

    def _create_prompt(
        self, messages: List[BaseMessage], stop: Optional[List[str]]
    ) -> Tuple[str, Dict[str, Any]]:
        """Build the prompt and request parameters for ``messages``.

        The parameters start from the model/engine names, are overlaid with
        ``self._default_params`` and always end with a fixed ``stop`` list;
        the ``stop`` argument itself is not used.
        """
        params: Dict[str, Any] = {
            "model": self.model_name,
            "engine": self.deployment_name,
        }
        params.update(self._default_params)
        params["stop"] = ["<|im_end|>"]
        # NOTE(review): `_create_chat_prompt` is not defined or explicitly imported
        # in the visible part of this file — confirm where it comes from.
        return _create_chat_prompt(messages), params

In [0]:
# Model names and generation settings used when building the LangChain model below.
gpt4_32k_model_name = "gpt-4-32k"
gpt35_16k_model_name = "gpt-35-turbo-16k"
embed_model_name = "text-embedding-ada-002"
llm_max_tokens = 20000  # passed as max_tokens to the model constructed below
temperature = 0.0

# Azure OpenAI connection settings, reusing the endpoint and key loaded from the secret scope.
openai_api_base = api_url
openai_api_key = api_secret
openai_api_type = "azure"
openai_api_version = "2023-03-15-preview"

# Configure the `openai` module globally ...
openai.api_base = openai_api_base
openai.api_key = openai_api_key
openai.api_type = openai_api_type
openai.api_version = openai_api_version
 
# ... and mirror the same values into the process environment.
os.environ["OPENAI_API_BASE"] = openai_api_base
os.environ["OPENAI_API_KEY"] = openai_api_key
os.environ["OPENAI_API_TYPE"] = openai_api_type
os.environ["OPENAI_API_VERSION"] = openai_api_version

# Chat model bound to the deployment that the deployments listing reports for the
# gpt-4-32k model name (deployment_name is None if no such deployment is listed).
llm = PwCAzureChatOpenAI(deployment_name= getSpecificModel(deployment_response.json(), gpt4_32k_model_name),
                         model_name=gpt4_32k_model_name,temperature=temperature,max_tokens=llm_max_tokens,engine=gpt4_32k_model_name,
                         openai_api_key = openai_api_key,openai_api_base = openai_api_base,openai_api_version = openai_api_version)

# llm_35_16k = PwCAzureChatOpenAI(deployment_name = getSpecificModel(deployment_response.json(), gpt35_16k_model_name),
#                          model_name=gpt35_16k_model_name,temperature=temperature,max_tokens=llm_max_tokens,engine=gpt35_16k_model_name,
#                          openai_api_key = openai_api_key,openai_api_base = openai_api_base,openai_api_version = openai_api_version)

##### ACS Authentication and Context

In [0]:
# SECURITY NOTE(review): this API key is hard-coded in the notebook source, so anyone
# with read access to the file can use it. Consider rotating it and loading it from
# the secret scope, the way the OpenAI key is loaded via dbutils.secrets in this notebook.
ACS_KEY = "vO7O3aspEXtj2qeFTOKl2rLSTNcijSVDN5JNjUV5qxAzSeD0om1d"
ACS_NAME = "u2zeapebsdse001"
# Endpoint derived from the service name.
ACS_ENDPOINT = f"https://{ACS_NAME}.search.windows.net"
ACS_CREDENTIAL = AzureKeyCredential(ACS_KEY)

# Expose the service name and key through environment variables as well.
os.environ["AZURE_COGNITIVE_SEARCH_SERVICE_NAME"] = ACS_NAME
os.environ["AZURE_COGNITIVE_SEARCH_API_KEY"] = ACS_KEY

##### Azure Vision Authentication and Context

In [0]:
# Endpoint and key for the image-analysis service.
# SECURITY NOTE(review): the key is hard-coded in the notebook source; consider rotating
# it and loading it from the secret scope like the OpenAI key in this notebook.
VISION_ENDPOINT = 'https://testimageanalysis.cognitiveservices.azure.com/'
VISION_KEY = 'f920c617e81547c290e94973be55258b'

##### 3. Helper Functions

In [0]:
def _print_with_sgr(sgr_code, string, end):
  # Wrap the text in an ANSI SGR escape sequence and reset formatting afterwards.
  print("\x1b[" + sgr_code + "m" + format(string) + "\x1b[0m", end=end)

def print_red(string, end="\n"):
  """Print ``string`` wrapped in ANSI SGR code 31 (red)."""
  _print_with_sgr("31", string, end)

def print_green(string, end="\n"):
  """Print ``string`` wrapped in ANSI SGR code 32 (green)."""
  _print_with_sgr("32", string, end)

def print_blue(string, end="\n"):
  """Print ``string`` wrapped in ANSI SGR code 36."""
  _print_with_sgr("36", string, end)

def print_bold(string, end="\n"):
  """Print ``string`` wrapped in ANSI SGR code 1 (bold)."""
  _print_with_sgr("1", string, end)

def log(msg, color=None, end="\n"):
  """Print ``msg`` prefixed with the current local time as ``[YYYY-MM-DD HH:MM:SS]``.

  ``color`` may be "red", "green", "blue" or "bold" to route the line through the
  matching ``print_*`` helper; any other value prints it unformatted.
  """
  stamped = f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {msg}"
  if color == "red":
    print_red(stamped, end=end)
  elif color == "green":
    print_green(stamped, end=end)
  elif color == "blue":
    print_blue(stamped, end=end)
  elif color == "bold":
    print_bold(stamped, end=end)
  else:
    print(stamped, end=end)

# Displays an image from the local file system (matplotlib is used as a workaround, nothing else worked)
def print_img(img_path):
  """Read the image file at ``img_path`` and show it with matplotlib."""
  plt.imshow(mpimg.imread(img_path))
  plt.show()


## Might be useful for GPT - returns an enumerated string for a list, i.e. (with the default start=0)
# 0. X
# 1. Y
# ...
# n-1. Z
def enumerate_str(l, start=0):
  """Return the elements of ``l`` as numbered lines.

  Each element becomes ``"<number>. <element> \\n"`` (note the space before the
  newline). Numbering begins at ``start``, which defaults to 0 to match the
  historical output; pass ``start=1`` for a 1-based list. An empty ``l``
  yields an empty string.
  """
  return "".join(f"{index}. {elem} \n" for index, elem in enumerate(l, start))

def create_directories(kg_structure):
  """Create the /SAP_KG folder tree for the given knowledge-graph nodes.

  Creates ``/SAP_KG`` and ``/SAP_KG/Indexes`` plus, for every node in
  ``kg_structure``, ``/SAP_KG/<node>`` and ``/SAP_KG/Indexes/<node>``.
  Existing directories are left untouched. Creation stays best-effort: a
  directory that cannot be created is reported and the remaining ones are
  still attempted.
  """
  targets = [Path('/SAP_KG'), Path('/SAP_KG/Indexes')]
  for node in kg_structure:
    targets.append(Path(f'/SAP_KG/{node}'))
    targets.append(Path(f'/SAP_KG/Indexes/{node}'))

  for target in targets:
    try:
      target.mkdir(parents=True, exist_ok=True)
    except (OSError, ValueError) as err:
      # Previously swallowed silently; report it so a missing folder is explainable.
      print(f"Could not create directory {target}: {err}")

def unzip(file_name, verbose=False):
    """Copy a zip archive from the workbench and extract it locally.

    Ensures the working directories exist, copies ``file_name`` from the
    workbench into ``/Team_one_SummerizeAndSearch`` via ``db_ws`` and extracts
    it into ``/Team_one_SummerizeAndSearchUnzipped``.

    Parameters:
    file_name -- Name of the zip archive in the workbench.
    verbose   -- When truthy, report whether directory creation succeeded.

    Returns the list of member names in the archive, or an empty list when
    the archive could not be opened or extracted.
    """
    working_dirs = (
        '/Team_one_Outputs',
        '/Team_one_Outputs/RFP_Samples',
        '/Team_one_SummerizeAndSearch',
        '/Team_one_SummerizeAndSearchUnzipped',
    )
    try:
        # makedirs(exist_ok=True) so an already existing directory no longer
        # aborts the creation of the remaining ones.
        for directory in working_dirs:
            os.makedirs(directory, exist_ok=True)
        if verbose:
            print(file_name+ "directories successfully created")
    except OSError:
        if verbose:
            print(file_name+" directory creation failed")

    local_save_folder = 'file:/Team_one_SummerizeAndSearch'
    # NOTE(review): the listing result was never used; the call is kept only in
    # case it has side effects on the workspace client — confirm and remove.
    db_ws.ListFiles()

    db_ws.CopyFileFromWorkbench(DatabricksFolder=local_save_folder,filename=file_name)

    archive_path = '/Team_one_SummerizeAndSearch/'+file_name
    extracted_names = []
    try:
        with ZipFile(archive_path, 'r') as archive:
            # Extracting all the files of the zip into a temporary folder
            archive.extractall(path="/Team_one_SummerizeAndSearchUnzipped")
            extracted_names = archive.namelist()
        print("all files successfully extracted")
    except Exception:
        # Best-effort by design: report the failure and hand back an empty list
        # instead of failing on an unbound result variable.
        print("file extraction failed")
    return extracted_names


# def convert(filename,output_path='/Team_one_Outputs/', verbose=False, for_validation = False):
def convert(filename,output_path='/tmp/SAP', verbose=False, for_validation = False):
    """Extract the text of a document and store it as a ``.txt`` file.

    Parameters:
    filename       -- Document path relative to the source folder. Only the
                      extensions pdf, pptx, doc and docx are processed
                      (xlsx/csv are not supported).
    output_path    -- Prefix the output file name is appended to.
    verbose        -- When truthy, print start/end progress messages.
    for_validation -- When truthy, read the document from ``/tmp/SAP/``
                      instead of ``/Team_one_SummerizeAndSearchUnzipped/``.

    Returns the extracted text (document elements joined by blank lines), or
    an empty string for unsupported file types.
    """
    filetype = filename.split('.')[-1]
    if filetype not in ['pdf','pptx','doc','docx']:
        return ''
    if verbose:
        print(filename+' conversion started...')

    source_dir = "/tmp/SAP/" if for_validation else "/Team_one_SummerizeAndSearchUnzipped/"
    elements = partition(filename=source_dir+filename)
    text = "\n\n".join([str(el) for el in elements])

    # Output name = last path component without its extension. A name without
    # any dot is kept whole instead of losing its last character.
    base_name = filename.split('/')[-1]
    if '.' in base_name:
        base_name = base_name[:base_name.rfind(".")]
    if verbose:
        print(base_name+' conversion ended...')

    # NOTE(review): output_path is concatenated as-is, so it must end with a path
    # separator; with the default '/tmp/SAP' the file lands at '/tmp/SAP<name>.txt'.
    # Left unchanged because callers may rely on that location — confirm intent.
    with open(output_path + base_name + '.txt', "w") as f:
        f.write(text)
    return text

def selectFromDict(options, name):
  """Interactively ask the user to pick one entry of ``options``.

  Prints the option names as a 1-based numbered menu and reads a number from
  standard input until a valid one is entered. Non-numeric or out-of-range
  input re-prompts instead of raising.

  Parameters:
  options -- Mapping of displayed option name to the value to return.
  name    -- Label used in the prompts.

  Returns the value belonging to the chosen option name.
  Raises ValueError when ``options`` is empty (no valid choice could ever be made).
  """
  if not options:
    raise ValueError(f"No options available to select a {name} from")

  option_values = [options[option_name] for option_name in options]
  print('Select a ' + name + ':')
  for number, option_name in enumerate(options, start=1):
    print(f'{number}. {option_name}')

  while True:
    raw_choice = input(name + ': ')
    try:
      choice = int(raw_choice) - 1
    except ValueError:
      choice = -1  # not a number: treat like an out-of-range choice
    if 0 <= choice < len(option_values):
      selected = option_values[choice]
      print(f'Selected {name}: {selected}')
      return selected
    print('Please select a valid ' + name + ' number')

def unzip_and_convert(kg_structure,directory,black_list_docs=[],unzip_verbose=False,convert_verbose=False):
  """Unzip ``<node>.zip`` for every node and convert each extracted document.

  Every archive member that is not listed in ``black_list_docs`` is passed to
  ``convert`` with ``directory + node + "/"`` as its output prefix. The
  converted text is not collected; the function returns nothing.
  """
  for node in kg_structure:
    archive_members = unzip(node + '.zip', verbose=unzip_verbose)
    for member in archive_members:
      if member in black_list_docs:
        continue
      convert(member, directory + node + "/", verbose=convert_verbose)

##### 4. Functions For Querying OpenAI / GPT

In [0]:
#########################################
#         Core Functions for GPT        #
#########################################

### Static functions and utils

## Query GPT-4
def gpt4(prompt:str, context:str = "You are a helpful assistant.", temperature:float = 0.0, max_tokens:int = 1000, details:bool = False, large:bool = False, tries:int=1, quiet=True):
  """
  Queries GPT-4 with the specified prompt and parameters.
  Example: gpt4("Hello") will return GPT-4's answer: "Hello! How can I assist you today?"
  
  Keyword arguments:
  prompt      -- Prompt for GPT-4. Required.
  context     -- System context for GPT-4. Optional, default 'You are a helpful assistant.'.
  temperature -- Temperature parameter for GPT-4. Optional, default 0.
  max_tokens  -- Max Token Parameter for GPT-4. Optional, default 1000.
  details     -- Whether or not the full API response (incl. headers etc.) or just the model's answer will be returned. Optional, default False.
  large       -- Currently ignored: the request always goes to url_gpt4 (the 32k switch is disabled). Optional, default False.
  tries       -- Maximum number of attempts. Optional, default 1.
  quiet       -- When True, suppress the diagnostic output (raw responses and error messages). Optional, default True.

  Output: The response from the GPT-4 API. Depending on the 'details' parameter, either the full API response (list/dict) or just GPT's answer (string).
  Returns None when no attempt produced a usable response.
  """
  # url = url_gpt4_32k if large else url_gpt4
  url = url_gpt4
  headers = api_header
  data = {
    "messages":[
      {"role": "system", "content": context},
      {"role": "user", "content": prompt+"\n"}
    ],
    "temperature": temperature, 
    "max_tokens": max_tokens
  }
  # Call POST to run Completion on a deployment
  for i in range(tries):
    # Step 1: send the request. Only transport-level failures are handled here.
    try:
      response = requests.post(url, headers=headers, json=data)
    except Exception as err:
      if not quiet:
        print_bold("ERROR - Error in GPT response. No response generated. Retrying in 60 seconds.")
        print(err)
      if i < tries-1:
        time.sleep(60)
      continue
    # Step 2: parse the body once. A non-JSON body or an error payload without
    # 'choices' is handled by the retry-after logic below.
    try:
      payload = response.json()
      if not quiet: print(payload)
      return payload if details else payload['choices'][0]['message']['content']
    except (ValueError, KeyError, IndexError, TypeError):
      sleeptime = 15 # default, wait 15 seconds before retry
      if not quiet:
        print_bold("ERROR - Error in GPT response. Full response:")
        print(response.text)
      if i < tries-1:
        # Honour the wait time the service asks for, when it states one.
        search = re.search(r"Please retry after ([\d]*) seconds", response.text)
        if search: sleeptime = int(search.group(1))
        if not quiet: print(f"Trying again in {sleeptime} seconds.")
        time.sleep(sleeptime)
      continue
  return None

## Query GPT-3.5
def gpt35(prompt:str, context:str = "You are a helpful assistant.", temperature:float = 0.0, max_tokens:int = 1000, details:bool = False, large= False, tries:int=1, quiet:bool=False):
  """
  Queries GPT-3.5 (Turbo) with the specified prompt and parameters.
  Example: gpt35("Hello") will return GPT-3.5's answer: "Hello! How can I assist you today?"
  
  Keyword arguments:
  prompt      -- Prompt for GPT-3.5. Required.
  context     -- System context for GPT-3.5. Optional, default 'You are a helpful assistant.'.
  temperature -- Temperature parameter for GPT-3.5. Optional, default 0.
  max_tokens  -- Max Token Parameter for GPT-3.5. Optional, default 1000.
  details     -- Whether or not the full API response (incl. headers etc.) or just the model's answer will be returned. Optional, default False.
  large       -- Currently ignored: the request always goes to url_gpt35_16k. Optional, default False.
  tries       -- Maximum number of attempts. Optional, default 1.
  quiet       -- When True, suppress the diagnostic output (raw responses and error messages). Optional, default False, which keeps the historical output.

  Output: The response from the GPT-3.5 API. Depending on the 'details' parameter, either the full API response (list/dict) or just GPT's answer (string).
  Returns None when no attempt produced a usable response.
  """
  # url = url_gpt35_16k if large else url_gpt35
  url = url_gpt35_16k
  headers = api_header
  data = {
    "messages":[
      {"role": "system", "content": context},
      {"role": "user", "content": prompt+"\n"}
    ],
    "temperature": temperature, 
    "max_tokens": max_tokens
  }
  # Call POST to run Completion on a deployment
  for i in range(tries):
    # Step 1: send the request. Only transport-level failures are handled here.
    try:
      response = requests.post(url, headers=headers, json=data)
    except Exception as err:
      if not quiet:
        print("ERROR - Error in GPT response. No response generated. Retrying in 60 seconds.")
        print(err)
      if i < tries-1:
        time.sleep(60)
      continue
    # Step 2: parse the body once and reuse it for logging and the return value.
    try:
      payload = response.json()
      if not quiet: print(payload)
      return payload if details else payload['choices'][0]['message']['content']
    except (ValueError, KeyError, IndexError, TypeError):
      sleeptime = 15 # default, wait 15 seconds before retry
      if not quiet:
        print("ERROR - Error in GPT response. Full response:")
        print(response.text)
      if i < tries-1:
        # Honour the wait time the service asks for, when it states one.
        search = re.search(r"Please retry after ([\d]*) seconds", response.text)
        if search: sleeptime = int(search.group(1))
        if not quiet: print(f"Trying again in {sleeptime} seconds.")
        time.sleep(sleeptime)
      continue
  return None

### Prototype - Class to enable GPT-4 prompts with memory of previous prompts
class gpt4_withMemory:
  """Chat wrapper that resends the whole conversation with every GPT-4 request.

  ``memory`` is the list of chat messages; it starts with the system message
  built from ``context`` and grows by one user and one assistant message per
  successful ``prompt`` call.
  """
  def __init__(self, context:str = "You are a helpful assistant."):
    self.context = context
    self.memory = [
      {"role": "system", "content": self.context}
    ]

  def prompt(self, prompt:str, temperature:float = 0.0, max_tokens:int = 300, details:bool = False, large = False):
    """Send ``prompt`` together with the stored conversation and record the answer.

    ``large`` selects url_gpt4_32k instead of url_gpt4. Returns the answer text,
    or the full parsed API response when ``details`` is truthy. If the request
    or the parsing of its response fails, the exception propagates and the
    conversation memory is left as it was before the call.
    """
    user_message = {"role": "user", "content": prompt+"\n"}
    url = url_gpt4_32k if large else url_gpt4
    headers = api_header
    self.memory.append(user_message)
    data = {
      "messages": self.memory,
      "temperature": temperature, 
      "max_tokens": max_tokens
    }
    try:
      # Call POST to run Completion on a deployment
      response = requests.post(url, headers=headers, json=data)
      payload = response.json()
      answer = payload['choices'][0]['message']['content']
    except Exception:
      # Roll back the unanswered user message so a retry does not send it twice.
      self.memory.pop()
      raise
    self.memory.append({"role": "assistant", "content": answer})
    return payload if details else answer
  
  def prompt_with_context(self, prompt:str, context, temperature:float = 0.0, max_tokens:int = 1000, details:bool = False, large = False):
    """Answer ``prompt`` using documents retrieved from ``context``.

    ``context`` must offer ``similarity_search(prompt)``; its result is embedded
    into the prompt text that is then sent through ``prompt``.
    """
    docs = context.similarity_search(prompt)
    full_prompt = f"""
      Given the following extracted parts of a long maintenance manual and a question, create a final answer.
      Maintenance Manual:{docs}
      Question: {prompt}
      """
    return self.prompt(prompt=full_prompt, temperature=temperature, max_tokens=max_tokens, details=details, large=large)

##### 5. Functions For Manipulating Word Documents

In [0]:
#########################################
#     Core Functions for Word/docx      #
#########################################
### import unicodedata
### txt = " weird word text"
### cleaned_up_text = unicodedata.normalize("NFKD",text)
# Helper function for retrieving the token count of a text
def count_tokens(text, model_name="gpt-4"):
  """Return how many tokens ``text`` encodes to with the tiktoken encoding of ``model_name``."""
  return len(tiktoken.encoding_for_model(model_name).encode(text))
    
# Get the text of the element that directly precedes a table
def get_table_title(table):
    """Wrap the table's previous sibling element as a Paragraph and return its text.

    Only the immediately preceding element is inspected; a table without a
    previous sibling raises AttributeError.
    """
    preceding = table._element.getprevious()
    return Paragraph(preceding, preceding.getparent()).text

# Slices a list l into n sequential chunks. E.g. [1, 2, 3, 4, 5] would become [[1, 2], [3, 4], [5]] for n=3.
def slice_list(l, n):
    """Yield ``n`` consecutive slices of ``l`` whose lengths differ by at most one.

    The first ``len(l) % n`` slices receive one extra element; when ``n``
    exceeds ``len(l)`` the trailing slices are empty.
    """
    base_size, oversized = divmod(len(l), n)
    start = 0
    for position in range(n):
        size = base_size + 1 if position < oversized else base_size
        yield l[start:start + size]
        start += size

# Tracks placeholder content between '<' and '>' that runs across several paragraphs
def runningparagraphfunc(text,runningparagraph,unclosedstarttext,startingindex,index):
  """Advance the multi-paragraph placeholder state by one paragraph.

  If ``text`` ends with an unclosed ``<...``, a new running placeholder starts
  there and ``index`` is remembered as its starting index. Otherwise, while a
  placeholder is running, either the closing part of ``text`` (everything up
  to a ``>`` that is not preceded by ``<``) or, if there is none, the whole
  ``text`` is appended on a new line.

  Returns the updated ``(startingindex, unclosedstarttext, runningparagraph)``.
  """
  opening = re.search(r'<[^>]*$', text)
  closing = re.search(r'^[^<]*>', text)

  if opening is not None:
      unclosedstarttext = opening.group(0)
      runningparagraph = unclosedstarttext
      startingindex = index

  if len(runningparagraph) > 0 and opening is None:
      continuation = text if closing is None else closing.group(0)
      runningparagraph = (runningparagraph + '\n' + continuation).strip('\n')

  return startingindex, unclosedstarttext, runningparagraph

# Read the Word document
def read_document(file_path, max_tokens_per_chunk=3000):
    """Load a .docx template and collect its ``<...>`` placeholders.

    Placeholders are collected from body paragraphs and from table cells.
    Besides placeholders closed within one paragraph, a placeholder that is
    opened in one paragraph and closed in a later one is tracked through
    ``runningparagraphfunc`` and recorded with ``keep_original_text=True``.

    Parameters:
    file_path            -- Path (or file-like object) handed to ``Document``.
    max_tokens_per_chunk -- Token budget per chunk of the placeholder lists.

    Returns ``(doc, placeholders, originalpositions)``:
    doc               -- The loaded document object.
    placeholders      -- ``{"paragraphs": {"chunks": [...]}, "tables": {"chunks": [...]}}``;
                         each chunk is a list of unique
                         ``{"section_header", "placeholder_text", "replacement_text"}`` dicts.
                         A list whose JSON form exceeds the token budget is split into
                         several chunks, otherwise there is exactly one chunk.
    originalpositions -- Every occurrence of a placeholder with its paragraph id or
                         table/row/cell location.
    """
    doc = Document(file_path)
    originalpositions = {"paragraphs": [], "tables": []}
    placeholders = {"paragraphs": [], "tables": []}
    current_section = None
    headinglist = []
    runningparagraph = ""
    unclosedstarttext = ""
    startingindex = 0

    def _register_placeholder(kind, section_header, placeholder_text):
        # Record a placeholder once per (section header, placeholder text) pair.
        already_known = any(
            entry['section_header'] == section_header and entry['placeholder_text'] == placeholder_text
            for entry in placeholders[kind]
        )
        if not already_known:
            placeholders[kind].append({
                "section_header": section_header,
                "placeholder_text": placeholder_text,
                "replacement_text": ""})

    # ---- Body paragraphs ----
    for index, paragraph in enumerate(doc.paragraphs):
        if paragraph.style.name.startswith('Heading'):
            # A heading starts a new section and discards any open running placeholder.
            headinglist.append(paragraph.text)
            current_section = paragraph.text
            runningparagraph = ""
            unclosedstarttext = ""
            startingindex = 0

        # Placeholders opened and closed inside this paragraph.
        for placeholder in re.findall(r'<[^>]*>', paragraph.text):
            _register_placeholder("paragraphs", current_section, placeholder)
            originalpositions["paragraphs"].append({
                "section_header": current_section,
                "placeholder_text": placeholder,
                "paragraph_id": index,
                "original_text": placeholder,
                "keep_original_text": False})

        startingindex, unclosedstarttext, runningparagraph = runningparagraphfunc(
            paragraph.text, runningparagraph, unclosedstarttext, startingindex, index)

        # The paragraph closes a placeholder: it has a '>' with no '<' before it.
        # NOTE(review): this also fires for a stray '>' when no placeholder is open,
        # recording an entry with empty placeholder text — confirm whether intended.
        if re.search(r'^[^<]*>', paragraph.text):
            _register_placeholder("paragraphs", current_section, runningparagraph)
            originalpositions["paragraphs"].append({
                "section_header": current_section,
                "placeholder_text": runningparagraph,
                "paragraph_id": startingindex,
                "original_text": unclosedstarttext,
                "keep_original_text": True})
            runningparagraph = ""

    # ---- Table cell headers: every cell without a complete placeholder counts as a header ----
    cellheadermetadata = pd.DataFrame()
    cellheaderlist = []
    tableindexlist = []
    rowindexlist = []
    cellindexlist = []

    for table_index, table in enumerate(doc.tables):
        for row_index, row in enumerate(table.rows):
            for cell_index, cell in enumerate(row.cells):
                cellallcontent = ''
                for cell_paragraph in cell.paragraphs:
                    cellallcontent = (cellallcontent+'\n'+cell_paragraph.text).strip('\n')
                if not re.findall(r'<[^>]*>', cellallcontent):
                    cellheaderlist.append(cellallcontent)
                    tableindexlist.append(table_index)
                    rowindexlist.append(row_index)
                    cellindexlist.append(cell_index)
    cellheadermetadata['table index'] = pd.Series(tableindexlist)
    cellheadermetadata['row index'] = pd.Series(rowindexlist)
    cellheadermetadata['cell index'] = pd.Series(cellindexlist)
    cellheadermetadata['cell header metadata'] = pd.Series(cellheaderlist)

    # ---- Table placeholders ----
    prevsectionheading = ''
    for table_index, table in enumerate(doc.tables):
        # Walk backwards from the table until an element whose text is a known heading.
        sectionheading = ''
        inputelement = table._element
        while sectionheading not in headinglist:
            inputelement = inputelement.getprevious()
            if inputelement is None:
                # No heading precedes this table: fall back to the previous table's
                # heading and stop. (Previously this case never left the loop.)
                sectionheading = prevsectionheading
                break
            try:
                sectionheading = Paragraph(inputelement, inputelement.getparent()).text
            except Exception:
                # Element could not be read as a paragraph: try the previous table's
                # heading; the walk continues if that is not a known heading either.
                sectionheading = prevsectionheading
        prevsectionheading = sectionheading

        for row_index, row in enumerate(table.rows):
            for cell_index, cell in enumerate(row.cells):
                runningparagraph = ""
                unclosedstarttext = ""
                startingindex = 0

                # Closest header cell above (same column) and to the left (same row).
                # These depend only on the cell, so they are computed once per cell.
                try:
                    columnheader = cellheadermetadata.loc[(cellheadermetadata['row index']<row_index)&(cellheadermetadata['cell index']==cell_index)&(cellheadermetadata['table index']==table_index),'cell header metadata'].tail(1).values[0]
                except Exception:
                    columnheader = ''
                try:
                    rowheader = cellheadermetadata.loc[(cellheadermetadata['cell index']<cell_index)&(cellheadermetadata['row index']==row_index)&(cellheadermetadata['table index']==table_index),'cell header metadata'].tail(1).values[0]
                except Exception:
                    rowheader = ''

                # Build "section>column header>row header", skipping parts that merely
                # repeat the section heading (or, for the row header, the column header).
                section_key = sectionheading.lower().replace(' ','')
                column_key = columnheader.lower().replace(' ','')
                row_key = rowheader.lower().replace(' ','')
                columnheaderval = columnheader.strip() if column_key != section_key else ''
                rowheaderval = rowheader.strip() if row_key != section_key and row_key != column_key else ''
                final_section_header = (sectionheading.strip()+'>'+(columnheaderval+'>'+rowheaderval).strip('>')).strip('>')
                location = {"table_id": table_index, "row_id": row_index, "cell_id": cell_index}

                for cell_paragraph in cell.paragraphs:
                    # Placeholders opened and closed inside this cell paragraph.
                    for placeholder in re.findall(r'<[^>]*>', cell_paragraph.text):
                        _register_placeholder("tables", final_section_header, placeholder)
                        originalpositions["tables"].append({
                            "section_header": final_section_header,
                            "placeholder_text": placeholder,
                            "location": dict(location),
                            "original_text": placeholder,
                            "keep_original_text": False})

                    startingindex, unclosedstarttext, runningparagraph = runningparagraphfunc(
                        cell_paragraph.text, runningparagraph, unclosedstarttext, startingindex, index=0)

                    # The cell paragraph closes a placeholder opened in an earlier paragraph.
                    if re.search(r'^[^<]*>', cell_paragraph.text):
                        _register_placeholder("tables", final_section_header, runningparagraph)
                        originalpositions["tables"].append({
                            "section_header": final_section_header,
                            "placeholder_text": runningparagraph,
                            "location": dict(location),
                            "original_text": unclosedstarttext,
                            "keep_original_text": True})
                        runningparagraph = ""

    # ---- Split the placeholder lists into chunks that respect the token budget ----
    for kind in ("paragraphs", "tables"):
        token_count = count_tokens(json.dumps(placeholders[kind]))
        if token_count > max_tokens_per_chunk:
            n = ceil(token_count / max_tokens_per_chunk)
            print(f"Warning: Token count for {kind} is {token_count} and thus higher than the specified max ({max_tokens_per_chunk}). Splitting {kind} into {n} chunks, retrieve them by using placeholders['{kind}']['chunks'][n] or increase the max_tokens_per_chunk parameter.")
            placeholders[kind] = {"chunks": list(slice_list(placeholders[kind], n))}
        else:
            placeholders[kind] = {"chunks": [placeholders[kind]]}

    return doc, placeholders, originalpositions

In [0]:
#########################################
#     Core Functions for Word/docx      #
#########################################

def write_back_to_document(documentinput, placeholders, originalpositions, verbose=False):
  """Write generated replacement texts into a fresh copy of a Word document.

  Args:
    documentinput: document object exposing ``save(path)``; it is saved to
      disk and re-opened so the edits happen on a separate copy.
    placeholders: dict with ``["paragraphs"]["chunks"]`` and
      ``["tables"]["chunks"]``; every chunk element provides
      ``section_header``, ``placeholder_text`` and ``replacement_text``.
    originalpositions: dict with ``"paragraphs"`` and ``"tables"`` lists whose
      entries locate each placeholder (``paragraph_id`` or
      ``location.table_id/row_id/cell_id``) together with ``original_text``
      and ``keep_original_text``.
    verbose: print one line per performed replacement.

  Returns:
    The re-opened document with all replacements applied.
  """
  # deepcopy(documentinput) does not work here: saving the deep copy writes the
  # previous, unmodified content. Round-tripping through a file is the workaround.
  clone_path = "tmp_clone.docx"
  documentinput.save(clone_path)
  document = Document(clone_path)

  def matching_positions(kind, chunkelem):
    # All recorded positions that belong to this placeholder (same header and text).
    return [
      pos for pos in originalpositions[kind]
      if pos['section_header']==chunkelem["section_header"]
      and pos['placeholder_text']==chunkelem["placeholder_text"]
    ]

  def build_replacement(chunkelem, pos):
    # Either the generated text alone, or the generated text followed by the original.
    if pos['keep_original_text']==False:
      return chunkelem['replacement_text']
    return chunkelem['replacement_text']+'\n\n'+pos["original_text"]

  ## Paragraph placeholders
  for chunk in placeholders["paragraphs"]["chunks"]:
    for chunkelem in chunk:
      for pos in matching_positions("paragraphs", chunkelem):
        new_text = build_replacement(chunkelem, pos)
        paragraph_id = pos["paragraph_id"]
        old_text = pos["original_text"]
        paragraph = document.paragraphs[paragraph_id]
        paragraph.text = paragraph.text.replace(old_text, new_text)
        if verbose: print(f"[Paragraph {paragraph_id}] Replaced '{old_text}' with '{new_text}'")

  ## Table placeholders
  for chunk in placeholders["tables"]["chunks"]:
    for chunkelem in chunk:
      for pos in matching_positions("tables", chunkelem):
        new_text = build_replacement(chunkelem, pos)
        location = pos["location"]
        table_id, row_id, cell_id = location["table_id"], location["row_id"], location["cell_id"]
        old_text = pos["original_text"]
        cell = document.tables[table_id].rows[row_id].cells[cell_id]
        cell.text = cell.text.replace(old_text, new_text)
        if verbose: print(f"[Table {table_id} - Row {row_id} Column {cell_id}] Replaced '{old_text}' with '{new_text}'")

  return document

In [0]:
## Create a list of headings from a filepath; used primarily in validation testing

def create_heading_list(filepath):
  """Return the texts of all non-blank heading paragraphs of a Word document.

  A paragraph counts as a heading when its style name starts with 'Heading'.
  """
  return [
    para.text
    for para in Document(filepath).paragraphs
    if para.style.name.startswith('Heading') and para.text.strip() != ""
  ]

In [0]:
## split text to df based on headers, primarily used in validation

def split_text_to_df(text, header_titles):
    """Split a text into sections keyed by the headers found in it.

    The text is cut at blank lines ("\\n\\n"). Every piece that equals one of
    `header_titles` opens a new section; all other non-empty pieces are
    appended (separated by a blank line) to the section currently open.
    Pieces that appear before the first header are discarded.

    Args:
        text: full document text.
        header_titles: collection of header strings to split on.

    Returns:
        DataFrame with the columns 'Headers' and 'Sections', one row per
        header occurrence. The frame is empty when no header occurs in `text`.
    """
    headers = []
    section_texts = []
    current_text = ""
    for item in text.split("\n\n"):
        if item == "":
            continue
        if item in header_titles:
            # Close the previous section, but only if one has been opened.
            if headers:
                section_texts.append(current_text)
            headers.append(item)
            current_text = ""
        else:
            current_text = (current_text + "\n\n" + item).strip("\n")

    # Only close the trailing section when a header was seen; otherwise both
    # lists stay empty and the DataFrame columns keep equal length.
    if headers:
        section_texts.append(current_text)
    return pd.DataFrame({'Headers': headers, 'Sections': section_texts})

In [0]:
def remove_dups_from_text(text1, text2):
  """Drop the lines that two texts have in common.

  Both texts are split at "\\n". A line is kept when it is blank (empty or
  whitespace only) or when it does not occur in the other text. The kept
  lines are re-joined with "\\n" and leading/trailing newlines are removed.

  Returns:
    Tuple ``(text1_without_shared_lines, text2_without_shared_lines)``.
  """
  lines1 = text1.split("\n")
  lines2 = text2.split("\n")

  def unique_lines(own, other):
    kept = [line for line in own if line.strip()=="" or line not in other]
    return "\n".join(kept).strip("\n")

  return unique_lines(lines1, lines2), unique_lines(lines2, lines1)

##### 6. Functions for Retrieving Indexes

In [0]:
def read_index_func(index_dir,index_name,llm=llm,chain_type="stuff"):
  """Fetch a pickled vector store from the Workbench and wrap it in a RetrievalQA chain.

  Args:
    index_dir: local folder the index file is copied into (created if missing).
    index_name: file name of the pickled index.
    llm: language model used by the chain.
    chain_type: chain type handed to ``RetrievalQA.from_chain_type``.

  Returns:
    Tuple ``(index_fdd, db_fdd)``: the RetrievalQA chain and the unpickled store.
  """
  ## Create folder structure if not already present
  os.makedirs(index_dir, exist_ok=True)

  db_ws.CopyFileFromWorkbench("file:"+index_dir, filename=index_name)

  # NOTE(security): pickle.load can execute arbitrary code; only load index
  # files that come from a trusted Workbench location.
  with open(os.path.join(index_dir,index_name), "rb") as f:
    db_fdd = pickle.load(f)

  # Honour the caller's chain_type instead of a hard-coded "stuff".
  index_fdd = RetrievalQA.from_chain_type(llm=llm, chain_type=chain_type, retriever=db_fdd.as_retriever())
  return index_fdd, db_fdd

In [0]:
def read_index_func_params(index_dir,index_name,llm=llm,chain_type="stuff",search_type="similarity_score_threshold",score_threshold=.5,k=4):
  """Like ``read_index_func``, but with a configurable retriever.

  Args:
    index_dir: local folder the index file is copied into (created if missing).
    index_name: file name of the pickled index.
    llm: language model used by the chain.
    chain_type: chain type handed to ``RetrievalQA.from_chain_type``.
    search_type: retriever search type passed to ``as_retriever``.
    score_threshold: value placed in ``search_kwargs["score_threshold"]``.
    k: value placed in ``search_kwargs["k"]``.

  Returns:
    Tuple ``(index_fdd, db_fdd)``: the RetrievalQA chain and the unpickled store.
  """
  ## Create folder structure if not already present
  os.makedirs(index_dir, exist_ok=True)

  db_ws.CopyFileFromWorkbench("file:"+index_dir, filename=index_name)

  # NOTE(security): pickle.load can execute arbitrary code; only load index
  # files that come from a trusted Workbench location.
  with open(os.path.join(index_dir,index_name), "rb") as f:
    db_fdd = pickle.load(f)

  retriever = db_fdd.as_retriever(
    search_type=search_type,
    search_kwargs={"score_threshold": score_threshold, "k": k},
  )
  # Honour the caller's chain_type instead of a hard-coded "stuff".
  index_fdd = RetrievalQA.from_chain_type(llm=llm, chain_type=chain_type, retriever=retriever)
  return index_fdd, db_fdd

##### Functions for Creating Whoosh Indexes

In [0]:
def read_documents(zip_filename, zip_dir, blacklist_docs=[]):
  """Load the Word documents of an archive by delegating to ``parse_fsds``.

  ``zip_filename`` is forwarded as ``kg_structure_name``, ``zip_dir`` as
  ``kg_directory`` and ``blacklist_docs`` unchanged. The list is only passed
  on here, never modified.
  """
  print("Parsing")
  return parse_fsds(
    blacklist_docs=blacklist_docs,
    kg_structure_name=zip_filename,
    kg_directory=zip_dir,
  )

def read_documents_transcripts(doc_path, combine=True, heading=True):
  """Load meeting transcripts as langchain documents.

  The transcripts are restructured by ``parse_transcripts``, written to an
  intermediate JSON file and read back through ``JSONLoader`` so that each
  entry's ``filename`` ends up in the document metadata as ``transcript_name``.

  Args:
    doc_path: path of the transcript JSON file.
    combine: forwarded to ``parse_transcripts``.
    heading: forwarded to ``parse_transcripts``.

  Returns:
    The documents produced by ``JSONLoader.load()``.
  """
  ## Read the transcripts
  transcripts_new = parse_transcripts(doc_path,combine=combine,heading=heading)

  ## Quick intermediate step - Write restructured back to JSON (needed for the JSONLoader below)
  intermediate_path = "/tmp/SAP/cleansed_transcript_new_restructured.json"
  # The target folder is not guaranteed to exist yet; open() would fail without it.
  os.makedirs(os.path.dirname(intermediate_path), exist_ok=True)
  with open(intermediate_path, 'w') as f:
    json.dump(transcripts_new, f)

  ## Required to retrieve filename as metadata
  def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["transcript_name"] = record.get("filename")
    return metadata

  ## Load restructured JSON into langchain (create documents)
  loader = JSONLoader(
    file_path=intermediate_path,
    jq_schema='.[]',
    content_key="text",
    metadata_func=metadata_func)
  return loader.load()

def chunk_documents(documents, chunk_size=5000, chunk_overlap=1000, heading=True):
  """Split documents into overlapping chunks, optionally prefixed with the document name.

  Args:
    documents: documents accepted by ``RecursiveCharacterTextSplitter.split_documents``.
    chunk_size: forwarded to the splitter.
    chunk_overlap: forwarded to the splitter.
    heading: when true, every chunk is prefixed with
      ``'Document Name: <name>\\n\\n'``, where ``<name>`` is the last
      '/'-separated component of ``metadata['source']`` without its extension.

  Returns:
    The list of chunks.
  """
  print("Chunking")
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) #TokenTextSplitter
  texts = text_splitter.split_documents(documents)
  if heading:
    for chunk in texts:
      # Strip the extension from the file name only. Searching for '.' in the
      # whole path cut off the last character when there was no extension and
      # was misled by dots in directory names.
      basename = chunk.metadata['source'].split('/')[-1]
      stem, dot, _extension = basename.rpartition('.')
      source_filename = stem if dot else basename
      chunk.page_content = 'Document Name: '+source_filename+'\n\n'+chunk.page_content
  return texts

In [0]:
def create_embedded_index(documents,index_dir,index_name,embedding_type='OpenAI',index_type='FAISS', save_to_workbench=True):
  """Embed documents into a vector store and optionally persist it.

  Args:
    documents: documents to embed.
    index_dir: local folder for the pickled index (created if missing).
    index_name: file name of the pickled index.
    embedding_type: 'HuggingFace' or 'OpenAI' (case and spaces are ignored).
    index_type: 'FAISS' (case and spaces are ignored).
    save_to_workbench: when false, or when ``index_dir``/``index_name`` is
      None, the store is only returned and nothing is written.

  Returns:
    The created vector store.

  Raises:
    ValueError: if ``embedding_type`` or ``index_type`` is not supported.
  """
  embedding_key = embedding_type.replace(' ','').lower()
  index_key = index_type.replace(' ','').lower()

  # Validate before any expensive work; unsupported values previously fell
  # through and failed later with an unrelated NameError/UnboundLocalError.
  if embedding_key not in ('huggingface', 'openai'):
    raise ValueError(f"Unsupported embedding_type '{embedding_type}'. Supported values: 'HuggingFace', 'OpenAI'.")
  if index_key != 'faiss':
    raise ValueError(f"Unsupported index_type '{index_type}'. Supported values: 'FAISS'.")

  print("Creating embeddings")
  # create embeddings
  if embedding_key == 'huggingface':
    embeddings = HuggingFaceEmbeddings()
  else:
    embeddings = OpenAIEmbeddings(
      deployment=deployment_id_ada,
      model="text-embedding-ada-002",
      openai_api_key=openai_api_key,
      chunk_size=1
    )

  # create vector store
  print("Embedding text")
  db = FAISS.from_documents(documents, embeddings)

  if not save_to_workbench or index_dir is None or index_name is None:
    return db

  print("Saving index to file")
  ## Save in local directory
  os.makedirs(index_dir, exist_ok=True)
  with open(os.path.join(index_dir,index_name), "wb") as f:
    pickle.dump(db,f)
  db_ws.CopyFileToWorkbench(filename=index_name, DatabricksFolder="file:"+index_dir)
  print('Index'+index_name+' pickled and copied to Workbench')
  return db

In [0]:
class DocumentIndexWhoosh():
  """Keyword (Whoosh) index over a list of document chunks.

  Every chunk is stored with its source path as ``title``, its text as
  ``content`` and the same text again in the stemmed ``content_stem`` field.
  """

  def __init__(self, documents, index_path_out, stem=True):
    """Build the index on disk and prepare the query parser.

    Args:
      documents: objects with ``page_content`` and ``metadata["source"]``.
      index_path_out: index folder; an existing folder is deleted and recreated.
      stem: parse queries against ``content_stem`` (True) or ``content`` (False).
    """
    ## Create schema for indexing
    stem_ana = StemmingAnalyzer()
    schema = Schema(title=ID(stored=True), content=TEXT(stored = True), content_stem=TEXT(analyzer=stem_ana))

    ## Create index (always start from an empty folder)
    if os.path.exists(index_path_out):
      shutil.rmtree(index_path_out)
    os.makedirs(index_path_out)
    ix = create_in(index_path_out, schema)
    writer = ix.writer()

    ## Fill index with documents
    for document in documents:
      text = document.page_content
      writer.add_document(title=document.metadata["source"], content=text, content_stem=text)
    writer.commit()
    self._index = ix

    ## Configure parser to parse search queries
    search_field = "content_stem" if stem else "content"
    parser = QueryParser(search_field, self._index.schema, group=OrGroup.factory(0.9))
    # using OrGroup.factory() with a high value adds a bonus if multiple query words are retrieved (see https://whoosh.readthedocs.io/en/latest/parsing.html)
    parser.add_plugin(FuzzyTermPlugin())
    self._parser = parser

  def search(self, query, k=5, stem=True, fuzzy_dist=0):
    """Run a fuzzy keyword search.

    Args:
      query: free-text query.
      k: maximum number of hits (``limit`` of the Whoosh search).
      stem: tokenize the query with ``StemmingAnalyzer`` (True) or split it at spaces (False).
      fuzzy_dist: edit distance appended to every query term as ``~<fuzzy_dist>``.

    Returns:
      The Whoosh results object, or ``""`` if the search raised an exception.
    """
    if stem:
      ana = StemmingAnalyzer()
      query = " ".join([token.text + "~" + str(fuzzy_dist) for token in ana(query)]) #'~1' says that each word can be one character off (fuzzy search)
    else:
      query = " ".join([word + "~" + str(fuzzy_dist) for word in query.split(" ")])
    try:
      # The searcher is deliberately left open: the returned results object
      # still reads from it, and closing it here would lead to a
      # "ReaderClosed" exception when the caller accesses the hits.
      searcher = self._index.searcher()
      parsed_query = self._parser.parse(query)
      return searcher.search(parsed_query, terms=True, limit=k)
    except Exception as err:
      print("Error in searching whoosh index. Returning empty string (''). Traceback: " + str(err))
      return ""

  def search_and_summarize(self, search_term, k=3, stem=True, fuzzy_dist=0):
    """Search the index and let GPT summarize the top hits for a requirement.

    Args:
      search_term: requirement text; used as query and quoted in the prompt.
      k: number of top hits handed to the model.
      stem: forwarded to ``search``.
      fuzzy_dist: forwarded to ``search``.

    Returns:
      The model response, or the string ``"None"`` if anything failed.
    """
    res = "None"
    try:

      # Get initial results from whoosh search. k is forwarded so that more
      # than the search default of 5 hits can be requested.
      whoosh_results = self.search(search_term, k=k, stem=stem, fuzzy_dist=fuzzy_dist)

      # avoid index errors (e.g. when we specify top 5, but whoosh only retrieves 3 docs)
      k = min(k, len(whoosh_results))

      # concatenate top k results from the whoosh search in plain text
      top_k_docs = [entry["content"] for entry in whoosh_results[:k]]

      # have GPT summarize the whoosh-retrieved documents (with a focus on the search keywords)
      summary_prompt = f"""
      What information from the documents below can be leveraged to build the following requirement '{search_term}'. Elaborate as much as possible. Documents:
      '''{top_k_docs}'''
      """
      res = gpt4(summary_prompt, large=True, max_tokens=10000, tries=3, temperature=.2)

    except Exception as err:
      log("[Whoosh Retrieval] Error in searching documents using whoosh & GPT. Returning 'None'. Traceback: ")
      print(err)

    # Returned after the handler rather than from a `finally` block, so that
    # KeyboardInterrupt/SystemExit are not silently swallowed.
    return res
    
## An existing index could also be loaded via ix = open_dir("indexdir") (open_dir is imported from whoosh.index)

##### Functions for creating a combined Whoosh & FAISS index

In [0]:
class CombinedIndex():
  """Combines a FAISS vector index and a ``DocumentIndexWhoosh`` keyword index."""

  def __init__(self, faiss_index, whoosh_index):
    """Store both indexes and verify that they contain the same chunks.

    Raises:
      ValueError: if the chunk count or any chunk text differs.
    """
    self._faiss_index = faiss_index
    self._whoosh_index = whoosh_index
    self._validate_chunk_equality()

  def _validate_chunk_equality(self):
    """Check that Whoosh and FAISS hold identical chunk texts in identical order."""
    # The searcher is only needed to read the stored documents, so close it afterwards.
    with self._whoosh_index._index.searcher() as searcher:
      chunks_whoosh = [doc["content"] for doc in searcher.documents()]
    chunks_faiss = [doc.page_content for doc in self._faiss_index.docstore._dict.values()]

    if len(chunks_whoosh) != len(chunks_faiss): raise ValueError("Whoosh and FAISS have do not have the same amount of chunks")
    for i, (chunk_whoosh, chunk_faiss) in enumerate(zip(chunks_whoosh, chunks_faiss)):
      if chunk_whoosh != chunk_faiss: raise ValueError("Whoosh and FAISS chunk text different for chunk number " + str(i))

  def search_both_indexes(self, query, k=5, verbose=False):
    """Query both indexes and merge the hits.

    Args:
      query: free-text query.
      k: maximum number of hits requested from each index.
      verbose: log when an index returns fewer than k hits or duplicates were removed.

    Returns:
      List of unique chunk texts, interleaved as
      [whoosh_1, faiss_1, whoosh_2, faiss_2, ...]; empty texts are dropped.
    """
    whoosh_results_raw = self._whoosh_index.search(query, k=k)
    k_whoosh = min(k, len(whoosh_results_raw)) # avoid index errors (e.g. when we specify top 5, but whoosh only retrieves 3 docs)
    if verbose and k_whoosh < k: log(f"[Combined Index Lookup] Whoosh only retrieved k={k_whoosh} results. You specified k={k}.")
    whoosh_results = [entry["content"] for entry in whoosh_results_raw[:k_whoosh]]

    faiss_results_raw = self._faiss_index.similarity_search(query, k)
    k_faiss = min(k, len(faiss_results_raw)) # same guard for FAISS
    if verbose and k_faiss < k: log(f"[Combined Index Lookup] FAISS only retrieved k={k_faiss} results. You specified k={k}.")
    faiss_results = [entry.page_content for entry in faiss_results_raw[:k_faiss]]

    combined_results = [x for x in itertools_chain.from_iterable(zip_longest(whoosh_results,faiss_results)) if x]
    # complicated, but needed to preserve scoring order. essentially, this constructs an intertwined list aka [whoosh_1, faiss_1, whoosh_2, faiss_2, ...]
    # instead of a naive append ([whoosh_1, whoosh_2, ... , faiss_1, faiss_2] or vice versa).
    # note that this prioritizes whoosh over faiss for the first result - change to zip_longest(faiss_results, whoosh_results) to reverse

    # Order-preserving de-duplication; the set makes the membership test O(1).
    seen = set()
    duplicates = set()
    unique_results = []
    for x in combined_results:
      if x in seen:
        duplicates.add(x)
      else:
        seen.add(x)
        unique_results.append(x)
    if verbose and len(duplicates) > 0: log(f"[Combined Index Lookup] FAISS and Whoosh returned partially same results. Removed {len(duplicates)} duplicates.")

    return unique_results

  def search_and_summarize(self, query, k=5, verbose=False):
    """Search both indexes and let GPT summarize the hits for a requirement.

    Returns:
      The model response, or the string ``"None"`` if anything failed.
    """
    res = "None"
    try:

      # Get initial results from both indexes
      results = self.search_both_indexes(query=query, k=k, verbose=verbose)

      # have GPT summarize the retrieved documents (with a focus on the search keywords)
      summary_prompt = f"""
      For each of the documents below, list all detailed granular information that can be leveraged to build the requirement '{query}'. Response should be each document name followed by relevant info. Elaborate as much as possible. Documents:
      '''{results}'''
      """
      res = gpt4(summary_prompt, large=True, max_tokens=5000, tries=3, temperature=.2)

    except Exception as err:
      log("[Combined Index Lookup] Error in searching meeting transcripts using the combined Whoosh & FAISS index. Returning 'None'. Traceback: ")
      print(err)

    # Returned after the handler rather than from a `finally` block, so that
    # KeyboardInterrupt/SystemExit are not silently swallowed.
    return res

##### 7. Functions for Retrieving Knowledge Base

In [0]:
def read_knowledge_base(kb_dir, kb_name):
  """Copy a JSON knowledge base from the Workbench into `kb_dir` and return its parsed content."""
  ## Create folder structure if not already present
  folder_missing = not os.path.exists(kb_dir)
  if folder_missing:
    os.makedirs(kb_dir)

  db_ws.CopyFileFromWorkbench("file:"+kb_dir, filename=kb_name)

  local_file = os.path.join(kb_dir, kb_name)
  with open(local_file, "r") as handle:
    return json.load(handle)

##### 8. Functions for Parsing Transcripts

In [0]:
def parse_transcripts(doc_path,combine=True,heading=False):
  """Read a transcript JSON file and flatten it into filename/text records.

  The file maps each transcript name to a list of text segments.

  Args:
    doc_path: path of the JSON file.
    combine: True joins all segments of a transcript with blank lines into one
      record; False emits one record per segment, named
      '<transcript> chunk <i>' (1-based).
    heading: prefix every text with 'Document Name: <transcript>' and a blank line.

  Returns:
    List of dicts with the keys 'filename' and 'text'.
  """
  ## Load JSON into memory (read as text first, then parse)
  with open(doc_path, "r") as handle:
    transcripts = json.loads(handle.read())

  ## Restructure the JSON for further processing
  restructured = []
  for name in transcripts:
    prefix = 'Document Name: '+name+'\n\n' if heading else ''
    segments = transcripts[name]
    if combine:
      restructured.append({"filename" : name, "text" : prefix+"\n\n".join(segments)})
    else:
      restructured.extend(
        {"filename" : name+' chunk '+str(position), "text" : prefix+segment}
        for position, segment in enumerate(segments, start=1)
      )

  return restructured

In [0]:
def read_transcripts(transcript_path, txt_transcripts):
  """Read a transcript JSON file and return one combined record per transcript.

  The file `txt_transcripts` inside `transcript_path` maps each transcript
  name to a list of text segments; the segments are joined with blank lines.

  Returns:
    List of dicts with the keys 'filename' and 'text'.
  """
  source_file = os.path.join(transcript_path,txt_transcripts)
  with open(source_file, "r") as handle:
    transcripts = json.loads(handle.read())

  return [
    {"filename" : name, "text" : "\n\n".join(transcripts[name])}
    for name in transcripts
  ]


def requirements_matching(df_requirements, transcripts_new, requirements_chunk_size=5000):
  """Ask the model which requirements relate to each pending transcript.

  For every row of the control table whose 'status' is '', the requirement
  names (column 'Requirement Cleansed', split into chunks if their token count
  exceeds `requirements_chunk_size`) are sent to the model together with the
  transcript text. The raw reply is stored in the row's 'response' cell; after
  the last chunk has been parsed, the cell is replaced by the JSON dump of all
  parsed matches and 'status' is set to 'complete'.

  Args:
    df_requirements: DataFrame with a 'Requirement Cleansed' column.
    transcripts_new: currently unused.
    requirements_chunk_size: maximum token count per requirement chunk.

  NOTE(review): the loop reads and writes the module-level
  `req_trans_control_table` and uses the module-level `temperature`; both must
  exist before this function is called. Nothing is returned.
  NOTE(review): a chunk whose reply cannot be parsed is only reported; the row
  is still marked 'complete' if the last chunk parses - confirm this is intended.
  """
  ## write the prompt
  prompt =  """###Start of listing of requirement names
  {requirement_names}
  ###End of listing of requirement names
 
  ###Start of transcript
  {transcript_chunk}
  ###End of transcript
 
  From the above requirement names and transcript, list the requirement names that may have some relation to portions of the transcript content using the JSON example template below, value of 'requirement name' element must be selected from listing of requirement names above (don't change the original requirement name value, don't need to return requirement names that did not find any relevant transcript content), the 'portions of transcript content' elements should be populated by information from the transcript section above only (never populate from listing of requirement names), only return the JSON list as your response, data type of 'portions of transcript content' element must be text, ensure character escape is implemented where applicable so that json.loads() can read your response (always escape '\' with '/\' instead of '\\'):
 
  [{"requirement name": "ABC", "portions of transcript content": "UVW"},
  {"requirement name": "DEF", "portions of transcript content": "XYZ"}]"""

  # Currently unused: system context intended for the alternative gpt4 call.
  projectbackground = "This project is an SAP S4 transformation focusing on finance, using SAP's fit to standard approach."
  systemprompt = "You specialize in mapping portions of transcripts to requirement names for projects. " + projectbackground

  list_requirements = df_requirements['Requirement Cleansed'].tolist()
  token_count_requirement=count_tokens(str(list_requirements))

  if token_count_requirement > requirements_chunk_size:
    n = ceil(token_count_requirement / requirements_chunk_size)
    print(f"Warning: Token count for requirement list is {token_count_requirement} and thus higher than the specified max ({requirements_chunk_size}). Splitting requirement list into {n} chunks.")
    requirement_chunks = list(slice_list(list_requirements, n))
  else:
    requirement_chunks = [list_requirements]
  chunk_count = len(requirement_chunks)

  ## loop through all transcripts that have not been processed yet
  pending = req_trans_control_table[req_trans_control_table['status']=='']
  for index,transcript in pending.iterrows():
    transcript_response=[]
    for chunk_index,chunk in enumerate(requirement_chunks):
      chunk_requirements_list='\n\n'.join(chunk)
      current_prompt = prompt.replace("{requirement_names}", chunk_requirements_list).replace("{transcript_chunk}", str(transcript['text']))
      # The reply must be bound to `response`: it was previously assigned to
      # `res`, so every later use of `response` hit an undefined or stale name.
      response = gpt35(prompt=current_prompt, temperature=temperature, max_tokens=8000, tries=3)
      req_trans_control_table.loc[index,'response']=response

      try:
        # ValueError covers json.JSONDecodeError; TypeError covers a non-string
        # reply or a reply that is not a JSON list.
        transcript_response=transcript_response+json.loads(response)
      except (ValueError, TypeError):
        print(f"""JSON conversion failed:
              {response}""")
        continue

      if chunk_index+1==chunk_count:
        req_trans_control_table.loc[index,'response']=json.dumps(transcript_response)
        req_trans_control_table.loc[index,'status']='complete'
      # assumes an integer index on the control table - TODO confirm
      print("transcript "+str(index+1)+" chunk "+str(chunk_index+1)+" processed...")


In [0]:
def read_requirements_matching(path, filename):
  """Copy an Excel file from the Workbench into `path` and load it as a DataFrame."""
  ## Create folder structure if not already present
  folder_missing = not os.path.exists(path)
  if folder_missing:
    os.makedirs(path)

  db_ws.CopyFileFromWorkbench("file:"+path, filename=filename)

  local_file = os.path.join(path,filename)
  return pd.read_excel(local_file)

##### Functions for parsing sample FSDs

In [0]:
def parse_fsds(blacklist_docs=[],kg_structure_name='FSD',kg_directory='/SAP_KG/'):
  """Unpack and convert the sample FSD files, then load them as documents.

  Args:
    blacklist_docs: forwarded to ``unzip_and_convert`` as ``black_list_docs``.
    kg_structure_name: name of the folder that holds the files.
    kg_directory: parent directory of that folder.

  Returns:
    The documents loaded by ``DirectoryLoader`` from
    ``<kg_directory>/<kg_structure_name>``.
  """
  ## The helpers expect a list of folder names; only one folder is used here
  folders = [kg_structure_name]

  ## Load all files and convert them to usable format
  create_directories(folders)
  unzip_and_convert(folders,kg_directory,black_list_docs=blacklist_docs,convert_verbose=True)

  loader = DirectoryLoader(os.path.join(kg_directory,kg_structure_name))
  return loader.load()

##### 9. Functions for Parsing Templates (generate placeholder dicts)

In [0]:
def read_templates(input_folder, max_tokens_per_chunk=800):
  """Parse every template listed in the mapping table.

  The distinct values of ``df_mapping['Template Name']`` are used as template
  names ('.docx' is appended when a name has no file suffix). Each template is
  copied from the Workbench into `input_folder` and parsed with ``read_document``.

  Args:
    input_folder: local folder (with trailing separator) for the template files.
    max_tokens_per_chunk: forwarded to ``read_document``.

  Returns:
    Dict mapping each template name to
    ``{'placeholders': ..., 'docs': ..., 'originalpositions': ...}``.
    Templates that could not be copied or parsed are reported and left out.
  """
  templatedict={}
  #Read Template Docs
  for row in df_mapping['Template Name'].drop_duplicates():
    templatefilename = row if pathlib.Path(row).suffix else row+'.docx'
    try:
      print(f"\nParsing '{templatefilename}'")
      db_ws.CopyFileFromWorkbench(DatabricksFolder="file:"+input_folder, filename=templatefilename)
      doc, placeholders, originalpositions = read_document(input_folder+templatefilename,max_tokens_per_chunk=max_tokens_per_chunk)
    except Exception as err:
      print('Template ' + templatefilename+' not found on Workbench! Please verify template is uploaded and template name is correct in '+filename_mapping)
      print(err)
      # Skip this template: storing an entry here would either fail on
      # undefined names or silently reuse the previous template's results.
      continue
    templatedict[row]={'placeholders':placeholders,'docs':doc,'originalpositions':originalpositions}

  return templatedict

In [0]:
def clean_transcript_match(dirty_df):
  """Return the 'response' column of `dirty_df` as a single-column DataFrame."""
  # A Series has no `.DataFrame()` method; `to_frame()` is the conversion.
  return dirty_df['response'].to_frame()


##### 10. Decorators and Control Logic

In [0]:
def keep_trying(func):
  """Decorator that reruns `func` until it returns a truthy value.

  The wrapped function is called with the original arguments in an endless
  loop. A truthy return value ends the loop; a falsy return value or any
  ``Exception`` triggers an immediate rerun (there is no delay and no retry
  limit). The wrapper itself returns None.
  """
  import functools  # local import: only needed to build the wrapper

  @functools.wraps(func)  # keep the name/docstring of the decorated function
  def wrapper(*args, **kwargs):
    log("[CONTROL] INITIALIZING - Running main function body in 'keep_trying' mode.", color="blue")
    while True:
      try:
        finished = func(*args, **kwargs)
        if finished:
          log("[CONTROL] SUCCESS - Main function body terminated as per exit criteria.", color="green")
          break
        else:
          log("[CONTROL] [RERUN] Rerunning indefinitely as 'keep_trying' is on. Rerunning now.", color="blue")
      except Exception as err:
        log("[CONTROL] [CRITICAL ERROR] Error in main function body. Traceback: " + str(err), color="red")
        log("[CONTROL] [RERUN] Rerunning indefinitely as 'keep_trying' is on. Rerunning now.", color="blue")
  return wrapper

#####  11. Functions for Validation of provided documents

In [0]:
def validate_docs(source_filepath, generated_filepath, verbose= False):
  """Compare a source and a generated document section by section.

  Both files are copied from the Workbench, converted to text and split into
  sections using the headings of the approved ENH template. For every header
  present in both documents, the common prefix and all shared lines are removed
  and the cosine similarity of the remaining texts is computed.

  Args:
    source_filepath: file name of the source document.
    generated_filepath: file name of the generated document.
    verbose: additionally return the intermediate texts in the columns
      'source row processed', 'gen row processed' (only filled when a common
      prefix was removed), 'source row cleansed' and 'gen row cleansed'.

  Returns:
    DataFrame indexed by header with 'Sections_source', 'Sections_generated',
    'cos_sim' and, if verbose, the columns listed above.

  NOTE(review): relies on `sentence_transformers` and `util` being imported
  elsewhere in the notebook - confirm.
  """
  source_filename = source_filepath
  generate_filname = generated_filepath
  local_save_folder = "/tmp/SAP/"
  Folder1 = "/tmp/Input_SAP/"

  db_ws.CopyFileFromWorkbench(DatabricksFolder=local_save_folder,filename=source_filename)
  db_ws.CopyFileFromWorkbench(DatabricksFolder=local_save_folder,filename=generate_filname)

  source_doc = convert(source_filename,for_validation = True)
  generated_doc = convert(generate_filname, for_validation = True)

  header_list = create_heading_list(Folder1 + 'OGE_Functional Specification Document_ENH Template__Approved.docx')

  model = sentence_transformers.SentenceTransformer("sentence-transformers/bert-base-nli-mean-tokens")
  source_df = split_text_to_df(source_doc, header_list)
  generated_df = split_text_to_df(generated_doc, header_list)

  joined_df = source_df.set_index('Headers').join(generated_df.set_index('Headers'), lsuffix="_source", rsuffix="_generated", how= "inner")

  # Results are collected per row and assigned as whole columns afterwards:
  # the rows yielded by iterrows() are copies, so writing into them does not
  # reliably update joined_df.
  cos_sims = []
  source_processed_col, gen_processed_col = [], []
  source_cleansed_col, gen_cleansed_col = [], []

  for _, row in joined_df.iterrows():
    ## we want to clean the inputs first by taking out any parts that do not match
    source_text = row['Sections_source']
    gen_text = row['Sections_generated']
    prefix = os.path.commonprefix([source_text, gen_text])

    # With an empty prefix the slices are simply the full texts.
    source_row_processed = source_text[len(prefix):]
    gen_row_processed = gen_text[len(prefix):]
    source_processed_col.append(source_row_processed if prefix else '')
    gen_processed_col.append(gen_row_processed if prefix else '')

    source_row_cleansed,gen_row_cleansed=remove_dups_from_text(source_row_processed,gen_row_processed)
    source_cleansed_col.append(source_row_cleansed)
    gen_cleansed_col.append(gen_row_cleansed)

    ## then we embed both remainders and compare them
    source_embedding = model.encode(source_row_cleansed, convert_to_tensor=True)
    gen_embedding = model.encode(gen_row_cleansed, convert_to_tensor=True)
    cos_sims.append(util.pytorch_cos_sim(source_embedding, gen_embedding).tolist()[0][0])

  joined_df['cos_sim'] = cos_sims
  if verbose:
    joined_df['source row processed'] = source_processed_col
    joined_df['gen row processed'] = gen_processed_col
    joined_df['source row cleansed'] = source_cleansed_col
    joined_df['gen row cleansed'] = gen_cleansed_col

  return joined_df


In [0]:
def cleanup_dupes(input_df):
  """Keep only the rows whose 'cos_sim' value is below 0.999."""
  below_threshold = input_df['cos_sim'] < .999
  return input_df.loc[below_threshold]