# Setup, Inits and Function Repository

## 1. Imports

In [0]:
%run "./SAP_FDD_Env_and_Utils"

In [0]:
%run "./SAP_FDD_Core"

In [0]:
%run "./SAP_FDD_Indexing"

# Workflow for multiple documents

## 1. Read input documents (requirements list & mapping to templates)

In [0]:
## Input locations and file names

# Folders: folder_input is handed to the parsers and the batch run, folder_main to the index loader
folder_input = "/tmp/Input_SAP/"
folder_main = "/tmp/SAP"

# Files expected in folder_input
filename_mapping = "Mapping Template.xlsx"
filename_requirements = "Codes_mapping.xlsx"
filename_transcripts = "cleansed_transcript_new1.txt"

In [0]:
# Fetch the cleansed transcripts from the workbench into folder_input.
# Only required after a cluster restart.
db_ws.CopyFileFromWorkbench(
    DatabricksFolder=f"file:{folder_input}",
    filename=filename_transcripts,
)

In [0]:
## Parse requirements
# Reads the requirements workbook (filename_requirements) from folder_input.
# get_df_requirements is defined outside this notebook; the name suggests it returns an
# already-cleansed DataFrame — TODO confirm.
df_requirements_clean = get_df_requirements(folder_input, filename_requirements)
## Parse Mapping file
# Reads the requirement-to-template mapping workbook (filename_mapping) from folder_input.
df_mapping = get_df_mapping(folder_input, filename_mapping)

## 2. Load stored indexes

In [0]:
### Change these index names / locations whenever a new index version has been created!
# Naming scheme: <SOURCE>_INDEX_LATEST_VERSION_<BACKEND>.
#   *_FAISS  - pickle file name, loaded relative to folder_main by read_index_func
#   *_WHOOSH - folder path of the Whoosh index (the Whoosh loaders below are currently commented out)
#   *_ACS    - index name passed to DocumentIndexACS_v2

# FSD INDEXES
FSD_INDEX_LATEST_VERSION_FAISS = "indexstore_FSD_OpenAI_v1_chunked.pkl"
FSD_INDEX_LATEST_VERSION_WHOOSH = "/tmp/SAP/index_fsd_whoosh"
FSD_INDEX_LATEST_VERSION_ACS = "uc-sap-index-fsd"

# BPD INDEXES
BPD_INDEX_LATEST_VERSION_FAISS = "indexstore_BPD_OpenAI_v3_chunked.pkl"
BPD_INDEX_LATEST_VERSION_WHOOSH = "/tmp/SAP/index_bpd_whoosh"
BPD_INDEX_LATEST_VERSION_ACS = "uc-sap-index-bpd"

# TRANSCRIPTS INDEXES
TRANSCRIPTS_INDEX_LATEST_VERSION_FAISS = "indexstore_Transcripts_OpenAI_v6_chunked.pkl"
TRANSCRIPTS_INDEX_LATEST_VERSION_WHOOSH = "/tmp/SAP/index_transcripts_whoosh"
TRANSCRIPTS_INDEX_LATEST_VERSION_ACS = "uc-sap-index-transcripts"

# IMAGES INDEXES
# Only an ACS index is defined for images.
IMAGES_INDEX_LATEST_VERSION_ACS = "uc-sap-index-images"

In [0]:
### Load every index type for each input source (loading is efficient, should only take a few seconds)
# Per source: the FAISS store comes from read_index_func, the ACS index from DocumentIndexACS_v2.
# The Whoosh / combined loaders are kept below as disabled alternatives.

# --- FSDs ---
index_fsd_faiss, db_fsd_faiss = read_index_func(
    folder_main,
    FSD_INDEX_LATEST_VERSION_FAISS,
    llm=llm,
    chain_type="stuff",
)
# index_fsd_whoosh = DocumentIndexWhoosh().load(FSD_INDEX_LATEST_VERSION_WHOOSH)
# index_fsd_combined = CombinedIndex(db_fsd_faiss, index_fsd_whoosh)
index_fsd_acs = DocumentIndexACS_v2(
    ACS_ENDPOINT, ACS_KEY, FSD_INDEX_LATEST_VERSION_ACS
).load()

# --- BPDs ---
index_bpd_faiss, db_bpd_faiss = read_index_func(
    folder_main,
    BPD_INDEX_LATEST_VERSION_FAISS,
    llm=llm,
    chain_type="stuff",
)
# index_bpd_whoosh = DocumentIndexWhoosh().load(BPD_INDEX_LATEST_VERSION_WHOOSH)
# index_bpd_combined = CombinedIndex(db_bpd_faiss, index_bpd_whoosh)
index_bpd_acs = DocumentIndexACS_v2(
    ACS_ENDPOINT, ACS_KEY, BPD_INDEX_LATEST_VERSION_ACS
).load()

# --- Transcripts ---
index_transcripts_faiss, db_transcripts_faiss = read_index_func(
    folder_main,
    TRANSCRIPTS_INDEX_LATEST_VERSION_FAISS,
    llm=llm,
    chain_type="stuff",
)
# index_transcripts_whoosh = DocumentIndexWhoosh().load(TRANSCRIPTS_INDEX_LATEST_VERSION_WHOOSH)
# index_transcripts_combined = CombinedIndex(db_transcripts_faiss, index_transcripts_whoosh)
index_transcripts_acs = DocumentIndexACS_v2(
    ACS_ENDPOINT, ACS_KEY, TRANSCRIPTS_INDEX_LATEST_VERSION_ACS
).load()

# --- Image processing (ACS only) ---
index_images_acs = DocumentIndexACS_v2(
    ACS_ENDPOINT, ACS_KEY, IMAGES_INDEX_LATEST_VERSION_ACS
).load()

In [0]:
### Check the status for all index types and input sources - 'True' means the index is usable, 'False' indicates errors
# An index counts as usable when a search for test_query returns at least one hit.

test_query = "Test"

# Disabled checks for the FAISS / Whoosh / combined indexes (re-enable together with their loaders):
# print(f"FSD - FAISS Status: {len(db_fsd_faiss.similarity_search(test_query)) != 0}")
# print(f"FSD - Whoosh Status: {len(list(index_fsd_whoosh.search(test_query))) != 0}")
# print(f"FSD - Combined Status: {len(index_fsd_combined.search(test_query)) != 0}")
# print(f"BPD - FAISS Status: {len(db_bpd_faiss.similarity_search(test_query)) != 0}")
# print(f"BPD - Whoosh Status: {len(list(index_bpd_whoosh.search(test_query))) != 0}")
# print(f"BPD - Combined Status: {len(index_bpd_combined.search(test_query)) != 0}")
# print(f"Transcripts - FAISS Status: {len(db_transcripts_faiss.similarity_search(test_query)) != 0}")
# print(f"Transcripts - Whoosh Status: {len(list(index_transcripts_whoosh.search(test_query))) != 0}")
# print(f"Transcripts - Combined Status: {len(index_transcripts_combined.search(test_query)) != 0}")

# ACS checks, one line of output per input source, in the same order as before
for source_label, acs_index in (
    ("FSD", index_fsd_acs),
    ("BPD", index_bpd_acs),
    ("Transcripts", index_transcripts_acs),
    ("Images", index_images_acs),
):
    print(f"{source_label} - ACS Status: {len(acs_index.search(test_query)) != 0}")

In [0]:
### Choose which index types to use for each input source
# All four sources currently point at their ACS index. To switch backend, assign one of the
# other index objects loaded earlier in this notebook (e.g. the FAISS variants).

index_fsd = index_fsd_acs
index_bpd = index_bpd_acs
index_transcripts = index_transcripts_acs
index_images = index_images_acs

In [0]:
# Sanity check: show the string form of the index object selected for FSD lookups.
print(index_fsd)

In [0]:
# knowledge_base = read_knowledge_base("/tmp/SAP/", KNOWLEDGE_BASE_LATEST_VERSION)

## 3. Select requirements to be populated

Remember to clean the dataframe with `.reset_index(drop=True)`!

In [0]:
# W_SCM_001 
# ERROR - Error in GPT response. No response generated. Retrying in 60 seconds.
# ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
# [{'section_header': 'Development Object details>RICEFW Object ID:', 'placeholder_text': '<RICEFW ID>', 'replacement_text': 'W_SCM_001'}, {'section_header': 'Development Object details>RICEFW Object Name:', 'placeholder_text': '<RICEFW Name>', 'replacement_text': 'PR Approval'}, {'section_header': 'Development Object details>Workstream:', 'placeholder_text': '<Choose one of the workstream below:\n(   ) HxM\t(   ) FI            (   ) WAM      (   ) IS-U\n(   ) Tax   \t(   ) Data\t(   ) SCM (   ) Tech>', 'replacement_text': '(   ) FI'}, {'section_header': 'Development Object details>Type of development', 'placeholder_text':

# W_FIN_001
# JSON error; ran again with no problems
# F_FIN_002
#good
# F_FIN_001
#good other than *** WARNING: skipped 6263 bytes of output ***
# E_WAM_PS_017
#Invalid JSON response, then good
# E_WAM_PS_016
#looks good
# E_FIN_002
#good
# E_FIN_001
#good
# c-fin001
# good but *** WARNING: skipped 74481 bytes of output ***
# E_SCM_029
#good but *** WARNING: skipped 24733 bytes of output ***

In [0]:
## Filter which requirements to process, e.g. one of each type
# Exactly one of the alternatives below should be active; each must leave
# df_requirements_sel with a clean 0..n-1 index.

# To select specific requirement IDs.
# reset_index(drop=True) renumbers the rows without first turning the old index into a
# column, so it also works when the index is named or when the frame already has a
# column called 'index' (the previous reset_index().drop('index', axis=1) did not).
df_requirements_sel = (
    df_requirements_clean
    .query('''`RICEFW ID` in ["E_SCM_029"]''')
    .reset_index(drop=True)
)

# # To select specific requirement IDs:
# df_requirements_sel = df_requirements_clean\
#   .query('''`RICEFW ID` in ["I_WAM_T&D_012"]''')\
#   .reset_index(drop=True)

# To process only the first requirement (remove the [:1] slice for a full run):
# df_requirements_sel = df_requirements_clean[:1]

# To select n from each category
# n_per_category = 2
# counter=1
# for requirementtype in df_requirements_clean['RICEFW Type'].str.upper().drop_duplicates():
#   tempdf=df_requirements_clean[df_requirements_clean['RICEFW Type'].str.upper()==requirementtype].head(n_per_category)
#   if counter==1:
#     df_requirements_sel=tempdf
#   else:
#     df_requirements_sel=pd.concat([df_requirements_sel,tempdf],ignore_index=True)
#   counter=counter+1
# df_requirements_sel = df_requirements_sel.reset_index(drop=True)

# Get only requirements with detailed descriptions
# df_requirements_sel = df_requirements_clean[df_requirements_clean['Detailed Description'].str.strip()!=''].reset_index(drop=True)

In [0]:
# df_results

In [0]:
# Create the working tables that are handed to generate_output_batch.
# The get_df_* helpers are defined outside this notebook; judging by their names they
# return the control, results, validation and error DataFrames — TODO confirm schemas.
# Re-running this cell rebinds all four names to new return values.
df_control = get_df_control(df_requirements_sel)
df_results = get_df_results(df_requirements_sel)
df_validation = get_df_validation()
df_errors = get_df_errors()

## 4. Iterate through selected requirements and populate the templates for each one

### 4a. Main execution - Check here for runtime details & audit logs

In [0]:
# process_chunk("paragraph",
#                   index_chunk,
#                   chunk,
#                   output_dict,
#                   index_req,
#                   #row,
#                   prompt,
#                   systemprompt,
#                   citation_prompt,
#                   templatename,
#                   df_control,
#                   df_results,
#                   df_errors,
#                   df_validation,
#                   fsd_lookup,
#                   bpd_lookup,
#                   transcripts_lookup,
#                   images_lookup,
#                   knowledge_base_docs,
#                   temperature,
#                   verbose,
#                   validation,
#                   backup_strict,
#                   log_id_doc)
  
#   print(content_type)
#   print()
#   print(index_chunk)
#   print()
#   print(chunk)
#   print()
#   print(output_dict)
#   print()
#   print(index_req)
#   print()
#   print(prompt)
#   print()
#   print(systemprompt)
#   print()
#   print(citation_prompt)
#   print()
#   print(templatename)
#   print()
#   print(df_control)
#   print()
#   print(df_results)
#   print()
#   print(df_errors)
#   print()
#   print(df_validation)
#   print()
#   print(fsd_lookup)
#   print()
#   print(bpd_lookup)
#   print()
#   print(transcripts_lookup)
#   print()
#   print(images_lookup)
#   print()
#   print(knowledge_base_docs)
#   print()
#   print(temperature)
#   print()
#   print(verbose)
#   print()
#   print(validation)
#   print()
#   print(backup_strict)
#   print()
#   print(log_id_doc)
#   # raise Exception("skipped chunk processing in debug mode. Edit cmd 6 in SAP_FDD_CORE to turn off debug mode.") # DEBUG

#   # restartability idea - check if replacement texts for chunk already populated, if so, skip to next chunk
#   # this way, if the run gets interrupted, it can just be restarted
#   if content_type == "paragraphs":
#     if df_control.loc[index_req, "LastChunk_contenttype"] == "tables":
#       # if the paragraphs are done already (pointer is at tables), skip all paragraphs
#       log(f"[{log_id_doc}] {content_type} chunk {index_chunk+1}/{len(output_dict[content_type]['chunks'])} already populated. skipping to next chunk.")
#       return
#     elif df_control.loc[index_req, "LastChunk_contenttype"] == "paragraphs":
#       # if the paragraphs are started, but not done (pointer is at paragraphs)
#       if df_control.loc[index_req, "LastChunk_index"] >= index_chunk:
#         #  if the last written chunk number is higher than the current, skip current
#         log(f"[{log_id_doc}] {content_type} chunk {index_chunk+1}/{len(output_dict[content_type]['chunks'])} already populated. skipping to next chunk.")
#         return
#       # else: #  if the last written chunk number is lower than the current, go on and populate for current
#   elif content_type == "tables":
#     if df_control.loc[index_req, "LastChunk_contenttype"] == "tables":
#       # if the tables are started, but not done (pointer is at tables)
#       if df_control.loc[index_req, "LastChunk_index"] >= index_chunk:
#         #  if the last written chunk number is higher than the current, skip current
#         log(f"[{log_id_doc}] {content_type} chunk {index_chunk+1}/{len(output_dict[content_type]['chunks'])} already populated. skipping to next chunk.")
#         return
#       # else: #  if the last written chunk number is lower than the current, go on and populate for current
#     # else: if the pointer is still at paragraphs, go on and populate (happens only for first table after paragraphs are done)
#   # else: nothing populated yet (pointer is blank), go on and populate first paragraph & chunk

#   if df_control.loc[index_req, "LastChunk_contenttype"] == content_type and df_control.loc[index_req, "LastChunk_index"] >= index_chunk:
#     log(f"[{log_id_doc}] {content_type} chunk {index_chunk+1}/{len(output_dict[content_type]['chunks'])} already populated. skipping to next chunk.")
#     return  
  
#   log(f"[{log_id_doc}] Processing {content_type} chunk {index_chunk+1}/{len(output_dict[content_type]['chunks'])} ... ")
#   curr_prompt = prompt\
#     .replace("{requirement}", df_control.loc[index_req, "Requirement Cleansed"])\
#     .replace("{jsontemplate}", json.dumps(chunk))\
#     .replace("{RICEFWType}", df_control.loc[index_req, "RICEFW Type"])\
#     .replace("{RICEFWID}", df_control.loc[index_req, "RICEFW ID"])\
#     .replace("{RICEFWName}", df_control.loc[index_req, "RICEFW Name"])\
#     .replace("{requirement_detailed_description}",df_control.loc[index_req, "Detailed Description"])\
#     .replace("{citation_prompt}",citation_prompt)

#   if transcripts_lookup != None:
#     curr_prompt = curr_prompt.replace("{transcripts_lookup}", f"""\n\nInfo on requirement from workshop transcripts: \n```\n{transcripts_lookup}\n```\n""")
#   else: curr_prompt = curr_prompt.replace("{transcripts_lookup}", "")

#   if fsd_lookup != None:
#     curr_prompt = curr_prompt.replace("{fsd_lookup}", f"""\n\nInfo on requirement from FSDs for other projects (Use for reference only and do not copy as is, don't refer to specific people, entity or asset names): \n```\n{fsd_lookup}\n```\n""")
#   else: curr_prompt = curr_prompt.replace("{fsd_lookup}", "")

#   if bpd_lookup != None:
#     curr_prompt = curr_prompt.replace("{bpd_lookup}", f"""\n\nInfo on requirement from business process docs (Use for reference only and do not copy as is, don't refer to specific people, entity or asset names): \n```\n{bpd_lookup}\n```\n""")
#   else: curr_prompt = curr_prompt.replace("{bpd_lookup}", "")

#   if knowledge_base_docs != None:
#     kb_docs_lookup = knowledge_base_docs[templatename]["placeholders"][content_type]["chunks"][index_chunk]
#     curr_prompt = curr_prompt.replace("{knowledge_base_docs}", f"""\n\nExamples of 'what good looks like' from FSDs for other projects, not directly related to this requirement (Use for reference only and do not copy as is, don't refer to specific people, entity or asset names): \n```\n{kb_docs_lookup}\n```\n""")
#   else: curr_prompt = curr_prompt.replace("{knowledge_base_docs}", "")

#   if verbose: print(curr_prompt)

    

#   response = gpt4(prompt=curr_prompt, context=systemprompt, temperature=temperature, max_tokens=6000, large=True, tries=3).replace('\\\\','/').replace('}],','},')
#   #response = gpt35(prompt=curr_prompt, context=systemprompt, temperature=temperature, max_tokens=10000, tries=3)
  
#   if validation: 
#     # print("writing prompt to validation df") # DEBUG
#     val_row = {'RICEFW ID': df_control.loc[index_req, "RICEFW ID"],
#               'RICEFW Name': df_control.loc[index_req, "RICEFW Name"],
#               'Chunk Numb': index_chunk + 1,
#               'Prompt': curr_prompt,
#               'Response': response}
#     df_validation.loc[len(df_validation.index)] = val_row

#   fix_json_system_prompt='You are an expert at fixing JSONs'
#   fix_json_user_prompt='''Fix the following JSON so that it can be successfully loaded by json.loads, response should be only the fixed JSON:
#   {error_json}'''

#   try:
#     response_json = json.loads(response, strict=False)
#   except:
#     print("ERROR - Error in populating requirement. GPT response could not be parsed as JSON. Asking GPT to correct response...")
#     response=gpt4(prompt=fix_json_user_prompt.replace('{error_json}',response), context=fix_json_system_prompt, temperature=0, max_tokens=15000, large=True, tries=3)
#     try:
#       response_json = json.loads(response, strict=False)
#     except Exception as err:
#       # GPT returned ill-formatted JSON in its response -> Skip to next requirement
#       log(f"[{log_id_doc}] ERROR - Error in populating requirement. GPT response could not be parsed as JSON. This event has been noted in the error table.", color="red")
#       print("Traceback:")
#       print(err)
#       if verbose:
#         print("Full GPT Prompt response:")
#         print(response)
#       # Append incomplete response to error capturing dataframe
#       err_row = {'RICEFW ID': df_control.loc[index_req, "RICEFW ID"],
#                   'RICEFW Name': df_control.loc[index_req, "RICEFW Name"],
#                   'LastChunk_contenttype': content_type,
#                   'LastChunk_index': index_chunk,
#                   'response': response}
#       df_errors.loc[len(df_errors.index)] = err_row
#       raise Exception("Invalid JSON response") # will jump into upper-level try-block, thus skipping to next requirement

#   # check if all elements contain all required JSON attributes ('section_header', placeholder_text', 'replacement_text')
#   # this is basically a big try - except
#   for elem in response_json:
#     elem_keys = list(elem.keys())
#     # print(elem_keys) # DEBUG
#     if not ("section_header" in elem_keys and "placeholder_text" in elem_keys and "replacement_text" in elem_keys):
#       # GPT returned ill-formatted JSON in its response -> Skip to next requirement
#       log(f"[{log_id_doc}] ERROR - Error in populating requirement. GPT response is valid JSON, but does not contain all required fields. This event has been noted in the error table.", color="red")
#       if verbose:
#         print("Full GPT Prompt response:")
#         print(response)
#       # Append incomplete response to error capturing dataframe
#       err_row = {'RICEFW ID': df_control.loc[index_req, "RICEFW ID"],
#                   'RICEFW Name': df_control.loc[index_req, "RICEFW Name"],
#                   'LastChunk_contenttype': content_type,
#                   'LastChunk_index': index_chunk,
#                   'response': response}
#       df_errors.loc[len(df_errors.index)] = err_row
#       raise Exception("Valid, but incomplete JSON response") # will jump into upper-level try-block, thus skipping to next requirement   
  
#   # if no exceptions above (i.e. the response is valid and complete) - write response to result
#   output_dict[content_type]["chunks"][index_chunk] = response_json
  
#   # write back to control table
#   df_results.loc[index_req, "JSON"] = json.dumps(output_dict)
#   df_control.loc[index_req, "LastChunk_contenttype"] = content_type
#   df_control.loc[index_req, "LastChunk_index"] = index_chunk
#   if backup_strict: backup(df_control)

#   log(f"[{log_id_doc}] Processing {content_type} chunk {index_chunk+1}/{len(output_dict[content_type]['chunks'])} ... done.")

In [0]:
# Main run over the selected requirements: the working tables, the template mapping and
# the input folder are passed positionally; retrieval indexes and run options as keywords.
generate_output_batch(
    df_control,
    df_results,
    df_mapping,
    df_errors,
    df_validation,
    folder_input,
    index_fsd=index_fsd,                  # optional
    index_bpd=index_bpd,                  # optional
    index_transcripts=index_transcripts,  # optional
    verbose=False,     # True will output all prompts as well as all placeholder replacements in the doc
    validation=True,   # True will store each chunk's prompt and response in df_validation
    citation=False,    # True will cite instances where information is referenced
    template_max_tokens_per_chunk=400,
    individual_sections=False,
    halt_on_errors=False,
)

### 4b. Check control, result and error tables

In [0]:
# Display the control table as the cell output.
df_control

# Playground

In [0]:
# Playground copy of the batch run; the image index and knowledge base options are kept
# as disabled keyword arguments.
generate_output_batch(
    df_control,
    df_results,
    df_mapping,
    df_errors,
    df_validation,
    folder_input,
    index_fsd=index_fsd,                  # optional
    index_bpd=index_bpd,                  # optional
    index_transcripts=index_transcripts,  # optional
    # index_images=index_images, #optional
    # knowledge_base_docs=knowledge_base, #optional
    verbose=False,     # True will output all prompts as well as all placeholder replacements in the doc
    validation=True,   # True will store each chunk's prompt and response in df_validation
    citation=False,    # True will cite instances where information is referenced
    template_max_tokens_per_chunk=400,
    individual_sections=False,  # experimental - use one prompt for each placeholder (instead of chunking them together). use False for default flow!
    halt_on_errors=False,
)

### Validation section

In [0]:
# List files through the db_ws helper; the result is shown as the cell output.
# Presumably these are the workbench files available to CopyFileFromWorkbench — TODO confirm.
db_ws.ListFiles()

In [0]:
# File names of the documents used in the validation cells.

# Earlier generated documents (IM Program Position -> WBS, IM Program -> IM Program Position)
oldfile1 = "OGE_FSD_NH-E_PA_003_IM Program Position $ to WBS $_2023-08-03_22_32_30.docx"
oldfile2 = "OGE_FSD_NH-E_PA_002_IM Program $ to IM Program Position $_2023-08-03_22_19_40.docx"

# Later generated documents for the same two requirements
newfile1 = "OGE_FSD_NH-E_PA_003_IM Program Position $ to WBS $_2023-08-03_23_03_26.docx"
newfile2 = "OGE_FSD_NH-E_PA_002_IM Program $ to IM Program Position $_2023-08-03_22_47_44.docx"

# out* / val* pairs share a requirement and differ only in their timestamp
out1 = "OGE_FSD_IN-I_WAM_T&D_012_T&D Integration - ArcGIS Online Inspection Results to create SAP Measurement Results_2023-10-11_03_57_30.docx"

out2 = "OGE_FSD_IN-I_FIN_001_Ability to send Positive Pay file to Bank of OK for check details._2023-10-11_14_26_00.docx"
val2 = "OGE_FSD_IN-I_FIN_001_Ability to send Positive Pay file to Bank of OK for check details._2023-06-14_01_13_32.docx"

out3 = "OGE_FSD_IN-E_SCM_029_Expeditor Notes Field_2023-10-11_21_18_23.docx"
val3 = "OGE_FSD_IN-E_SCM_029_Expeditor Notes Field_2023-06-14_01_04_44.docx"
# Original author's note (reworded): the new names are the files generated today;
# when a new file is generated, change the name here to that new file.

In [0]:
# Fetch the two "old" and two "new" generated documents into /tmp/SAP for the
# validation cells further down.
db_ws.CopyFileFromWorkbench(DatabricksFolder='file:/tmp/SAP',filename=oldfile1)
db_ws.CopyFileFromWorkbench(DatabricksFolder='file:/tmp/SAP',filename=oldfile2)
db_ws.CopyFileFromWorkbench(DatabricksFolder='file:/tmp/SAP',filename=newfile1)
db_ws.CopyFileFromWorkbench(DatabricksFolder='file:/tmp/SAP',filename=newfile2)

In [0]:
# Disabled: copy of the PA_003 reference document.
# db_ws.CopyFileFromWorkbench(DatabricksFolder='file:/tmp/SAP',filename='DNU_OGE_FSD_E_PA_003_IM Program Position $ to WBS $.docx') 
# Fetch the val3 / out3 documents into /tmp/SAP.
db_ws.CopyFileFromWorkbench(DatabricksFolder='file:/tmp/SAP',filename=val3) 
db_ws.CopyFileFromWorkbench(DatabricksFolder='file:/tmp/SAP',filename=out3) 
# List the local folder; the listing is shown as the cell output.
dbutils.fs.ls('file:/tmp/SAP')

In [0]:
# validated_df = validate_docs('DNU_OGE_FSD_E_PA_003_IM Program Position $ to WBS $.docx', oldfile1, verbose = True)
# Compare the val3 document with the out3 document, de-duplicate the result, then plot
# and average its 'cos_sim' column.
# NOTE(review): the results are still stored under WBS_preindex_* names although the
# inputs are now the E_SCM_029 files (val3/out3) — confirm the naming is intended, since
# later cells read WBS_preindex_mean as the "pre index" score.
validated_df = validate_docs(val3, out3, verbose = True)
WBS_preindex_cleaned_df = cleanup_dupes(validated_df)
WBS_preindex_cleaned_df.plot.bar(y='cos_sim')
#why team one

WBS_preindex_mean = WBS_preindex_cleaned_df['cos_sim'].mean()
print(WBS_preindex_mean)

In [0]:
# k = 10 run
# Compare the PA_003 reference document with newfile1, de-duplicate the result, then plot
# and average its 'cos_sim' column.
wbs_reference_doc = 'DNU_OGE_FSD_E_PA_003_IM Program Position $ to WBS $.docx'

# The only other copy call for this reference document in the notebook is commented out,
# so fetch it here. validate_docs presumably reads its inputs from /tmp/SAP (every other
# validated file is copied there first) — without this the cell depends on a leftover
# file from an earlier session.
db_ws.CopyFileFromWorkbench(DatabricksFolder='file:/tmp/SAP', filename=wbs_reference_doc)

validated_df = validate_docs(wbs_reference_doc, newfile1, verbose = True)
WBS_k10_cleaned_df = cleanup_dupes(validated_df)
WBS_k10_cleaned_df.plot.bar(y='cos_sim')

WBS_k10_mean = WBS_k10_cleaned_df['cos_sim'].mean()
print(WBS_k10_mean)

In [0]:
# Display the de-duplicated comparison table stored as WBS_preindex_cleaned_df.
WBS_preindex_cleaned_df

In [0]:
# Side-by-side summary of the two mean cosine similarities computed above.
print(f'pre index average {WBS_preindex_mean!s} post index average {WBS_k10_mean!s}')

In [0]:
# Fetch the PA_002 reference document into /tmp/SAP; the two IMP validation cells below use it.
db_ws.CopyFileFromWorkbench(DatabricksFolder='file:/tmp/SAP',filename='DNU_OGE_FSD_E_PA_002_IM Program $ to IM Program Position $.docx')

In [0]:
# pre bpd
# Compare the PA_002 reference document with oldfile2, de-duplicate the result, then plot
# and average its 'cos_sim' column.

validated_df = validate_docs('DNU_OGE_FSD_E_PA_002_IM Program $ to IM Program Position $.docx', oldfile2, verbose = True)
IMP_pre_cleaned_df = cleanup_dupes(validated_df)
IMP_pre_cleaned_df.plot.bar(y='cos_sim')

IMP_pre_mean = IMP_pre_cleaned_df['cos_sim'].mean()
print(IMP_pre_mean)

In [0]:
#k = 10 run
# Same comparison as the "pre bpd" cell, but against newfile2.
validated_df = validate_docs('DNU_OGE_FSD_E_PA_002_IM Program $ to IM Program Position $.docx', newfile2, verbose = True)
IMP_k10_cleaned_df = cleanup_dupes(validated_df)
IMP_k10_cleaned_df.plot.bar(y='cos_sim')

IMP_k10_mean = IMP_k10_cleaned_df['cos_sim'].mean()
print(IMP_k10_mean)

In [0]:
# Side-by-side summary of the two IMP mean cosine similarities.
# A space now separates the 'pre bpd score' label from its value (previously they ran together).
print(f'pre bpd score {IMP_pre_mean!s} post bpd score {IMP_k10_mean!s}')

In [0]:
# Display the de-duplicated comparison table for the IMP k = 10 run.
IMP_k10_cleaned_df

In [0]:
%ls /tmp/SAP/

In [0]:
# Source-side text of the 'Security and controls' row (single .loc look-up instead of chained indexing).
IMP_k10_cleaned_df.loc['Security and controls', 'Sections_source']

In [0]:
# Generated-side text of the 'Security and controls' row (single .loc look-up instead of chained indexing).
IMP_k10_cleaned_df.loc['Security and controls', 'Sections_generated']

In [0]:
# Source-side text of the 'Testing Scenarios & Steps' row (single .loc look-up instead of chained indexing).
IMP_k10_cleaned_df.loc['Testing Scenarios & Steps', 'Sections_source']

In [0]:
# Generated-side text of the 'Testing Scenarios & Steps' row (single .loc look-up instead of chained indexing).
IMP_k10_cleaned_df.loc['Testing Scenarios & Steps', 'Sections_generated']

In [0]:
## what would the similarity of one of these sections be if we word soup this?

# Sentence-embedding model; later in this cell it encodes the source section and the
# shuffled generated section.
model = sentence_transformers.SentenceTransformer("sentence-transformers/bert-base-nli-mean-tokens")

# NOTE(review): this bare expression is not the last statement of the cell, so its value
# is never displayed.
WBS_k10_cleaned_df.loc['Requirement description']['Sections_generated']

import random

def rearrange_words(string, rng=None):
    """Return `string` with its whitespace-separated words in random order.

    Args:
        string: Text to scramble. It is split on any run of whitespace, so the
            original spacing and line breaks are not preserved.
        rng: Optional `random.Random` instance (or any object with a compatible
            `shuffle` method). Pass a seeded generator to get a reproducible
            result; when omitted, the module-level `random` generator is used.

    Returns:
        The same words joined by single spaces in shuffled order. Empty or
        whitespace-only input yields an empty string.
    """
    words = string.split()
    # Fall back to the shared module-level generator when no dedicated one is supplied.
    shuffler = random if rng is None else rng
    shuffler.shuffle(words)
    return " ".join(words)

# Shuffle the generated 'Requirement description' section into "word soup" and score it
# against the source section with the embedding model.
random.seed(42)  # fixed seed so the shuffled text and its score are the same on every run
string = WBS_k10_cleaned_df.loc['Requirement description']['Sections_generated']
new_string = rearrange_words(string)


source_embedding = model.encode(WBS_k10_cleaned_df.loc['Requirement description']['Sections_source'], convert_to_tensor=True)
gen_embedding = model.encode(new_string, convert_to_tensor=True)
# The similarity comes back as a nested structure; [0][0] picks the single pairwise score.
new_score = util.pytorch_cos_sim(source_embedding, gen_embedding).tolist()[0][0]

print(new_string)
# The score is what this cell sets out to find; previously it was computed but never shown.
print(new_score)

In [0]:
# Open the PA_002 reference document from the local folder and dump it paragraph by paragraph.
local_save_folder = "/tmp/SAP/"
# doc, placeholder, originalposition = read_document(local_save_folder+ 'DNU_OGE_FSD_E_PA_002_IM Program $ to IM Program Position $.docx')
doc = Document(local_save_folder + 'DNU_OGE_FSD_E_PA_002_IM Program $ to IM Program Position $.docx')

for paragraph in doc.paragraphs:
  # Non-empty paragraphs print their text; empty ones print the paragraph object instead
  print(paragraph.text if paragraph.text != '' else paragraph)