In [81]:
from docx import Document
import os
import re
import streamlit as st
import openai
import time

In [82]:
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.rl_config import defaultPageSize
from reportlab.lib import colors

In [83]:
# Set the openai api-key using streamlit secrets:
openai.api_key = st.secrets["OPENAI_API_KEY"]

In [84]:
def remove_images_from_docx(input_docx_file: str, cleaned_doc: Document) -> Document:
    """
    Removes the images of a word document and stores the paragraphs inside the cleaned_doc object
    """
    # Open the input Word document
    doc = Document(input_docx_file)

    # Collect original paragraphs:
    original_paragraphs = [para.text for para in doc.paragraphs]

    for text in original_paragraphs:
        cleaned_doc.add_paragraph(text)
    
    # Return the document without images
    return cleaned_doc

def clean_file_name(file_name: str) -> str:
    """
    Cleans the file name to eliminate numbers and other things that add no value such as "1.", "02.A", ...
    """
    # First remove the .docx:
    cleaned_file_name = file_name.replace(".docx", "")
    # Remove the numbers with parenthesis:
    cleaned_file_name = re.sub(r"\(\d+\)", "", cleaned_file_name)
    # Remove the special cases such as "02A", "05A":
    cleaned_file_name = re.sub(r"\d{2}[A-Za-z]\.", "", cleaned_file_name)
    # Remove the nomral cases of the numeration of the documents:
    cleaned_file_name = re.sub(r"\d+\.|\d+", "", cleaned_file_name)

    # Remove too paters such as "WI-CS-XX", "WI-CS", "WIP":
    cleaned_file_name = re.sub(r"[A-Z]{2}-[A-Z]{2}-[A-Z]{2}", "", cleaned_file_name)
    cleaned_file_name = re.sub(r"WIP", "", cleaned_file_name)
    cleaned_file_name = re.sub(r"[A-Z]{2}-[A-Z]{2}", "", cleaned_file_name)
    cleaned_file_name = re.sub(r"[A-Z]{2}-\d{2}", "", cleaned_file_name)
    cleaned_file_name = re.sub(r"WI", "", cleaned_file_name)

    # To end remove the residual "-" characters:
    cleaned_file_name = cleaned_file_name.replace("-", "")

    # Add a point and a space to the end:
    cleaned_file_name = cleaned_file_name + ". "

    return cleaned_file_name

def chat(inp, message_history, role="user"):
    
    message_history.append({"role": role, "content": inp})

    # Use the large models so it can summarize large documents:
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",
        messages=message_history
    )

    reply_content = completion.choices[0].message.content

    return reply_content

def extract_text(document: Document) -> str:
    """
    Extracts all the text from a Document() end returns it in a string
    """

    # Define the variable to store the text to extract:
    extracted_text = ""

    paragraphs_text = [para.text for para in document.paragraphs]
    for text in paragraphs_text:
        extracted_text = extracted_text + text

    return extracted_text

def delete_page_breaks(input_string: str) -> str:
    """
    Deletes the page breaks of a string
    """

    return input_string.replace("\n", "")

def create_pdf(summarized_text: str, pdf_name: str) -> None:
    """
    Takes the text and generates a pdf named using the passed pdf_name that contains the text
    """

    # Create a document template
    pdf_doc = SimpleDocTemplate(pdf_name, pagesize=letter)

    # Create a list to hold the flowables (elements to be added to the PDF)
    story = []

    # Define a style for the paragraph
    styles = getSampleStyleSheet()
    style = styles["Normal"]
    style.textColor = colors.black

    # Create a Paragraph object with your text and style
    paragraph = Paragraph(summarized_text, style)

    # Add the paragraph to the story
    story.append(paragraph)

    # Build the PDF with the story
    pdf_doc.build(story)

def return_message_history(mode: str) -> dict:
    """
    Returns the context for chatgpt depending of the mode: "summarizer" to summarize the documents or "how_to" to modify the titles
    of the documents to put them in the format "How to ..."
    """

    if mode == "summarizer":
        message_history = [
            {
                "role": "user",
                "content": "I will give you a text that contains information about how to carry out a process extracted from a document used by the employees of a company. I need you to compact all the points in one paragraph without losing information. Can you do it?"
            },
            {
                "role": "assistant",
                "content": "Of course, I can help you with that. Please provide the text containing the process information, and I will condense it into a single paragraph while retaining all the essential details."
            }
        ]

    if mode == "how_to":
        message_history = [
            {
                "role": "user",
                "content": "I will give you a text that contains information about how to carry out a process extracted from a document used by the employees of a company. I need you to compact all the points in one paragraph without losing information. Can you do it?"
            },
            {
                "role": "assistant",
                "content": "Of course, I can help you with that. Please provide the text containing the process information, and I will condense it into a single paragraph while retaining all the essential details."
            }
        ]

    return message_history

In [87]:
for original_file in os.listdir("original_docs"):
    # Create the new document where we will add all the cleaned info:
    cleaned_doc = Document()

    # We have to clean the titles of the documents to add them inside the document.
    cleaned_file_name = clean_file_name(original_file)

    # Add the title of the doc:
    cleaned_doc.add_paragraph(cleaned_file_name)

    # Remove all the images:
    cleaned_doc = remove_images_from_docx(
        input_docx_file = "original_docs/" + original_file,
        cleaned_doc = cleaned_doc
    )

    # Save the doc to check that everything goes according to plan:
    # cleaned_doc.save("data/" + "no images " + original_file)

    # Extract all the text:
    extracted_text = extract_text(cleaned_doc)

    # Remove page breaks:
    extracted_text = delete_page_breaks(extracted_text)

    print(f"Summarizing {original_file}...")
    # set message history with the context:
    summary_message_history = return_message_history(mode="summarizer")
    # Sumarize the text in one paragraph with chatgpt:
    summarized_text = chat(extracted_text, summary_message_history)

    # Create the pdf using the summarized text:
    create_pdf(summarized_text, pdf_name="data/" + cleaned_file_name + ".pdf")

Summarizing 02A. How To Send The Supplier A Purchase Order Created From A Purchase Request - no printer setup(1).docx...
Summarizing 17. How to Manage a CAPEX Purchase Requisition(1).docx...
Summarizing WI 02 On Hand Stock.docx...
Summarizing 02. How To Send The Supplier A Purchase Order Created From A Purchase Request(1).docx...
Summarizing WI 25 Changing a delivery address in a Confirmed Sales Order(1).docx...
Summarizing 23. How To Manage Non Conformance Set Up(1).docx...
Summarizing WI-CS-XX - Consignement Stock.docx...
Summarizing 37. How to Create a Workspace Tile for Open Service POs(1).docx...
Summarizing 11. How To Create A New Vendor(1).docx...
Summarizing 14. How To Create A Non Conformance(3).docx...
Summarizing 29. Request Changes in PO(1).docx...
Summarizing WI 13 Set up requirements for sales order.docx...
Summarizing 36. Assigning a Vendor Chain(1).docx...
Summarizing WI 11 Trade Agreement from a Sales Order.docx...
Summarizing 35. How To add Extra Charges to a Purchase

# Esto ya no

In [5]:
from openai import OpenAI
import os
import streamlit as st

os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]

client = OpenAI()

assistant = client.beta.assistants.create(
    name="Math Tutor",
    instructions="You are a personal math tutor. Write and run code to answer math questions.",
    tools=[{"type": "code_interpreter"}],
    model="gpt-4-1106-preview"
)

In [6]:
thread = client.beta.threads.create()

In [7]:
message = client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content="I need to solve the equation `3x + 11 = 14`. Can you help me?"
)

In [8]:
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id,
  instructions="Please address the user as Jane Doe. The user has a premium account."
)

In [9]:
run = client.beta.threads.runs.retrieve(
  thread_id=thread.id,
  run_id=run.id
)


In [10]:
messages = client.beta.threads.messages.list(
  thread_id=thread.id
)

SyncCursorPage[ThreadMessage](data=[ThreadMessage(id='msg_31Timyi0fHwNMEsOdzqsa7SS', assistant_id=None, content=[MessageContentText(text=Text(annotations=[], value='I need to solve the equation `3x + 11 = 14`. Can you help me?'), type='text')], created_at=1700576521, file_ids=[], metadata={}, object='thread.message', role='user', run_id=None, thread_id='thread_2TItk7Ipjlo9snyN5UGAomhv')], object='list', first_id='msg_31Timyi0fHwNMEsOdzqsa7SS', last_id='msg_31Timyi0fHwNMEsOdzqsa7SS', has_more=False)

Youtube tuorial:

In [1]:
from openai import OpenAI
import streamlit as st
import os

os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
client = OpenAI()

In [16]:
assistant = client.beta.assistants.create(
    name = "Math tutor 1",
    instructions = "You are a personal math teacher. Write and run code to answer math questions",
    tools = [{"type": "code_interpreter"}],
    model = "gpt-4"
)

In [17]:
print(assistant)

Assistant(id='asst_XYotmVQS0fwmZ32NC0XqKKdc', created_at=1700583039, description=None, file_ids=[], instructions='You are a personal math teacher. Write and run code to answer math questions', metadata={}, model='gpt-4', name='Math tutor 1', object='assistant', tools=[ToolCodeInterpreter(type='code_interpreter')])


In [18]:
thread = client.beta.threads.create()

In [19]:
print(thread)

Thread(id='thread_0Pmg8I8bJZ5o0PCZnzyAGQwx', created_at=1700583040, metadata={}, object='thread')


In [31]:
message = client.beta.threads.messages.create(
    thread_id = thread.id,
    role = "user",
    content = "Solve this problem: 3x² -11 = 14"
)

In [32]:
print(message)

ThreadMessage(id='msg_jGPFBckDNqC6sy9XByByBMsU', assistant_id=None, content=[MessageContentText(text=Text(annotations=[], value='Solve this problem: 3x² -11 = 14'), type='text')], created_at=1700583804, file_ids=[], metadata={}, object='thread.message', role='user', run_id=None, thread_id='thread_0Pmg8I8bJZ5o0PCZnzyAGQwx')


In [33]:
run = client.beta.threads.runs.create(
    thread_id = thread.id,
    assistant_id = assistant.id
)

In [36]:

run = client.beta.threads.runs.retrieve(
    thread_id = thread.id,
    run_id = run.id
)

In [37]:
print(run)

Run(id='run_w7OKpJgP4078VpkYZ8Ij4KAv', assistant_id='asst_XYotmVQS0fwmZ32NC0XqKKdc', cancelled_at=None, completed_at=1700583814, created_at=1700583808, expires_at=None, failed_at=None, file_ids=[], instructions='You are a personal math teacher. Write and run code to answer math questions', last_error=None, metadata={}, model='gpt-4', object='thread.run', required_action=None, started_at=1700583808, status='completed', thread_id='thread_0Pmg8I8bJZ5o0PCZnzyAGQwx', tools=[ToolAssistantToolsCode(type='code_interpreter')])


In [38]:
run.stat

'completed'

In [39]:
messages = client.beta.threads.messages.list(
    thread_id = thread.id
)

In [40]:
for message in reversed(messages.data):
    print(f"{message.role}: {message.content[0].text.value}")

user: Solve this problem: 3x + 11 = 14
user: Solve this problem: 3x² -11 = 14
assistant: The solutions to the equation 3x² -11 = 14 are x = -5√3/3 and x = 5√3/3


In [12]:
print(messages.data)

[ThreadMessage(id='msg_nQmTBgjFgP3JqCRf62omikoD', assistant_id=None, content=[MessageContentText(text=Text(annotations=[], value='Solve this problem: 3x + 11 = 14'), type='text')], created_at=1700582993, file_ids=[], metadata={}, object='thread.message', role='user', run_id=None, thread_id='thread_oBF5ZXYVYXjbvxeg9CcgUObr')]


In [13]:
print(messages)

SyncCursorPage[ThreadMessage](data=[ThreadMessage(id='msg_nQmTBgjFgP3JqCRf62omikoD', assistant_id=None, content=[MessageContentText(text=Text(annotations=[], value='Solve this problem: 3x + 11 = 14'), type='text')], created_at=1700582993, file_ids=[], metadata={}, object='thread.message', role='user', run_id=None, thread_id='thread_oBF5ZXYVYXjbvxeg9CcgUObr')], object='list', first_id='msg_nQmTBgjFgP3JqCRf62omikoD', last_id='msg_nQmTBgjFgP3JqCRf62omikoD', has_more=False)


In [14]:
print(run)

Run(id='run_TTBxO9DazWpP99PR3W6jxWRz', assistant_id='asst_B8uZjNOcPyDaiB2yOG8XoujC', cancelled_at=None, completed_at=None, created_at=1700582994, expires_at=1700583594, failed_at=None, file_ids=[], instructions='You are a personal math teacher. Write and run code to answer math questions', last_error=None, metadata={}, model='gpt-4', object='thread.run', required_action=None, started_at=1700582995, status='in_progress', thread_id='thread_oBF5ZXYVYXjbvxeg9CcgUObr', tools=[ToolAssistantToolsCode(type='code_interpreter')])


## Creation of the database

In [1]:
from memory_chat_utils.vector_database import VectorDatabase
from langchain.vectorstores import FAISS

In [2]:
VectorDatabase(data_path="/home/leibniz/Desktop/herogra-assistant/data/data_by_sections", action="store")

Creating vectordatabase...
Vectordatabase successfully created and stored


<memory_chat_utils.vector_database.VectorDatabase at 0x7ffa88a53d60>

In [3]:
vector_database = VectorDatabase(data_path="/home/leibniz/Desktop/herogra-assistant/data/data_by_sections", action="load")

Loading database...
Database loaded


In [4]:
vector_database.run_query("Que productos pueden causar irritación en los ojos?")

RAW DATABASE OUTPUT:
[(Document(page_content='irritación ocular grave. La clasificación del producto incluye categorías como lesión\nocular grave, irritación ocular, sólido comburente y corrosivo cutáneo. Los cambios en el producto', metadata={'source': '/home/leibniz/Desktop/herogra-assistant/data/data_by_sections/AAR315 - 4+2+6 EXTRA NA-V FERTIGOTA  Seccion_16.pdf', 'page': 0}), 0.28440893), (Document(page_content='la\nirritación ocular. Se deben quitar las prendas contaminadas y eliminar el contenido y/o recipiente de manera\nadecuada. El producto contiene sulfato de hierro (II). No se clasifican como PBT, mPmB,', metadata={'source': '/home/leibniz/Desktop/herogra-assistant/data/data_by_sections/BOE607 - 6+10+12 HEROSOL MULTICOLOR  Seccion_2.pdf', 'page': 0}), 0.29572254), (Document(page_content='de peligro definidas en el\nReglamento (CE) nº 1272/2008. Las salpicaduras en los ojos pueden causar irritación y daños reversibles.\nTambién se detalla información toxicológica de las sust

'Información producto AAR3154+2+6 EXTRA NA-V FERTIGOTA: irritación ocular grave. La clasificación del producto incluye categorías como lesión ocular grave, irritación ocular, sólido comburente y corrosivo cutáneo. Los cambios en el producto. de peligro definidas en el Reglamento (CE) nº 1272/2008. Las salpicaduras en los ojos pueden causar irritación y daños reversibles. También se detalla información toxicológica de las sustancias. que el producto provoca quemaduras graves en la piel y lesiones oculares graves. Se dan consejos de prudencia que incluyen no respirar el polvo, humo, gas, niebla, vapores o aerosol, lavarse las manos \nInformación producto BOE6076+10+12 HEROSOL MULTICOLOR: la irritación ocular. Se deben quitar las prendas contaminadas y eliminar el contenido y/o recipiente de manera adecuada. El producto contiene sulfato de hierro (II). No se clasifican como PBT, mPmB,. como nociva en caso de ingestión, provoca irritación ocular grave y cutánea. El etiquetado se realiza de

In [13]:
query = "Que productos pueden causar lesiones oculares?"
vector_store_local.similarity_search_with_score(query, 10)

[(Document(page_content='irritación ocular grave. La clasificación del producto incluye categorías como lesión\nocular grave, irritación ocular, sólido comburente y corrosivo cutáneo. Los cambios en el producto', metadata={'source': '/home/leibniz/Desktop/herogra-assistant/data/data_by_sections/AAR315 - 4+2+6 EXTRA NA-V FERTIGOTA  Seccion_16.pdf', 'page': 0}),
  0.2773382),
 (Document(page_content='que el producto provoca quemaduras graves en la piel y\nlesiones oculares graves. Se dan consejos de prudencia que incluyen no respirar el polvo, humo, gas, niebla,\nvapores o aerosol, lavarse las manos', metadata={'source': '/home/leibniz/Desktop/herogra-assistant/data/data_by_sections/AAR315 - 4+2+6 EXTRA NA-V FERTIGOTA  Seccion_2.pdf', 'page': 0}),
  0.29691175),
 (Document(page_content='de peligro definidas en el\nReglamento (CE) nº 1272/2008. Las salpicaduras en los ojos pueden causar irritación y daños reversibles.\nTambién se detalla información toxicológica de las sustancias', metada

In [12]:
import streamlit as st
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
from semantic_kernel.connectors.ai.chat_request_settings import ChatRequestSettings

In [13]:
model_name = "gpt-4-32k"
deployment_name = f"{model_name}-herogra"

llm = AzureChatCompletion(
    deployment_name = deployment_name, 
    endpoint = st.secrets["ENDPOINT"], 
    api_key = st.secrets["AZURE_API_KEY"]
)

'azure'

TypeError: __init__() missing 1 required positional argument: 'deployment_name'

In [9]:
chat._model_id

AttributeError: 'ChatRequestSettings' object has no attribute '_model_id'