In [17]:
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from typing import Optional, List
from langchain.chains import create_extraction_chain_pydantic
from langchain_core.pydantic_v1 import BaseModel
from langchain import hub
from dotenv import load_dotenv

load_dotenv()

True

In [47]:
# Define Pydantic model for structured output
class Sentences(BaseModel):
    """Contains cleaned, key sentences from a document."""
    sentences: List[str]

# Define the prompt template for document analysis
system_message = """You are an expert at analyzing and extracting key sentences from a document.
1. Each key sentences can contain 01 to 03 original sentences.
2. Each key sentences should not exceed 200 characters.
3. Each key sentences should contain meaningful information.
4. Ignore key sentences that contain only numbers or special characters.
5. Ignore key sentences that contain less than 7 words.
6. For codes, always try to keep them together as one key sentence.

Here are the sentences from the document. Please truncate them into key sentences, each on a new line.
"""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_message),
        ("human", "{question}"),
    ]
)

# Initialize the language model with structured output
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# Use ollama instead of llm for OpenAI language model
# Initialize the Ollama language model with structured output
# from langchain_ollama.llms import OllamaLLM
# llm = OllamaLLM(model="llama3.2:latest", temperature=0)

structured_llm = llm.with_structured_output(Sentences)

# Prepare a pipeline to process paragraphs through the LLM prompt
query_analyzer = {"question": RunnablePassthrough()} | prompt | structured_llm

In [48]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader

document_path = './Sedna.pdf.md'
loader = TextLoader(document_path)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
chunked_texts = text_splitter.split_documents(documents)

print(chunked_texts)


[Document(metadata={'source': './Sedna.pdf.md'}, page_content="<!-- image -->\n\n## Integration Support Document\n\n## Document Management\n\n| DATE       |   VERSION | AUTHOR       | REASON          |\n|------------|-----------|--------------|-----------------|\n| 19/09/2023 |         1 | RebeccaPrice | InitialDocument |\n\n## Contents\n\nContents\n\nPurpose\n\nBusiness Use Cases\n\nIntroduction\n\nGlossary\n\nEvent Stream Document Management\n\nWorkflow\n\nTechnical Guidance\n\nEvent Stream End Point\n\nMessage Endpoint\n\nJob Reference Endpoint\n\nJob Reference Field Descriptions\n\nNext Step\n\n## Purpose\n\nThis document is designed to be a high level overview of leveraging Sedna's API's for Document management purposes. For full reference to SEDNA API: https://developers.sedna.com/reference\n\n## Business Use Cases\n\n- 1. Automate the saving of documents to an alternate system\n\nSave time by allowing the users to complete a simple action to save documents within an alternate sy

In [49]:
# Extract key sentences from each paragraph
text_propositions = []

# for i, para in enumerate(chunked_texts[:15]):  # Limit to the first 5 paragraphs as an example
for i, para in enumerate(chunked_texts):  # Limit to the first 5 paragraphs as an example
    query = {"question": para}
    result = query_analyzer.invoke(query)
    sentences = result.sentences
    # split the sentences into a list of strings, with new lines
    # sentences = sentences.split("\n")
    print(sentences) 
    # Add the sentences to the text_propositions
    text_propositions.extend(sentences)
    print(f"Processed paragraph {i + 1}")

# Final output
print(f"\nExtracted {len(text_propositions)} key sentences:")
print(text_propositions)  # Display the first 10 key sentences

["This document is designed to be a high level overview of leveraging Sedna's API's for Document management purposes.", 'For full reference to SEDNA API: https://developers.sedna.com/reference', 'Save time by allowing the users to complete a simple action to save documents within an alternate system.']
Processed paragraph 1
['Using a common reference you allow ease of filing structure that is consistent across the business.', 'Create a Sedna App to be able to insert documents directly into emails from an alternate system saving time and context switching.', "Sedna's APIs allow you to be able to create integrations with your business systems to provide efficiency, accuracy and context gains.", 'Sedna recommends the following 3 options for leveraging these APIs in the Document Management space.']
Processed paragraph 2
['A green tag within Sedna is used to group messages that relate to a business transaction, such as Vessel - Voyage or Linkage ID.']
Processed paragraph 3
['A blue tag with

In [50]:
import pandas as pd

# Convert chunked_texts to a DataFrame
chunk_data = [{"Master Data": i, "content": chunk} for i, chunk in enumerate(text_propositions)]
df_chunks = pd.DataFrame(chunk_data)

csv_path = './chunked/Master.docx.csv'
df_chunks.to_csv(csv_path, index=False)

json_path = './chunked/Master.docx.json'
df_chunks.to_json(json_path, orient="records", lines=True)