In [2]:
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from typing import Optional, List
from langchain.chains import create_extraction_chain_pydantic
from langchain_core.pydantic_v1 import BaseModel

In [3]:
import requests
import json

class OllamaEndpoint:
    def __init__(self, message: str, url: str = "http://10.13.13.4:11434/api/generate", model: str = "llama3.2:latest"):
        self.message = str(message)
        self.url = url
        self.model = model
        self.base_prompt = """
        You are an expert at analyzing and extracting key sentences from a document.
        1. Each key sentence can contain 1 to 3 original sentences.
        2. Each key sentence should not exceed 200 characters.
        3. Each key sentence should contain meaningful information.
        4. Ignore key sentences that contain only numbers or special characters.
        5. Ignore key sentences that contain less than 7 words.
        6. For code, always try to keep them together as one key sentence.
        7. Dont return blank sentences, or sentence with only special characters.

        Here are the sentences from the document. Please truncate them into key sentences, each on a new line.
        Return only the key sentences, nothing else, no extra information or explanations.
        """

    def run(self):
        # Send the request to the Ollama API
        response = requests.post(
            self.url,
            json={"model": self.model, "prompt": self.base_prompt + self.message, "stream": False}
        )
        
        # Check if the response is successful
        if response.status_code != 200:
            raise Exception(f"Failed to connect: {response.status_code}")
        
        # Clean and format the JSON response
        return self._clean_json_response(response.json())

    def _clean_json_response(self, response_data):
        # Assuming the API response has a 'response' field with the raw JSON text
        response = response_data.get("response", "")
        return response
        
        # Step 1: Remove unwanted tags and newline escape characters
        # response = response.replace("<json>", "").replace("</json>", "").replace("\\n", "")
        
        # # Step 2: Parse and format the JSON
        # try:
        #     json_object = json.loads(response)
        #     formatted_json = json.dumps(json_object, indent=2)
        # except json.JSONDecodeError:
        #     raise ValueError("The response is not valid JSON.")

        # return formatted_json


In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader

document_path = './Sedna.pdf.md'
loader = TextLoader(document_path)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
chunked_texts = text_splitter.split_documents(documents)

print(chunked_texts)


[Document(metadata={'source': './Sedna.pdf.md'}, page_content="<!-- image -->\n\n## Integration Support Document\n\n## Document Management\n\n| DATE       |   VERSION | AUTHOR       | REASON          |\n|------------|-----------|--------------|-----------------|\n| 19/09/2023 |         1 | RebeccaPrice | InitialDocument |\n\n## Contents\n\nContents\n\nPurpose\n\nBusiness Use Cases\n\nIntroduction\n\nGlossary\n\nEvent Stream Document Management\n\nWorkflow\n\nTechnical Guidance\n\nEvent Stream End Point\n\nMessage Endpoint\n\nJob Reference Endpoint\n\nJob Reference Field Descriptions\n\nNext Step\n\n## Purpose\n\nThis document is designed to be a high level overview of leveraging Sedna's API's for Document management purposes. For full reference to SEDNA API: https://developers.sedna.com/reference\n\n## Business Use Cases\n\n- 1. Automate the saving of documents to an alternate system\n\nSave time by allowing the users to complete a simple action to save documents within an alternate sy

In [5]:
# foreach bunk in chunked_texts, call OllamaEndpoint, and then extract key sentences from the response
key_sentences = []
i = 0
for chunk in chunked_texts:
    print(f"Chunk {i} of {len(chunked_texts)}")
    ollama_response = OllamaEndpoint(chunk, model="gemma2:9b-instruct-q8_0").run()
    ollama_response = [sentence for sentence in ollama_response.split("\n") if len(sentence) > 7]
    key_sentences.extend(ollama_response)
    i += 1
    print(ollama_response)
    print(len(key_sentences))

print(key_sentences)

Chunk 0 of 20
["This document is designed to be a high level overview of leveraging Sedna's API's for Document management purposes.", 'Save time by allowing the users to complete a simple action to save documents within an alternate system. ']
2
Chunk 1 of 20
['Using a common reference you allow ease of filing structure that is consistent across the business.', 'Create a Sedna App to be able to insert documents directly into emails from an alternate system saving time and context switching.', "Sedna's APIs allow you to be able to create integrations with your business systems to provide efficiency, accuracy and context gains. ", 'Sedna recommends the following 3 options for leveraging these APIs in the Document Management space.  ', '- 1. Save Documents with Job & Category application using the event stream', '- 2. Save Documents via a Connected App', '- 3. Insert Documents in the composer with Connected App']
9
Chunk 2 of 20
['Job Reference Tag is a green tag in Sedna used to group re

In [6]:
import pandas as pd

topic = "Sedna API"

# Convert chunked_texts to a DataFrame
chunk_data = [{topic: i, "content": chunk} for i, chunk in enumerate(key_sentences)]
df_chunks = pd.DataFrame(chunk_data)

csv_path = f'./chunked/{topic}.csv'
df_chunks.to_csv(csv_path, index=False)

json_path = f'./chunked/{topic}.json'
df_chunks.to_json(json_path, orient="records", lines=True)

In [17]:
class OllamaEmbeddingEndpoint:
    def __init__(self, message: str, url: str = "http://10.13.13.4:11434/api/embed", model: str = "nomic-embed-text:137m-v1.5-fp16"):
        self.message = str(message)
        self.url = url
        self.model = model
    
    def run(self):
        # Send the request to the Ollama API
        response = requests.post(
            self.url,
            json={"model": self.model, "input": self.message}
        )
        
        # Check if the response is successful
        response_data = response.json()
        
        # Extract embeddings from the response data, assuming it's stored under a key called "embeddings"
        embeddings = response_data.get("embeddings")
        if embeddings is None:
            raise ValueError("No embeddings found in the response.")
        
        # Return the embeddings
        return embeddings[0]


In [20]:
class SupabaseVectorStore:
    def __init__(self, url: str, token: str, table_name: str):
        self.url = url
        self.token = token
        self.table_name = table_name
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.token}",
            "apikey": self.token  # Supabase requires both Authorization and apikey headers
        }

    def insert_embedding(self, text: str, embedding: list[float]):
        """
        Inserts an embedding into the Supabase Postgres database.
        
        :param text: The original text for which the embedding was created.
        :param embedding: A list of floats representing the embedding vector.
        :return: The response from the Supabase API call.
        """
        data = {
            "content": text,
            "metadata": {},
            "embedding": embedding
        }
        
        response = requests.post(
            f"{self.url}/rest/v1/{self.table_name}",
            headers=self.headers,
            json=data
        )

        # Check if the insertion was successful
        if response.status_code != 201:
            raise Exception(f"Failed to insert embedding: {response.status_code}, {response.text}")
        
        return True


In [21]:
# Define Supabase credentials
SUPABASE_URL = "http://10.13.13.4:8000"
SUPABASE_TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyAgCiAgICAicm9sZSI6ICJhbm9uIiwKICAgICJpc3MiOiAic3VwYWJhc2UtZGVtbyIsCiAgICAiaWF0IjogMTY0MTc2OTIwMCwKICAgICJleHAiOiAxNzk5NTM1NjAwCn0.dc_X5iR_VP_qT0zsiyj_I_OZ2T9FtRU2BBNWN8Bu4GE"
TABLE_NAME = "n8n_documents_norm"

supabase = SupabaseVectorStore(SUPABASE_URL, SUPABASE_TOKEN, TABLE_NAME)
embedings_result = []
i = 0
for sentence in key_sentences:
    print(f"Sentence {i} of {len(key_sentences)}")
    embeding = OllamaEmbeddingEndpoint(sentence).run()
    embedings_result.append(embeding)

    # Insert the embedding into the Supabase database
    supabase.insert_embedding(sentence, embeding)
    
    i += 1
    print(len(embedings_result))

Sentence 0 of 75
1
Sentence 1 of 75
2
Sentence 2 of 75
3
Sentence 3 of 75
4
Sentence 4 of 75
5
Sentence 5 of 75
6
Sentence 6 of 75
7
Sentence 7 of 75
8
Sentence 8 of 75
9
Sentence 9 of 75
10
Sentence 10 of 75
11
Sentence 11 of 75
12
Sentence 12 of 75
13
Sentence 13 of 75
14
Sentence 14 of 75
15
Sentence 15 of 75
16
Sentence 16 of 75
17
Sentence 17 of 75
18
Sentence 18 of 75
19
Sentence 19 of 75
20
Sentence 20 of 75
21
Sentence 21 of 75
22
Sentence 22 of 75
23
Sentence 23 of 75
24
Sentence 24 of 75
25
Sentence 25 of 75
26
Sentence 26 of 75
27
Sentence 27 of 75
28
Sentence 28 of 75
29
Sentence 29 of 75
30
Sentence 30 of 75
31
Sentence 31 of 75
32
Sentence 32 of 75
33
Sentence 33 of 75
34
Sentence 34 of 75
35
Sentence 35 of 75
36
Sentence 36 of 75
37
Sentence 37 of 75
38
Sentence 38 of 75
39
Sentence 39 of 75
40
Sentence 40 of 75
41
Sentence 41 of 75
42
Sentence 42 of 75
43
Sentence 43 of 75
44
Sentence 44 of 75
45
Sentence 45 of 75
46
Sentence 46 of 75
47
Sentence 47 of 75
48
Sentence 48