# Experimenting with open LLMs 


In [2]:
from ollama import chat

In [3]:
import asyncio
from ollama import AsyncClient

async def chat():
  message = {'role': 'user', 'content': 'Why is the sky blue?'}
  response = await AsyncClient().chat(model='gemma3', messages=[message])

In [4]:
# response=await chat()

In [5]:
print(response)

NameError: name 'response' is not defined

In [13]:
import os
from dotenv import load_dotenv

# Define the path to your .env file
# This could be an absolute or relative path
env_path = '/Users/himanshu/projects/journalling-assitant/.env' 
# or relative: env_path = '../config/.env'

# Load the .env file from the specified path
load_dotenv(dotenv_path=env_path)

# Now you can access the variables as usual
HF_TOKEN = os.getenv("HF_TOKEN")

# print(f"SECRET_KEY: {secret_key}")

In [12]:
import os
from openai import OpenAI


client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.environ["HF_TOKEN_2"],
)

completion = client.chat.completions.create(
    model="openai/gpt-oss-20b",
    messages=[
        {
            "role": "user",
            "content": "Write a dialogue between two characters meeting for the first time."
        }
    ],
)

print(completion.choices[0].message)

ChatCompletionMessage(content='**Maya:** Excuse me, is this seat taken?\n\n**Alex:** Nope, go ahead. But if I do sit down, you may already know my name—Alex. (He offers her a friendly grin.)\n\n**Maya:** Maya. (She closes her notebook, smiles nervously.) I was hoping someone more... I mean, no, I\'d rather not say what I was hoping you were. My flight was delayed, and I need a place to think.\n\n**Alex:** All right. (He pulls out a cup of coffee he ordered earlier.) Here, take a sip. It looks like you have a storm brewing in that pile of papers.\n\n**Maya:** (Laughs softly.) It’s a thesis proposal. I keep getting distracted by random thoughts. Do you... do you mind if I keep it—(she glances around—if I keep it here?\n\n**Alex:** Not at all. (He nods toward the book that’s open in front of him.) A piece of work? What are you up to?\n\n**Maya:** Just trying to convince the committee that my argument about *quantum linguistics* is, uh, legitimate. (She pushes the paper to show an outline.

# Creating a rag db

In [16]:
import sqlite3
import sqlite_vec

# Parsing using llamaparse

In [14]:
from llama_cloud_services import LlamaParse

parser = LlamaParse(
  # See how to get your API key at https://docs.cloud.llamaindex.ai/api_key
  api_key=os.environ["LLAMAPARSE_API"],

  # The parsing mode
  parse_mode="parse_page_with_llm",

  # Whether to use high resolution OCR (Slow)
  high_res_ocr=True,

  # Adaptive long table. LlamaParse will try to detect long table and adapt the output
  adaptive_long_table=True,

  # Whether to try to extract outlined tables
  outlined_table_extraction=True,

  # Whether to output tables as HTML in the markdown output
  output_tables_as_HTML=True,
)



# Example usage:

# sync
result = parser.parse("/Users/himanshu/projects/journalling-assitant/Books/Medium Discourses/Middle-Discourses-sujato-2025-08-25-1.pdf")

# sync batch
# results = parser.parse(["./my_file1.pdf", "./my_file2.pdf"])

# async
# result = await parser.aparse("./my_file.pdf")

# async batch
# results = await parser.aparse(["./my_file1.pdf", "./my_file2.pdf"])



Started parsing the file under job_id fc6b51ce-9df5-4e27-ba75-606609192dd2
.

In [26]:
# get the llama-index markdown documents
markdown_documents = result.get_markdown_documents(split_by_page=True)

# # get the llama-index text documents
# text_documents = result.get_text_documents(split_by_page=False)

# # get the image documents
# image_documents = result.get_image_documents(
#     include_screenshot_images=True,
#     include_object_images=False,
#     # Optional: download the images to a directory
#     # (default is to return the image bytes in ImageDocument objects)
#     image_download_dir="./images",
# )

# access the raw job result
# Items will vary based on the parser configuration
for page in markdown_documents[:10]:
    print(page)
    # print(page.md)
    # print(page.images)
    # print(page.layout)
    # print(page.structuredData)

Doc ID: dd486cb9-a118-4413-b792-52d1389d2a56
Text: # Middle Discourses  # Bhikkhu Sujato
Doc ID: 9a36baa9-e1ea-40b4-b96a-3e29c25aa5a9
Text: NO_CONTENT_HERE
Doc ID: 34e59411-5e5d-42f9-872c-998b75ab1f47
Text: M I D D L E D I S C O U R S E S  # A lucid translation of the
Majjhima Nikāya  translated and introduced by Bhikkhu Sujato  # Volume
1  # MN 1–50  # The First Fifty  Mūl a paṇṇā s a  0 SuttaCentral
Doc ID: 8586e42d-4ef2-4f92-9a03-604d6b67d389
Text: # Middle Discourses  # A translation of the Majjhimanikāya by
Bhikkhu Sujato  Creative Commons Zero (CC0)  To the extent possible
under law, Bhikkhu Sujato has waived all copyright and related or
neighboring rights to Middle Discourses.  This work is published from
Australia. This translation is an expression of an ancient spiritual
text that ha...
Doc ID: 0d29cd97-305e-4a27-a6df-09a6510bc6b5
Text: The sage at peace is not reborn, does not grow old, and does not
die. They are not shaken, and do not yearn. For they have nothing
which would

# Converting html to markdowns

In [30]:
import requests
from markdownify import markdownify as md

# The URL of the page you want to convert
# Example: a page from Access to Insight containing Buddhist texts
url = 'https://raw.githubusercontent.com/suttacentral/editions/main/en/sujato/dn/html/Long-Discourses-sujato-2025-08-25.html'

try:
    # Step 1: Fetch the HTML content from the URL
    response = requests.get(url)
    response.raise_for_status()  # Raises an exception for bad status codes (4xx or 5xx)
    html_content = response.text

    # Step 2: Convert the HTML to Markdown
    # The 'heading_style="ATX"' argument ensures headings are created with '#'
    markdown_text = md(html_content, heading_style="ATX")

    # Step 3: Save the result to a file
    with open('dhammapada_chapter1.md', 'w', encoding='utf-8') as f:
        f.write(markdown_text)

    print("Successfully converted HTML to Markdown!")
    # print("\n--- Preview ---\n")
    # print(markdown_text[:500]) # Print the first 500 characters as a preview

except requests.exceptions.RequestException as e:
    print(f"Error fetching the URL: {e}")

Successfully converted HTML to Markdown!


In [28]:
from langchain_community.vectorstores import PGVector
# from Pipeline.config import *
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import AzureBlobStorageFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from azure.storage.blob import BlobServiceClient
from tenacity import retry, stop_after_attempt, wait_exponential
import asyncio
import datetime
import logging
import base64
from semantic_router.encoders import OpenAIEncoder
from langchain.text_splitter import MarkdownHeaderTextSplitter
import datetime
from semantic_chunkers import StatisticalChunker
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.schema import Document
import re
from langfuse.decorators import observe, langfuse_context
from tenacity import retry, stop_after_attempt, wait_exponential


### Define chunking function 


@observe(capture_input=True, capture_output=True)
async def semantically_chunking(text):
    """Chunk text into smaller pieces using a rolling window splitter of semantic router """
    try:
        chunked_texts = []
        encoder = OpenAIEncoder(name="text-embedding-ada-002")
        chunker = StatisticalChunker(encoder=encoder,min_split_tokens=200,max_split_tokens=2000,window_size=2)
        chunks_async = await chunker.acall(docs=[text])
        for chunk in chunks_async[0]:
            chunk_text = ""
            for split in chunk.splits:
                chunk_text += split
            ## Remove null characters from the text
            if '\x00' in chunk_text:
                    chunk_text=text.replace('\x00', '')
            chunked_texts.append(chunk_text)
        logging.info(f'Number of chunks for given text is {len(chunked_texts)}')
        logging.info("*"*100)
        return chunked_texts
    except Exception as e:
        logging.error(f"Error in semantically_chunking: {e}")
        raise e


@observe(capture_input=True, capture_output=True)
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def markdown_chunking(text, max_chunk_size=1000, level=1, max_level=6):

    """
    Objective: Chunk text into smaller pieces using markdown splitter.



    Inputs:
    - text: str: Text to be chunked
    - max_chunk_size: int: Maximum size of each chunk
    - level: int: Current header level
    - max_level: int: Maximum header level to split on

    Workflow and Steps:

    1. Split the text into smaller chunks using markdown splitter.
    2. If the chunk is larger than the max_chunk_size and the current level is less than the max_level, recursively call the markdown_chunking function with the chunk as input.
    3. If the chunk is smaller than the max_chunk_size or the current level is equal to the max_level, keep the chunk as is.
    4. Return the list of chunks.

    Outputs:
    - docs: list: List of Document objects, each containing a chunk of text
    - level: int: Current header level


    """
    def add_header_level(level):
        return [(("#" * i), f"Header {i}") for i in range(1, level + 1)]
    
    
    def get_headers_string(chunk_metadata):
        header_string=""
        for header in chunk_metadata:
            header_level=header[-1]
            header_text=chunk_metadata[header]
            header_string+=f"{'#'*int(header_level)} {header_text}\n"
        return header_string
    

    def remove_existing_headers(content: str) -> str:
        # Remove header lines (lines starting with #)
        return re.sub(r'^#+\s.*$', '', content, flags=re.MULTILINE).strip()

    headers_to_split_on = add_header_level(level)
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on, strip_headers=False
        )
    
    md_header_splits = markdown_splitter.split_text(text)

    docs = []
    for doc in md_header_splits:
        if len(doc.page_content) > max_chunk_size and level < max_level:

            new_chunks, _ = await markdown_chunking(doc.page_content, max_chunk_size, level + 1, max_level)

            ## Prepand each chunk with header from metadata
            new_chunks = [Document(get_headers_string(chunk.metadata)+remove_existing_headers(chunk.page_content),metadata=chunk.metadata) for chunk in new_chunks]
            docs.extend(new_chunks)
        else:
            # If chunk is small enough or we've reached max level, keep it as is

            docs.append(Document(get_headers_string(doc.metadata)+remove_existing_headers(doc.page_content), metadata=doc.metadata))
    
    return docs, level

@observe(capture_input=False, capture_output=False)
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def chunk_text(text,method='semantic'):
    """
    Wrapper function to chunk text using either semantic router or markdown splitter.

    """
    if method=='semantic':
        logging.info("Chunking text using semantic router")
        return await semantically_chunking(text)
        
    elif method=='markdown':
        logging.info("Chunking text using markdown splitter")
        langchain_docs, level = await markdown_chunking(text)
        ## Get the text from the langchain docs 
        chunks = [doc.page_content for doc in langchain_docs]
        return chunks
        
    else:
        logging.error("Invalid method for chunking")
        raise ValueError("Invalid method for chunking")


@observe(capture_input=False, capture_output=False)
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def embed_in_vdb(slug,texts,metadatas): 

    """
    Objective: Embed the text data and save it in relevant langchain collection in vector database.

    Inputs:
    - slug: str: Slug of the company, used to name the collection in the vector database
    - texts: list: List of text data to be embedded
    - metadatas: list: List of metadata associated with each text data

    Outputs:
    - created_object_list: list: List of created objects in the table

    """
        
    ## Embed the data and save it in langchain collection ## 
    CONNECTION_STRING = f"postgresql+psycopg2://{user}:{password}@{host}/{dbname}"
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    COLLECTION_NAME=slug+"_vdb"
    ## Connecting to vector store ##
    vector_store = PGVector(
        collection_name=COLLECTION_NAME,
        connection_string=CONNECTION_STRING,
        embedding_function=embeddings,
    )
    created_object_list=await vector_store.aadd_texts(texts,metadatas)
    # print(f'Created {len(created_object_list)} new embeddings in the table')
    return created_object_list





ModuleNotFoundError: No module named 'langchain_community'

In [24]:
len(result.pages)

519

In [29]:
import sqllite

ModuleNotFoundError: No module named 'sqllite'

# Trying out PAGEINDEXRAG
