### Converting PDF pages into Markdown using LlamaParse

In [1]:
import os
from rich import print
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
from llama_parse import LlamaParse
import nest_asyncio

# Patch asyncio so event loops can be nested (presumably required because the
# notebook kernel already runs a loop while LlamaParse does async work).
nest_asyncio.apply()

# Build the parser first, then convert the PDF; the result is a list of
# documents whose `.text` holds markdown.
pdf_parser = LlamaParse(
    api_key=os.environ['LLAMA_PARSE_API_KEY'],
    result_type='markdown',
)
markdown_documents = pdf_parser.load_data('data/The-Army-Regulations.pdf')

Started parsing the file under job_id d8c423cd-e172-4e0e-b866-765dcdd9c0eb


In [4]:
# Display the documents returned by LlamaParse to inspect their structure.
markdown_documents

[Document(id_='7686e7e7-50bf-4ab7-9d35-3d122d694a11', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='# The Army Regulations\n\n# Volume I- (Rules)\n\n# 1975', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='0e9521c4-eaf7-4abc-9012-657be1802af6', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='# AMENDMENTS\n\n1. AR (R) Volume I Rules 253 is amended as under:-\n\nIn para (e) Sub-para (1) Add the fol at the end \'\'after non-regular personnel\'\'\n\n\'\'Whenever their services are no longer required, for causes other than those calling for dismissal, removal or premature retirement. Grant of retiring benefits is regulated under normal rules\'\'.\n\nAuthority:- AHQ Letter No. 0024G/10/CAO/A-2 dt. 19 July 76\n2. Appendi

In [5]:
# Spot check: show the markdown text of the document at index 4.
print(markdown_documents[4].text)

In [6]:
from tqdm.auto import tqdm

# 1-based page numbers that are left out of the export.
pages_to_exclude = [1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 227, 229, 230]

# The output folder must exist before any page file can be opened for writing.
os.makedirs('markdown', exist_ok=True)

# saving each page in a separate markdown
# `total` is passed explicitly because tqdm cannot infer a length from enumerate().
for page_number, page in tqdm(enumerate(markdown_documents, 1),
                              total=len(markdown_documents)):
    if page_number not in pages_to_exclude:
        # Explicit UTF-8 keeps the written text independent of the platform's
        # default encoding; the `with` block closes the file on exit.
        with open(f'markdown/{page_number}.md', 'w', encoding='utf-8') as markdown_file:
            markdown_file.write(page.text)

print("\nSuccessfully converted all pages into markdowns")

  from .autonotebook import tqdm as notebook_tqdm
280it [00:00, 12807.59it/s]


### Converting LlamaIndex documents to LangChain documents

In [9]:
from langchain_core.documents import Document

# Wrap every non-excluded page in a LangChain Document, recording its
# 1-based page number in the metadata.
langchain_docs = [
    Document(page_content=page.text, metadata={"page_number": page_number})
    for page_number, page in enumerate(markdown_documents, 1)
    if page_number not in pages_to_exclude
]

In [10]:
# Number of LangChain documents kept after skipping the excluded pages.
len(langchain_docs)

267

In [12]:
# Spot check one converted document: its metadata first, then its content.
sample_doc = langchain_docs[59]
print(sample_doc.metadata)
print(sample_doc.page_content)

### Creating embeddings from LangChain documents

In [13]:
from langchain_chroma import Chroma
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

import torch

# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so this cell does not fail on machines without a GPU.
embedding_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# loading the embedding model
# - cache_folder: local folder passed to the loader for the model files.
# - trust_remote_code=True permits code shipped with the model repository to
#   run locally; keep it enabled only for repositories you trust.
# - batch_size=1 encodes one text per batch.
embedding_model = HuggingFaceEmbeddings(model_name="Alibaba-NLP/gte-base-en-v1.5",
                                        cache_folder="embedding_model",
                                        model_kwargs={"device": embedding_device,
                                                      "trust_remote_code": True},
                                        encode_kwargs={"batch_size": 1})

In [14]:
# The index lives in ./db/markdown_chunk_db under the current working
# directory (a notebook has no __file__ to anchor the path on).
persistent_directory = os.path.join(os.getcwd(), "db", "markdown_chunk_db")

# load or create vector database
if not os.path.exists(persistent_directory):
    # Nothing persisted yet: embed the documents and write the index to disk.
    print("No database found, creating new database.")
    db = Chroma.from_documents(
        documents=langchain_docs,
        embedding=embedding_model,
        persist_directory=persistent_directory,
    )
else:
    # Reopen the persisted index instead of re-embedding the documents.
    print("Vector db already exists, loading the database.")
    db = Chroma(
        embedding_function=embedding_model,
        persist_directory=persistent_directory,
    )