In [13]:
#     strategy="sections"
from langchain_community.document_loaders.llmsherpa import LLMSherpaFileLoader
# Parse the BART paper (arXiv 1910.13461) through an llmsherpa endpoint and
# split it by section (strategy="sections").
loader = LLMSherpaFileLoader(
    file_path="https://arxiv.org/pdf/1910.13461.pdf",
    new_indent_parser=True,
    apply_ocr=True,
    strategy="sections",
    # NOTE(review): assumes a parsing service is listening on localhost:5010 — confirm before running.
    llmsherpa_api_url="http://localhost:5010/api/parseDocument?renderFormat=all",
)
# The recorded output shows one Document per section, each carrying
# 'source', 'section_number' and 'section_title' in its metadata.
section_docs = loader.load()

In [14]:
# Echo the loaded section documents as this cell's output.
section_docs

[Document(metadata={'source': 'https://arxiv.org/pdf/1910.13461.pdf', 'section_number': 0, 'section_title': '2.1      Architecture'}, page_content='2.1      Architecture\nture  \n \n  BART      uses      the      standard      sequence-to-sequence      Trans-  \n \n  former      architecture      from      (Vaswani      et      al.,      2017),      ex-  \n \n  cept,      following      GPT,      that      we      modify      ReLU      activa-  \n \n  tion      functions      to      GeLUs      (Hendrycks   \n \n&      Gimpel,      2016)  \n \n  and      initialise      parameters      from      A/(0,0.02).\nFor      our  \n \n  base      model,      we      use   \n \n6      layers      in      the      encoder      and      de-  \n \n  coder,      and      for      our      large      model      we      use      12      layers      in  \n \n  each.\nThe      architecture      is      closely      related      to      that      used      in  \n \n  BERT,      with      the      follow

In [15]:
class Document:
    """Plain container pairing a metadata mapping with its page text.

    NOTE(review): nothing in this cell instantiates the class; the loop below
    only reads `.metadata` / `.page_content` from the objects returned by the
    loader. Confirm the definition is still needed.

    Attributes:
        metadata: Value passed as ``metadata``, stored unchanged.
        page_content: Value passed as ``page_content``, stored unchanged.
    """

    def __init__(self, metadata, page_content):
        # Store both constructor arguments as-is on the instance.
        self.metadata, self.page_content = metadata, page_content

# Alias the loader output for the first paper.
section_documents = section_docs

# Flatten every document into a plain dict with 'metadata' and 'page_content'
# keys so the result can be handled without the loader's Document type.
section_documents_dicts = []
for doc in section_documents:
    doc_dict = dict(metadata=doc.metadata, page_content=doc.page_content)
    section_documents_dicts.append(doc_dict)

# Dump the converted list to stdout.
print(section_documents_dicts)


[{'metadata': {'source': 'https://arxiv.org/pdf/1910.13461.pdf', 'section_number': 0, 'section_title': '2.1      Architecture'}, 'page_content': '2.1      Architecture\nture  \n \n  BART      uses      the      standard      sequence-to-sequence      Trans-  \n \n  former      architecture      from      (Vaswani      et      al.,      2017),      ex-  \n \n  cept,      following      GPT,      that      we      modify      ReLU      activa-  \n \n  tion      functions      to      GeLUs      (Hendrycks   \n \n&      Gimpel,      2016)  \n \n  and      initialise      parameters      from      A/(0,0.02).\nFor      our  \n \n  base      model,      we      use   \n \n6      layers      in      the      encoder      and      de-  \n \n  coder,      and      for      our      large      model      we      use      12      layers      in  \n \n  each.\nThe      architecture      is      closely      related      to      that      used      in  \n \n  BERT,      with      the      followin

In [16]:
# Number of section dictionaries built from the first paper.
len(section_documents_dicts)

20

In [17]:
# Echo the full list of section dictionaries as this cell's output.
section_documents_dicts


[{'metadata': {'source': 'https://arxiv.org/pdf/1910.13461.pdf',
   'section_number': 0,
   'section_title': '2.1      Architecture'},
  'page_content': '2.1      Architecture\nture  \n \n  BART      uses      the      standard      sequence-to-sequence      Trans-  \n \n  former      architecture      from      (Vaswani      et      al.,      2017),      ex-  \n \n  cept,      following      GPT,      that      we      modify      ReLU      activa-  \n \n  tion      functions      to      GeLUs      (Hendrycks   \n \n&      Gimpel,      2016)  \n \n  and      initialise      parameters      from      A/(0,0.02).\nFor      our  \n \n  base      model,      we      use   \n \n6      layers      in      the      encoder      and      de-  \n \n  coder,      and      for      our      large      model      we      use      12      layers      in  \n \n  each.\nThe      architecture      is      closely      related      to      that      used      in  \n \n  BERT,      with      the      

In [29]:
import json
from typing import List, Dict
import re

class IDGenerator:
    """Hands out consecutive integer IDs, starting at 1.

    Attributes:
        current_id: The most recently issued ID (0 before any ID is issued).
    """

    def __init__(self):
        self.current_id = 0

    def get_next_id(self):
        """Advance the counter by one and return the new value."""
        issued = self.current_id + 1
        self.current_id = issued
        return issued

# Module-level counter used by create_node(); re-running this cell creates a
# fresh generator, so node IDs start again from 1.
id_generator = IDGenerator()

def create_node(title, node_type, content=None):
    """Build one tree node as a plain dict.

    Args:
        title: Human-readable label stored under "title".
        node_type: Kind of node, stored under "type".
        content: Optional text payload; any falsy value is stored as "".

    Returns:
        A dict with keys "id", "title", "type", "children" (a new empty list)
        and "content", in that order. The ID is taken from the module-level
        ``id_generator``.
    """
    node_id = id_generator.get_next_id()
    node = {
        "id": node_id,
        "title": title,
        "type": node_type,
        "children": [],
        "content": content or "",
    }
    return node

def chunk_content(content: str, max_chunk_size: int = 1000) -> List[str]:
    """Split text into sentence-aligned chunks of at most ``max_chunk_size`` characters.

    The text is split into sentences at whitespace that follows ``.``, ``!`` or
    ``?``. Sentences are then packed greedily, joined by a single space, into
    chunks whose length does not exceed ``max_chunk_size``.

    Args:
        content: Text to split.
        max_chunk_size: Upper bound on the length of each chunk, counting the
            joining spaces.

    Returns:
        The chunks in order, each stripped of leading and trailing whitespace.
        Empty or whitespace-only ``content`` yields an empty list. A single
        sentence longer than ``max_chunk_size`` is never split, so it is
        returned as one oversized chunk.
    """
    chunks: List[str] = []
    current_chunk = ""

    for sentence in re.split(r'(?<=[.!?])\s+', content):
        # Only the first and last pieces can carry stray whitespace; empty
        # pieces (e.g. from trailing whitespace) must not create chunks.
        sentence = sentence.strip()
        if not sentence:
            continue

        if not current_chunk:
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) > max_chunk_size:
            # The "+ 1" accounts for the joining space, so a chunk never
            # grows past max_chunk_size by appending.
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            current_chunk += " " + sentence

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def convert_to_textbook_structure(books: List[List[Dict]], max_chunk_size: int = 1000):
    """Assemble a library -> book -> section -> chunk tree from section dicts.

    Args:
        books: One list per book; each entry is a dict with a 'metadata'
            mapping (providing 'section_number' and 'section_title') and a
            'page_content' string.
        max_chunk_size: Passed to ``chunk_content`` for every section.

    Returns:
        The root "library" node. Books are titled "Book 1", "Book 2", ... in
        input order; sections are ordered by 'section_number'; each section's
        children are its content chunks.
    """
    def by_section_number(entry):
        return entry['metadata']['section_number']

    root = create_node("Library", "library")

    for book_index, section_dicts in enumerate(books):
        book_node = create_node(f"Book {book_index + 1}", "book")
        root["children"].append(book_node)

        for entry in sorted(section_dicts, key=by_section_number):
            section_title = entry['metadata']['section_title']
            body_text = entry['page_content']

            # The section node is created before its chunks so that it gets
            # the lower ID.
            section_node = create_node(section_title, "section")

            for i, chunk in enumerate(chunk_content(body_text, max_chunk_size)):
                section_node["children"].append(
                    create_node(f"{section_title} - Chunk {i+1}", "chunk", chunk)
                )

            book_node["children"].append(section_node)

    return root

def save_structure_to_json(structure, file_path):
    """Write ``structure`` to ``file_path`` as UTF-8 JSON.

    The output uses two-space indentation and keeps non-ASCII characters
    unescaped. Any existing file at ``file_path`` is overwritten.
    """
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        json.dump(structure, out_file, indent=2, ensure_ascii=False)

def load_structure_from_json(file_path):
    """Read the UTF-8 JSON file at ``file_path`` and return the parsed object."""
    with open(file_path, mode='r', encoding='utf-8') as in_file:
        raw_text = in_file.read()
    return json.loads(raw_text)

def print_structure(node, indent=0):
    """Print a node and all of its descendants as an indented outline.

    Args:
        node: Dict with 'id', 'type', 'title', 'content' and 'children' keys.
        indent: Nesting depth of ``node``; each level adds two spaces.

    Each node prints one header line. If its content is non-empty, a second
    line one level deeper shows the first 50 characters followed by "...".
    """
    header_pad = "  " * indent
    print(header_pad + f"[ID: {node['id']}] {node['type']}: {node['title']}")

    if node['content']:
        content_pad = "  " * (indent + 1)
        print(content_pad + f"Content: {node['content'][:50]}...")

    child_depth = indent + 1
    for child in node['children']:
        print_structure(child, child_depth)

# Example usage




# Each entry is one book: a list of section dicts with 'metadata' and 'page_content'.
# NOTE(review): `section_documents_dicts_2` is assigned in a cell that appears
# further down this notebook, so this cell fails on a fresh top-to-bottom run
# unless that cell has been executed first.
books = [section_documents_dicts, section_documents_dicts_2]

# Build the library -> book -> section -> chunk tree, with chunks capped at 500 characters.
library_structure = convert_to_textbook_structure(books, max_chunk_size=500)

# Persist the tree as JSON in the current working directory.
json_file_path = "library_structure.json"
save_structure_to_json(library_structure, json_file_path)

print(f"Library structure saved to {json_file_path}")

# Read the file back and print the reloaded tree as an indented outline.
loaded_structure = load_structure_from_json(json_file_path)

print("\nLoaded structure:")
print_structure(loaded_structure)

Library structure saved to library_structure.json

Loaded structure:
[ID: 1] library: Library
  [ID: 2] book: Book 1
    [ID: 3] section: 2.1      Architecture
      [ID: 4] chunk: 2.1      Architecture - Chunk 1
        Content: 2.1      Architecture
ture  
 
  BART      uses   ...
      [ID: 5] chunk: 2.1      Architecture - Chunk 2
        Content: For      our  
 
  base      model,      we      u...
      [ID: 6] chunk: 2.1      Architecture - Chunk 3
        Content: The      architecture      is      closely      re...
      [ID: 7] chunk: 2.1      Architecture - Chunk 4
        Content: In      total,      BART      con-  
 
  tains    ...
    [ID: 8] section: 2.2      Pre-training      BART
      [ID: 9] chunk: 2.2      Pre-training      BART - Chunk 1
        Content: 2.2      Pre-training      BART
BART  
 
  BART   ...
      [ID: 10] chunk: 2.2      Pre-training      BART - Chunk 2
        Content: Unlike      existing      denoising      autoencod...
      [ID: 11] chunk: 

In [2]:
from langchain_community.document_loaders.llmsherpa import LLMSherpaFileLoader
# Parse a second paper (arXiv 2402.14207) with the same settings as the first:
# split by section through an llmsherpa endpoint.
# This rebinds `loader`, replacing the loader built for the first paper.
loader = LLMSherpaFileLoader(
    file_path="https://arxiv.org/pdf/2402.14207.pdf",
    new_indent_parser=True,
    apply_ocr=True,
    strategy="sections",
    # NOTE(review): assumes a parsing service is listening on localhost:5010 — confirm before running.
    llmsherpa_api_url="http://localhost:5010/api/parseDocument?renderFormat=all",
)
section_documents_2 = loader.load()

In [6]:
# Print only the metadata of each section to inspect the detected titles
# without dumping the page text.
for doc in section_documents_2:
    print(doc.metadata)

{'source': 'https://arxiv.org/pdf/2402.14207.pdf', 'section_number': 0, 'section_title': 'Abstract'}
{'source': 'https://arxiv.org/pdf/2402.14207.pdf', 'section_number': 1, 'section_title': 'Domain      Scope      Given      Given       P      Outline?      Refs?'}
{'source': 'https://arxiv.org/pdf/2402.14207.pdf', 'section_number': 2, 'section_title': '2.1      The      FreshWiki      Dataset'}
{'source': 'https://arxiv.org/pdf/2402.14207.pdf', 'section_number': 3, 'section_title': '2.2      Outline      Creation      and      Evaluation'}
{'source': 'https://arxiv.org/pdf/2402.14207.pdf', 'section_number': 4, 'section_title': 'Topic      t       @      Identify       Perspectives       @      Survey       —      Writer       =o       —       Related      Articles      |'}
{'source': 'https://arxiv.org/pdf/2402.14207.pdf', 'section_number': 5, 'section_title': 'References      R'}
{'source': 'https://arxiv.org/pdf/2402.14207.pdf', 'section_number': 6, 'section_title': '3.1      Perspe

In [24]:
class Document:
    """Plain container pairing a metadata mapping with its page text.

    NOTE(review): nothing in this cell instantiates the class; the loop below
    only reads `.metadata` / `.page_content` from the objects returned by the
    loader. Confirm the definition is still needed.

    Attributes:
        metadata: Value passed as ``metadata``, stored unchanged.
        page_content: Value passed as ``page_content``, stored unchanged.
    """

    def __init__(self, metadata, page_content):
        # Store both constructor arguments as-is on the instance.
        self.metadata, self.page_content = metadata, page_content

# Alias the loader output for the second paper.
# This rebinds `section_documents` to the second paper's documents.
section_documents = section_documents_2

# Flatten every document into a plain dict with 'metadata' and 'page_content'
# keys so the result can be handled without the loader's Document type.
section_documents_dicts_2 = []
for doc in section_documents:
    doc_dict = dict(metadata=doc.metadata, page_content=doc.page_content)
    section_documents_dicts_2.append(doc_dict)

# Dump the converted list to stdout.
print(section_documents_dicts_2)


[{'metadata': {'source': 'https://arxiv.org/pdf/2402.14207.pdf', 'section_number': 0, 'section_title': 'Abstract'}, 'page_content': 'Abstract\n \n \n  We      study      how      to      apply      large      language      models  \n \n  to      write      grounded      and      organized      long-form      ar-  \n \n  ticles      from      scratch,      with      comparable      breadth  \n \n  and      depth      to      Wikipedia      pages.\nThis      underex-  \n \n  plored      problem      poses      new      challenges      at      the  \n \n  pre-writing      stage,      including      how      to      research  \n \n  the      topic      and      prepare      an      outline      prior      to      writ-  \n \n  ing.\nWe      propose      STORM,   \n \na      writing      system  \n \n  for      the      Synthesis      of      Topic      Outlines      through  \n \n  Retrieval      and      Multi-perspective      Question      Ask-  \n \n  ing.\nSTORM      models      the   

In [None]:
# arXiv PDFs to process. Each URL appears exactly once: the original list
# contained 'https://arxiv.org/pdf/2407.14743' twice, which would make any
# per-link processing handle that paper twice. The first occurrence is kept
# and the original order is otherwise unchanged.
file_links = [
    'https://arxiv.org/pdf/2407.14562',
    'https://arxiv.org/pdf/2407.14743',
    'https://arxiv.org/pdf/2407.14662',
    'https://arxiv.org/pdf/2407.15259',
    'https://arxiv.org/pdf/2407.15527',
    'https://arxiv.org/pdf/2407.12873',
    'https://arxiv.org/pdf/2407.14525',
    'https://arxiv.org/pdf/2407.14565',
    'https://arxiv.org/pdf/2407.14568',
    'https://arxiv.org/pdf/2407.14622',
    'https://arxiv.org/pdf/2407.14631',
    'https://arxiv.org/pdf/2407.14651',
    'https://arxiv.org/pdf/2407.14658',
    'https://arxiv.org/pdf/2407.14681',
    'https://arxiv.org/pdf/2407.14717',
    'https://arxiv.org/pdf/2407.14725',
    'https://arxiv.org/pdf/2407.14735',
    'https://arxiv.org/pdf/2407.14738',
    'https://arxiv.org/pdf/2407.14741',
    'https://arxiv.org/pdf/2407.14765',
]

In [5]:
from llmsherpa.readers import LayoutPDFReader

# Read a PDF with llmsherpa's LayoutPDFReader directly, using the same
# endpoint URL as the loader cells.
# NOTE(review): the recorded output of this cell is `KeyError: 'return_dict'`;
# the cause cannot be determined from this notebook — check the service's
# response for this request.
llmsherpa_api_url = "http://localhost:5010/api/parseDocument?renderFormat=all"
# NOTE(review): despite its name, `pdf_url` holds an absolute, machine-specific
# filesystem path rather than a URL.
pdf_url = "/home/ubuntu/T_RAG/testing/1910.13461v1.pdf" 
pdf_reader = LayoutPDFReader(llmsherpa_api_url)
doc = pdf_reader.read_pdf(pdf_url)

KeyError: 'return_dict'

In [1]:
from sentence_transformers import SentenceTransformer

In [2]:
# Load the 'all-MiniLM-L6-v2' sentence-transformers model and embed one sample string.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# `.tolist()` converts the encoder result to a plain Python list.
# NOTE(review): the sample text looks like a typo for "how to dance"; it is
# left as-is because changing it changes the resulting vector.
embedding = embedding_model.encode("hwo to dance").tolist()

In [4]:
# Length of the embedding list (the recorded output is 384).
len(embedding)

384