In [12]:
import re
from langchain.document_loaders import PyPDFLoader
from transformers import GPT2TokenizerFast
import json

# Initialize tokenizer
# The GPT-2 tokenizer is used below to count tokens (tokenizer.encode) and to
# rebuild text from token windows (tokenizer.decode) when chunking sections.
# NOTE(review): the "Token indices sequence length is longer than the specified
# maximum sequence length for this model (1209 > 1024)" message in this cell's
# output is emitted when a section longer than the tokenizer's limit is encoded.
# The visible cells never feed these token ids to a model, so it is presumably
# harmless here — confirm before reusing the tokenizer for inference.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Maps every PDF path to the list of part numbers to extract from it.
# Each file is named after the single part it is queried for, so the mapping
# is derived from the part numbers instead of being spelled out entry by entry.
pdf_files_with_parts = {
    f'data/{part}.pdf': [part]
    for part in (
        '11',
        '200', '201', '202', '203', '205', '206', '207', '208', '209',
        '210', '211', '212', '216', '225', '226', '250', '251', '290', '299',
        '312', '314',
        '600', '601', '606', '607', '610', '630', '640', '660', '680',
        '820',
    )
}

# Regex patterns to detect part titles, subparts, and sections
part_pattern = re.compile(r"PART\s+(\d+)\s*—\s*(.*)", re.IGNORECASE)
subpart_pattern = re.compile(r"Subpart\s+([A-Z])\s*—\s*(.*)", re.IGNORECASE)
section_pattern = re.compile(r"^\s*§\s*\d+\.\d+.*", re.IGNORECASE | re.MULTILINE)


def _collect_part_title(lines, heading_idx, first_fragment):
    """Join a PART heading's title with the wrapped lines that follow it.

    Continuation stops at the first line that starts a subpart ("Subpart"),
    a section ("§") or another PART heading. Blank lines are skipped so the
    title never contains doubled or trailing spaces.
    """
    fragments = [first_fragment.strip()]
    for raw_line in lines[heading_idx + 1:]:
        candidate = raw_line.strip()
        if (
            candidate.startswith("Subpart")
            or candidate.startswith("§")
            or part_pattern.match(candidate)
        ):
            break
        fragments.append(candidate)
    return " ".join(fragment for fragment in fragments if fragment)


# Function to extract the correct part and its title based on the part number
def extract_correct_part_and_title(text, part_number):
    """Extract the title and full text of one PART from a document.

    Args:
        text: Document text; it is split on newlines and scanned line by line.
        part_number: Part number to look for. It is compared as a string, so
            both "200" and 200 are accepted.

    Returns:
        A ``(part_title, part_text)`` tuple. ``part_text`` runs from the first
        heading of the requested part up to (not including) the heading of the
        next different part, or to the end of the text. Returns
        ``(None, None)`` when the part is not found or its title is empty.
    """
    wanted_part = str(part_number)
    lines = text.split("\n")
    part_text = []
    part_title = None
    in_part = False

    # enumerate gives the heading's real position; lines.index(line) would
    # rescan the list and resolve to the first identical line instead.
    for idx, line in enumerate(lines):
        match = part_pattern.match(line.strip())
        if match:
            if match.group(1) == wanted_part:
                if not in_part:
                    # Only the first heading defines the title. A repeated
                    # heading for the same part must not overwrite it with
                    # whatever text happens to follow that repetition.
                    part_title = _collect_part_title(lines, idx, match.group(2))
                    in_part = True
            elif in_part:
                # Heading of a different part: the requested part has ended.
                break

        if in_part:
            part_text.append(line)

    if part_title and part_text:
        return part_title, "\n".join(part_text)
    return None, None

# Function to tokenize and chunk by token limit with overlap
def chunk_text_by_token_limit(text, chunk_size=2000, overlap=100):
    """Split text into overlapping chunks of at most ``chunk_size`` tokens.

    The text is encoded with the module-level ``tokenizer``, cut into windows
    that each start ``chunk_size - overlap`` tokens after the previous one,
    and every window is decoded back to a string.

    Args:
        text: Text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        List of decoded chunk strings; empty when the text encodes to no tokens.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of tokens")
    if not 0 <= overlap < chunk_size:
        # A stride of zero or less would either fail inside range() or
        # silently produce no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    tokens = tokenizer.encode(text)
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), stride):
        window = tokens[start:start + chunk_size]
        chunks.append(tokenizer.decode(window))
        if start + chunk_size >= len(tokens):
            # This window already reaches the last token; a further window
            # would hold nothing but tokens repeated from this one.
            break
    return chunks

def _build_section_entry(section_lines, part_title, subpart_title, chunk_size, overlap):
    """Assemble the stored record for one section, splitting its text if it is too long."""
    section_text = "\n".join(section_lines)
    if len(tokenizer.encode(section_text)) > chunk_size:
        chunks = chunk_text_by_token_limit(section_text, chunk_size, overlap)
    else:
        chunks = [section_text]
    return {
        "chunks": chunks,
        "part_title": part_title,
        "subpart_title": subpart_title,
    }


# Function to extract part, subpart, and section titles and chunk based on section headers
def chunk_sections_with_titles(full_text, chunk_size=2000, overlap=100):
    """Split text into sections keyed by their "§" header line.

    Lines matching ``part_pattern`` or ``subpart_pattern`` update the running
    part/subpart title and are not added to any section. A line matching
    ``section_pattern`` starts a new section; every other line is appended to
    the section currently open (lines before the first section are dropped).

    Args:
        full_text: Text to split, scanned line by line.
        chunk_size: Token limit above which a section is split into chunks.
        overlap: Token overlap passed to ``chunk_text_by_token_limit``.

    Returns:
        Dict mapping each stripped section header line to a dict with keys
        ``"chunks"`` (list of strings), ``"part_title"`` and
        ``"subpart_title"``. A header line that occurs more than once keeps
        only its last occurrence.
    """
    sections = {}
    current_section = None
    current_section_lines = []
    current_part_title = None
    current_subpart_title = None
    # Titles in force when the open section started. They are stored
    # separately because a section is only written out when the next one
    # begins, by which time a new Subpart/PART heading may already have
    # replaced the running titles.
    section_part_title = None
    section_subpart_title = None

    for line in full_text.split("\n"):
        stripped = line.strip()

        part_match = part_pattern.match(stripped)
        if part_match:
            current_part_title = extract_correct_part_and_title(full_text, part_match.group(1))[0]
            continue

        subpart_match = subpart_pattern.match(stripped)
        if subpart_match:
            current_subpart_title = subpart_match.group(2).strip()
            continue

        if section_pattern.match(stripped):
            if current_section:
                sections[current_section] = _build_section_entry(
                    current_section_lines,
                    section_part_title,
                    section_subpart_title,
                    chunk_size,
                    overlap,
                )
            current_section = stripped
            current_section_lines = [current_section]
            section_part_title = current_part_title
            section_subpart_title = current_subpart_title
        elif current_section:
            current_section_lines.append(line)

    # Write out the section that was still open when the text ended.
    if current_section:
        sections[current_section] = _build_section_entry(
            current_section_lines,
            section_part_title,
            section_subpart_title,
            chunk_size,
            overlap,
        )

    return sections

# Load all PDFs and extract the relevant parts, subparts, and sections with token limits
all_extracted_parts = {}

for file_path, part_numbers in pdf_files_with_parts.items():
    loader = PyPDFLoader(file_path)
    pdf_documents = loader.load()
    # Concatenate the text of every loaded document into one string.
    full_text = "\n".join(doc.page_content for doc in pdf_documents)

    for part_number in part_numbers:
        part_title, correct_part_text = extract_correct_part_and_title(full_text, part_number)
        if not part_title or not correct_part_text:
            # Report the miss instead of dropping the file silently, so a
            # heading that does not match the expected format is noticed.
            print(f"Skipping part {part_number} in {file_path}: PART heading or title not found")
            continue

        chunked_sections = chunk_sections_with_titles(correct_part_text)
        # NOTE(review): results are keyed by file_path only, so a file listing
        # several part numbers keeps just the last part found. Every file
        # currently lists a single part; revisit the key before adding more.
        all_extracted_parts[file_path] = {
            "part_title": part_title,
            "sections": chunked_sections
        }

# Flatten the extracted parts into one record per chunk (no embeddings yet).
# Every record carries the chunk text plus the file, part, subpart and section
# it came from.
documents = []
for file_path, part_info in all_extracted_parts.items():
    for section, section_info in part_info["sections"].items():
        documents.extend(
            {
                "page_content": chunk,
                "metadata": {
                    "file_path": file_path,
                    "part_title": part_info["part_title"],
                    "subpart_title": section_info["subpart_title"],
                    "section": section
                }
            }
            for chunk in section_info["chunks"]
        )

# Persist the records so the embedding step can load them from disk
with open('document_metadata.json', 'w') as f:
    json.dump(documents, f)


Token indices sequence length is longer than the specified maximum sequence length for this model (1209 > 1024). Running this sequence through the model will result in indexing errors


In [14]:
# documents

In [15]:
from langchain.embeddings import HuggingFaceEmbeddings
import faiss
import numpy as np
import json

# Initialize HuggingFace embedding model
embedding_model = HuggingFaceEmbeddings()

# Load the document metadata (without embeddings)
with open('document_metadata.json', 'r') as f:
    documents = json.load(f)

if not documents:
    # Fail with a clear message instead of an IndexError further down.
    raise ValueError("document_metadata.json contains no documents to embed")

# Generate embeddings for all chunks in one batched call. embed_documents is
# the document-side API; embed_query is meant for search queries and encodes
# one text per call.
embeddings = embedding_model.embed_documents([doc['page_content'] for doc in documents])
for doc, embedding in zip(documents, embeddings):
    doc['metadata']['embedding'] = embedding

# Create FAISS index
dimension = len(embeddings[0])
index = faiss.IndexFlatL2(dimension)

# Add embeddings to FAISS index (FAISS expects a float32 matrix, one row per chunk)
embedding_matrix = np.asarray(embeddings, dtype='float32')
index.add(embedding_matrix)

# Save FAISS index to disk
faiss.write_index(index, "faiss_index.index")

# Save updated metadata with embeddings; record order matches the index rows
with open('document_metadata_with_embeddings.json', 'w') as f:
    json.dump(documents, f)


  embedding_model = HuggingFaceEmbeddings()
