In [None]:
import re
import matplotlib.pyplot as plt
import pickle
import os

#  Scraping
from pdfminer.high_level import extract_text
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from io import StringIO

import pandas as pd


import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import PyPDF2
import re


# Langchain
from langchain.vectorstores import Chroma
from langchain.document_loaders import WebBaseLoader
from langchain.document_loaders import PyPDFLoader
from langchain.retrievers import SVMRetriever
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.memory import ConversationBufferMemory
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain import PromptTemplate, LLMChain


from langchain.chains import ConversationalRetrievalChain
from langchain.chains import RetrievalQAWithSourcesChain

from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter



from langchain.embeddings import SentenceTransformerEmbeddings

# Sentence Transformers
from sentence_transformers import SentenceTransformer

# Chroma 
import chromadb 
from chromadb.utils import embedding_functions
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS

#Model 
from langchain.llms import GPT4All

# Sentence Transformers
from sentence_transformers import SentenceTransformer

## Extract file's metadata

In [None]:
def extract_doc_title(pdf_path):
    """Derive a document title from the path of a PDF file.

    The title is the last "/"-separated component of ``pdf_path`` with a
    trailing ".pdf" extension (any letter case) removed.

    Args:
        pdf_path (str): Path of the PDF file, using "/" as separator.

    Returns:
        str: The file name without its ".pdf" extension. Names that do not
        end in ".pdf" are returned unchanged.
    """
    file_name = pdf_path.split("/")[-1]
    # Strip the extension only at the end of the name: str.replace() would
    # also delete a ".pdf" that appears in the middle of the file name.
    if file_name.lower().endswith(".pdf"):
        file_name = file_name[:-len(".pdf")]
    return file_name


## Extract table of contents

In [None]:
def find_toc_pages(pdf_path):
    """Locate the table of contents (TOC) inside a PDF.

    Pages are scanned in order. The TOC starts on the first page that has a
    line matching "table of contents" (case-insensitive). From that point on,
    a page counts as a TOC page when at least one of its lines looks like a
    numbered entry ("<number> <title> <page>"). The scan stops at the first
    page, at or after the heading page, that has no such line.

    Args:
        pdf_path (str): Path of the PDF file to read.

    Returns:
        tuple: ``(toc_start_page, toc_end_page)``.
            ``toc_start_page`` is the 1-based number of the page holding the
            TOC heading, or None when no heading was found.
            ``toc_end_page`` is the 0-based index of the first page without a
            TOC entry line, the total page count when entries run through
            the last page, or None when no heading was found.
    """
    # Compiled once instead of being rebuilt for every line of every page.
    toc_heading_re = re.compile(r'\s*table\s+of\s+contents\s*', re.IGNORECASE)
    #toc_entry_re (older variant): r'^(\d+(\.\d+)*)\s+(.*?)\s+(\d+)\s*$'
    toc_entry_re = re.compile(r'^(\d+(\.\d+)*(\.)*)\s+(.*?)\s+(\d+)(\-\d+)*\s*$', re.IGNORECASE)

    toc_start_page = None
    toc_end_page = None

    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_count = len(pdf_reader.pages)
        toc_started = False

        for page_number in range(page_count):
            page_text = pdf_reader.pages[page_number].extract_text()
            toc_page = False

            for line in page_text.split('\n'):
                if toc_heading_re.search(line):
                    if toc_start_page is None:
                        toc_start_page = page_number + 1  # Add 1 because page numbers are 1-based
                    toc_started = True

                # Once the TOC has started, a single entry-like line is enough
                # to classify the whole page as a TOC page.
                if toc_started and toc_entry_re.search(line):
                    toc_page = True
                    break

            if toc_started and not toc_page:
                toc_end_page = page_number  # First page without any TOC entry
                break

        # The TOC ran through the last page: report the page count instead of
        # None so callers can still use the value as a range bound.
        if toc_started and toc_end_page is None:
            toc_end_page = page_count

    return toc_start_page, toc_end_page


In [None]:
def extract_table_of_contents(pdf_path):
    """Parse the numbered entries of a PDF's table of contents.

    The page range is obtained from ``find_toc_pages``; every non-blank line
    in that range matching "<number> <title> <page>[-<page>]" becomes one
    entry.

    Args:
        pdf_path (str): Path of the PDF file to read.

    Returns:
        list[tuple[str, str, str]] | None: ``(title_number, title, page)``
        tuples in reading order, all three as strings, or None when
        ``find_toc_pages`` reports no TOC start page.
    """
    toc_start_page, toc_end_page = find_toc_pages(pdf_path)
    if toc_start_page is None:
        return None

    #older pattern: r'^(\d+(\.\d+)*)\s+(.*?)\s+(\d+)\s*$'
    toc_entry_re = re.compile(r'^(\d+(\.\d+)*(\.)*)\s+(.*?)\s+(\d+)(\-\d+)*\s*$')

    toc_entries = []
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # No end page reported: read to the end of the document rather than
        # failing on range(..., None).
        if toc_end_page is None:
            toc_end_page = len(pdf_reader.pages)

        for page_index in range(toc_start_page - 1, toc_end_page):
            page_text = pdf_reader.pages[page_index].extract_text()

            for line in page_text.split('\n'):
                if not line.strip():
                    continue
                toc_match = toc_entry_re.match(line)
                if toc_match:
                    # Kept in its own name so the page loop variable is not
                    # overwritten by the page reference of the entry.
                    target_page = toc_match.group(5)
                    title_number = toc_match.group(1)
                    # Removes dot leaders ("Title ..... 12"); note that dots
                    # belonging to the title itself are removed as well.
                    title = toc_match.group(4).replace(".", "").rstrip()
                    toc_entries.append((title_number, title, target_page))

    return toc_entries

In [None]:
class TreeNode:
    """One node of the table-of-contents tree.

    Attributes:
        data: Identifier of the node (the section number, or "Root").
        title: Section title, or None when not given.
        children: Child nodes, in insertion order.
        parent: Parent node, or None when not given.
        page: Page reference taken from the TOC entry, or None.
    """

    def __init__(self, data, title=None, parent=None, page = None):
        self.data = data
        self.title = title
        self.children = []
        self.parent = parent
        self.page = page

    def add_parent(self, parent):
        """Set ``parent`` as this node's parent.

        Only the back-reference is updated; the node is not added to
        ``parent.children``.
        """
        self.parent = parent

def find_node_by_number(root, target_data):
    """Return the first node whose ``data`` equals ``target_data``.

    The tree is searched depth-first in pre-order starting at ``root``
    (a node is checked before its children, children in list order).

    Args:
        root: Node to start from; must expose ``data`` and ``children``.
        target_data: Value compared against each node's ``data``.

    Returns:
        The first matching node, or None when no node matches.
    """
    pending = [root]
    while pending:
        candidate = pending.pop()
        if candidate.data == target_data:
            return candidate
        # Reversed so that the first child ends up on top of the stack and is
        # visited first, which keeps the pre-order of a recursive search.
        pending.extend(reversed(candidate.children))
    return None

def build_tree(toc_list):
    """Build a tree of ``TreeNode`` objects from TOC entries.

    Each entry is indexed as ``(number, title, page)``. The hierarchy comes
    from the dots in the number: the parent of "1.2.3" is "1.2". Ancestors
    that are missing when an entry is inserted are created as placeholder
    nodes with an empty title and the page of that entry.

    Args:
        toc_list: Iterable of indexable entries ``(number, title, page)``.

    Returns:
        TreeNode: The synthetic root node, whose ``data`` is "Root".
    """
    root = TreeNode("Root")

    for entry in toc_list:
        number = str(entry[0])
        heading = entry[1]
        page_ref = entry[2]

        last_dot = number.rfind('.')
        if last_dot == -1:
            # No dot in the number: top-level section, attached to the root.
            root.children.append(TreeNode(number, parent = root, title = heading, page = page_ref))
            continue

        owner = find_node_by_number(root, number[0:last_dot])
        if owner is None:
            # Create every missing ancestor, shortest prefix first, so that
            # each new placeholder finds its own parent already in the tree.
            dot_positions = [idx for idx, char in enumerate(number) if char == "."]
            for depth, cut in enumerate(dot_positions):
                prefix = number[0:cut]
                if find_node_by_number(root, prefix) is not None:
                    continue
                if depth == 0:
                    holder = root
                else:
                    holder = find_node_by_number(root, number[0:dot_positions[depth - 1]])
                holder.children.append(TreeNode(prefix, parent = holder, title = "", page = page_ref))
            owner = find_node_by_number(root, number[0:last_dot])

        owner.children.append(TreeNode(number, parent = owner, title = heading, page = page_ref))

    return root


In [None]:
def find_all_titles(toc_tree, number_chapter):
    """Build the "/"-separated title path leading to a section.

    Each path segment is "<number> <title>", ordered from the top-level
    ancestor down to the requested section, and every segment is preceded by
    "/" (for example "/1 Intro/1.2 Scope").

    Args:
        toc_tree: Root node of the TOC tree.
        number_chapter: Section number to look up; converted with ``str``.

    Returns:
        str: The title path. When the number is not in the tree, the path
        consists of the number alone ("/<number>").
    """
    node = find_node_by_number(toc_tree, str(number_chapter))
    if node is None:
        # Unknown section: fall back to the bare number instead of failing.
        return "/" + str(number_chapter)

    titles_list = [node.data + " " + node.title]

    # Walk up to, but not including, the synthetic "Root" node. The None check
    # covers nodes that have no parent at all.
    while node.parent is not None and node.parent.data != "Root":
        titles_list.append(node.parent.data + " " + node.parent.title)
        node = node.parent

    # titles_list runs from the section up to its top ancestor; reverse it.
    return "".join("/" + segment for segment in reversed(titles_list))


#list_node = find_all_titles(toc_tree, 3.2)
#list_node

## Extract paragraphs

In [None]:
def extract_text_between_titles(pdf_path, toc_entries, header_len = 4):
    """Split the body of a PDF into one text section per TOC entry.

    For every TOC entry, the pages after the table of contents are scanned
    line by line. A line starts the section when it contains both the title
    (compared case-insensitively with spaces removed) and the title number
    followed by a space. The section ends at the first line that matches the
    next TOC entry in the same way; the last entry runs to the end of the
    document.

    Args:
        pdf_path (str): Path of the PDF file to read.
        toc_entries: Sequence of ``(title_number, title, page)`` entries.
        header_len (int): Number of leading lines skipped on every page
            (treated as the page header).

    Returns:
        list[dict]: One dict per extracted section with the keys
        'Title Number', 'Title', 'Page', 'Text', 'Source' and
        'Documentation'. Entries whose title is never found are omitted,
        except the last entry, which is always emitted.
    """
    text_list = []
    pdf_text = []

    _, toc_end_page = find_toc_pages(pdf_path)
    doc_title = extract_doc_title(pdf_path)

    toc_tree = build_tree(toc_entries)

    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_count = len(pdf_reader.pages)

        # Without a known TOC end there is no body to read; this avoids a
        # TypeError on "None + 1".
        # NOTE(review): start_page is used as a 0-based page index. Depending
        # on whether toc_end_page is 0- or 1-based, the "+ 1" may skip the
        # first page after the TOC - confirm this is intended.
        start_page = page_count if toc_end_page is None else toc_end_page + 1

        for page_number in range(start_page, page_count):
            pdf_text.append(pdf_reader.pages[page_number].extract_text())

    def make_section(title_number, title, page_number_toc, section_lines):
        # Assemble the record stored for one section.
        titles = find_all_titles(toc_tree, title_number)
        return {
            'Title Number': title_number,
            'Title': title,
            'Page': page_number_toc,
            'Text': '\n'.join(section_lines),
            'Source': doc_title + titles,
            'Documentation': doc_title
        }

    for i, toc_entry in enumerate(toc_entries):
        title_number, title, page_number_toc = toc_entry

        #Get next title name and number
        if i < len(toc_entries)-1:
            title_number_next, title_next, _ = toc_entries[i+1]
        else:
            title_number_next = None
            title_next = None

        # Matching keys are loop-invariant, so they are computed once per entry.
        title_key = title.lower().replace(" ", "")
        number_key = f"{title_number} "
        if title_number_next is not None:
            next_title_key = title_next.lower().replace(" ", "")
            next_number_key = f"{title_number_next} "

        next_title_found = False
        current_title_found = False
        current_text = []

        for page_text in pdf_text:
            lines = page_text.split('\n')

            for line_number, line in enumerate(lines, start=1):
                if line_number <= header_len:
                    #ignore headers
                    continue
                if line_number == len(lines) and re.match(r'^\s+(\d+)\s+$', line):
                    #Ignore page_number
                    continue

                squashed_line = line.lower().replace(" ", "")

                if title_key in squashed_line and number_key in line:
                    # Start a new section for the current title
                    current_title_found = True
                    current_text = []
                elif title_number_next is None:
                    # Last entry: every line is collected. Lines seen before
                    # the title are discarded when the title resets the
                    # buffer; if the title is never found they are kept.
                    # NOTE(review): confirm that is the intended fallback.
                    current_text.append(line)
                elif next_title_key in squashed_line and next_number_key in line:
                    next_title_found = True
                    # Append the section if a title was found
                    if current_title_found:
                        text_list.append(make_section(title_number, title, page_number_toc, current_text))
                    current_title_found = False
                    break
                elif current_title_found:
                    # Append the line to the current text if it's not a title
                    current_text.append(line)

            if next_title_found:
                # The section is complete: stop scanning the remaining pages
                # so a later stray match cannot emit the section again.
                break

        if title_number_next is None:
            # Add the current text for the last title
            text_list.append(make_section(title_number, title, page_number_toc, current_text))

    return text_list

## Clean Data

In [None]:
def clean_text(text):
    """
    Normalise a text fragment before it is embedded.

    Steps, in order:
    - HTML tags are dropped (BeautifulSoup ``get_text`` with a space separator)
    - Markdown images, links, ``*``/``**`` emphasis spans and everything from
      a ``#`` to the end of its line are deleted, including their inner text
    - the literal entities ``&amp;``, ``&lt;`` and ``&gt;`` are replaced by
      ``&``, ``<`` and ``>``
    - ``http://`` / ``https://`` URLs are deleted
    - runs of whitespace are collapsed to single spaces and the ends trimmed

    Args:
    - text (str): The input string to be cleaned.

    Returns:
    - str: The cleaned string.
    """
    # 1. HTML markup
    cleaned = BeautifulSoup(text, "html.parser").get_text(separator=' ')

    # 2. Markdown constructs (one alternation, applied in a single pass)
    markdown_pattern = r'\!\[.*?\]\(.*?\)|\[(.*?)\]\(.*?\)|\*\*.*?\*\*|\*.*?\*|#[^\n]*'
    cleaned = re.sub(markdown_pattern, '', cleaned)

    # 3. Common HTML entities; the order matters because '&amp;' goes first
    for entity, character in (('&amp;', '&'), ('&lt;', '<'), ('&gt;', '>')):
        cleaned = cleaned.replace(entity, character)

    # 4. URLs
    cleaned = re.sub(r'http[s]?://\S+', '', cleaned)

    # 5. Whitespace
    return ' '.join(cleaned.split())

def clean_documents(documents):
    """Clean the ``page_content`` of every document in place.

    Args:
        documents: Iterable of objects with a ``page_content`` attribute.

    Returns:
        The same ``documents`` object that was passed in.
    """
    for document in documents:
        document.page_content = clean_text(document.page_content)
    return documents

def clean_text_entries(text_entries):
    """Clean the 'Text' value of every section entry in place.

    Args:
        text_entries: Iterable of dicts that each have a 'Text' key.

    Returns:
        The same ``text_entries`` object that was passed in, matching what
        ``clean_documents`` does for documents.
    """
    for text_entry in text_entries:
        text_entry['Text'] = clean_text(text_entry['Text'])
    return text_entries

## Add several PDFs

In [None]:
def get_pdf_files(folder_path):
    """List the PDF files that sit directly inside a folder.

    Args:
        folder_path (str): Folder to inspect (not searched recursively).

    Returns:
        list[str]: Names (not full paths) of the regular files ending in
        '.pdf', sorted alphabetically.
    """
    # os.listdir returns names in arbitrary order; sorting makes the
    # processing order the same on every run.
    return sorted(
        file for file in os.listdir(folder_path)
        if file.endswith('.pdf') and os.path.isfile(os.path.join(folder_path, file))
    )

# Folder that holds the PDFs to index, relative to the notebook's working directory
folder_path = '../docs/EPOCH Docs/'

# File names only (not full paths); later cells prepend folder_path themselves
pdf_list = get_pdf_files(folder_path)

# Print the list of PDF files
print("List of PDF files:")
for pdf_file in pdf_list:
    print(pdf_file)


In [None]:
# Extract the sections of every PDF and gather them in a single DataFrame.
# The records are collected in a list and the frame is built once, which
# avoids re-copying the data on every pd.concat and gives a unique index.
section_records = []
for pdf_path in pdf_list:
    print(pdf_path)
    pdf_full_path = folder_path + "/" + pdf_path
    toc_entries = extract_table_of_contents(pdf_full_path)
    if toc_entries is None:
        print("This PDF doesn't have a TOC")
        continue
    text_entries = extract_text_between_titles(pdf_full_path, toc_entries)
    # Cleans the 'Text' values of the entries in place
    clean_text_entries(text_entries)
    section_records.extend(text_entries)

df_final = pd.DataFrame(section_records)
df_final

In [None]:
# Number of whitespace-separated tokens in each section's text (adds a column to df_final in place)
df_final['word_count'] = df_final['Text'].apply(lambda x: len(x.split()))

## Store in Chroma

In [None]:
# Location of the on-disk Chroma store. Set the CHROMA_PATH environment
# variable to use another folder; the default is the original local path.
CHROMA_PATH = os.environ.get("CHROMA_PATH", 'C:/Users/Nathan/Kratos_data-Science/Chroma/v7')

# Both names refer to the same client object; later cells use either one.
chroma_client = client = chromadb.PersistentClient(path=CHROMA_PATH)

In [None]:
# Start from a clean collection. Deleting is skipped when "EPOCH" does not
# exist yet, so this cell also runs against a brand-new store.
# list_collections() yields collection objects or plain names depending on
# the chromadb version; getattr covers both.
existing_collections = {getattr(collection, "name", collection) for collection in client.list_collections()}
if "EPOCH" in existing_collections:
    client.delete_collection(name="EPOCH")

In [None]:
# Sentence-embedding model used to encode each section's text; pinned to the CPU
model = SentenceTransformer('all-MiniLM-L6-v2',  device='cpu')

In [None]:
# Get the "EPOCH" collection, creating it if it does not exist.
# Despite its name, `vectorstore` holds the object returned by
# get_or_create_collection on the raw chromadb client.
vectorstore = chroma_client.get_or_create_collection(name="EPOCH")


In [None]:
# Parallel lists: position i of each list describes the same section
documents_list, embeddings_list, metadatas_list = [], [], []

# The 'Text' column of df_final is both the stored document and the text that is embedded
for _, row in df_final.iterrows():
    section_text = row['Text']
    embedding = model.encode(section_text)

    # Metadata values are stored as strings, except the word count
    metadata = {
        "source": str(row['Source']),
        "title_number": str(row['Title Number']),
        "title": str(row['Title']),
        "page_number": str(row['Page']),
        "word_count": row['word_count'],
        "documentation": str(row['Documentation']),
        "file_path": "path"
    }

    documents_list.append(section_text)
    embeddings_list.append(embedding.tolist())
    metadatas_list.append(metadata)

# Sequential ids "v1", "v2", ... in insertion order
ids_list = [f"v{position}" for position in range(1, len(documents_list) + 1)]

# Store texts, embeddings and metadata in the Chroma collection in one call
vectorstore.add(
    documents=documents_list,
    embeddings=embeddings_list,
    metadatas=metadatas_list,
    ids=ids_list
)

In [None]:
# Display the collection object as the cell output
vectorstore

In [None]:
# Sanity check: number of entries now stored in the collection
print("There are", vectorstore.count(), "documents in the collection")