In [12]:
from langchain.docstore.document import Document
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as LangChain documents.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list with one ``Document`` per page, in page order. Each document
        holds the page text and the metadata keys ``'page'`` (zero-based page
        index) and ``'source'`` (``pdf_path`` as passed in).
    """
    # The context manager closes the PDF even when text extraction raises,
    # so a broken page no longer leaks the open file handle.
    with fitz.open(pdf_path) as doc:
        return [
            Document(
                page_content=doc[page_num].get_text(),
                metadata={'page': page_num, 'source': pdf_path},
            )
            for page_num in range(doc.page_count)
        ]

def process_pdfs_in_folder(folder_path):
    """Recursively load every PDF found under a folder.

    Each visited path is printed as it is encountered.

    Args:
        folder_path: Directory to scan. Sub-directories are scanned too.

    Returns:
        A list with one entry per PDF found (including PDFs in nested
        folders); each entry is the list of page documents returned by
        ``extract_text_from_pdf`` for that file.
    """
    all_docs = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        print(file_path)
        if os.path.isdir(file_path):
            # Keep the documents found in the sub-folder; previously the
            # result of the recursive call was thrown away.
            all_docs.extend(process_pdfs_in_folder(file_path))
        elif filename.lower().endswith(".pdf"):
            # Case-insensitive match so that files named "*.PDF" are loaded too.
            all_docs.append(extract_text_from_pdf(file_path))
    return all_docs

In [13]:
# Folder that is scanned for PDF files
folder_path = "sample"

# process_pdfs_in_folder returns one list of page documents per PDF;
# merge them into a single flat list of pages.
loaded_documents = []
for pdf_pages in process_pdfs_in_folder(folder_path):
    loaded_documents.extend(pdf_pages)

print(f"Length of loaded pages: {len(loaded_documents)}")

sample\AFM_annualreport_2022.pdf
sample\mckinsey-tech-trends-outlook-2022-full-report.pdf
sample\mgi-reinventing-construction-a-route-to-higher-productivity-full-report.pdf
sample\Procter&Gamble_annualreport_2023.pdf
sample\the-state-of-organizations-2023.pdf
Length of loaded pages: 724


In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings, forwarded as chunk_size / chunk_overlap
CHUNK_SIZE = 512
CHUNK_OVERLAP = 64

# Split the loaded pages into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
texts = text_splitter.split_documents(loaded_documents)

# Number of chunks produced
len(texts)

4367

In [16]:
from tqdm import tqdm

from itertools import groupby

# Group consecutive chunks that share the same source PDF into one "document".
# full_df ends up as a list of documents; each document is a list of dicts with
# the chunk text under 'content' and its provenance under 'metadata'
# ('source', 'page' and 'chunk', the running chunk index within that source).
full_df = []

progress = tqdm(texts, total=len(texts), desc="Structure data")
for source, source_chunks in groupby(progress, key=lambda text: text.metadata['source']):
    document = [
        {'content': nlp_text.page_content,
         'metadata': {
             'source': source,
             'page': nlp_text.metadata['page'],
             'chunk': chunk_count
         }
        }
        for chunk_count, nlp_text in enumerate(source_chunks)
    ]
    # groupby never yields an empty group, so an empty `texts` now produces an
    # empty full_df instead of a single empty document.
    full_df.append(document)


Structure data: 100%|██████████| 4367/4367 [00:00<00:00, 1456466.73it/s]


# NER

In [25]:
import re

def clean_text(text):
    """Normalize a piece of text for downstream processing.

    The text is lowercased, ASCII punctuation and ASCII digits are removed,
    and every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space. Leading/trailing whitespace is collapsed but not stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    import string  # local import so the function does not depend on another cell

    # Drop punctuation and digits in a single pass.
    removed_chars = str.maketrans('', '', string.punctuation + string.digits)
    text = text.lower().translate(removed_chars)

    # Collapse whitespace last: removing a number that sits between two spaces
    # would otherwise leave a double space behind.
    return re.sub(r'\s+', ' ', text)

In [17]:
import spacy
from spacy import displacy

In [18]:
# Load the small English pipeline. Download it only when it is not installed
# yet, so re-running the notebook does not hit the network every time.
try:
    NER = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download("en_core_web_sm")
    NER = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [19]:
def spacy_large_ner(document):
    """Run the module-level ``NER`` pipeline on a text and collect its entities.

    Args:
        document: The text to analyse.

    Returns:
        A set of ``(entity_text, entity_label)`` tuples, where the entity text
        has surrounding whitespace stripped.
    """
    entities = set()
    for entity in NER(document).ents:
        entities.add((entity.text.strip(), entity.label_))
    return entities

In [28]:
# Limits for the NER loop further down.
# num_docs: bound on how many documents are processed; set to all of them here.
num_docs = len(full_df)
# num_nodes: bound on how many chunks per document are processed.
num_nodes = 100

In [31]:
from tqdm.notebook import tqdm

# Annotate chunks with named entities: the first `num_nodes` chunks of each of
# the first `num_docs` documents get an 'ner' entry (a set of (text, label)
# tuples). Slicing up front enforces the limits exactly (the previous
# check-after-processing handled one chunk too many) and lets the progress
# bars show the real amount of work.
for document in tqdm(full_df[:num_docs], desc="Creating nodes per document"):
    if not document:
        # Nothing to annotate, and no first chunk to read the source from.
        continue

    source = document[0]['metadata']['source']

    # The slice is a shallow copy: the chunk dicts are the same objects as in
    # full_df, so the 'ner' key is stored on the original chunks.
    # Iterating over tqdm closes each bar once its loop finishes.
    for chunk in tqdm(document[:num_nodes], desc=("Per chunk for "+source)):
        chunk['ner'] = spacy_large_ner(clean_text(chunk['content']))

Creating nodes per document:   0%|          | 0/5 [00:00<?, ?it/s]

Per chunk for sample\AFM_annualreport_2022.pdf:   0%|          | 0/1303 [00:00<?, ?it/s]

Per chunk for sample\mckinsey-tech-trends-outlook-2022-full-report.pdf:   0%|          | 0/643 [00:00<?, ?it/s…

Per chunk for sample\mgi-reinventing-construction-a-route-to-higher-productivity-full-report.pdf:   0%|       …

Per chunk for sample\Procter&Gamble_annualreport_2023.pdf:   0%|          | 0/802 [00:00<?, ?it/s]

Per chunk for sample\the-state-of-organizations-2023.pdf:   0%|          | 0/540 [00:00<?, ?it/s]

In [32]:
# Print every chunk with its provenance (source: page: chunk index), its text
# and the entities found in it.
for document in full_df:
    for chunk in document:
        meta = chunk['metadata']
        print(meta['source'], meta['page'], meta['chunk'], sep=": ")
        print(chunk['content'], end="\n\n")
        # Chunks beyond the per-document NER limit never receive an 'ner' key;
        # print an empty list for them instead of raising KeyError.
        print(list(chunk.get('ner', ())), end="\n\n")

sample\AFM_annualreport_2022.pdf: 0: 0
Annual Report  
& Accounts 2022
The power
of our people

[('annual', 'DATE')]

sample\AFM_annualreport_2022.pdf: 1: 1
Welcome to Alpha’s 2022
Annual Report & Accounts
Alpha is a leading global consultancy to the asset 
management, wealth management and insurance industries.
Perspective  |  Strategy  |  Technical Expertise  |  Data Solutions
Headquartered in the UK and quoted on the AIM of the London Stock Exchange, Alpha Financial 
Markets Consulting1 is a leading global provider of specialist consultancy services to the asset 
management, wealth management and insurance industries.

[('annual', 'DATE'), ('uk', 'GPE'), ('the london stock exchange', 'ORG')]

sample\AFM_annualreport_2022.pdf: 1: 2
management, wealth management and insurance industries. 
Alpha has worked with all of the world’s top 20 and 76% of the world’s top 50 asset managers by AUM, 
along with a wide range of other buy-side firms. It has the largest dedicated team in the industr