<a href="https://colab.research.google.com/github/IyadSultan/Basic_Flask_site/blob/master/educational/Named_Entity_Extraction/Named_Entity_Extraction_spaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Extract any named entities from PDF using custom Spacy pipeline**


https://medium.com/@knowledgrator/extract-any-named-entities-from-pdf-using-custom-spacy-pipeline-9fd0af2c3e13

In [1]:
# !pip install pdfminer.six spacy
# !python -m spacy download en_core_web_sm

In [2]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
from tqdm import tqdm
import re

In [3]:
def process_document(pdf_path, page_ids=None):
   extracted_pages = extract_pages(pdf_path, page_numbers=page_ids)


   page2content = {}


   # Process each extracted page
   for extracted_page in tqdm(extracted_pages):
       page_id = extracted_page.pageid
       content = process_page(extracted_page)
       page2content[page_id] = content


   return page2content

In [4]:
def process_page(extracted_page):
   content = []


   # Get a sorted list of elements based on
                # their Y-coordinate in reverse order
   elements = [element for element in extracted_page._objs]
   elements.sort(key=lambda a: a.y1, reverse=True)


   for i, element in enumerate(elements):
       # Extract text if the element is a text container
       # and text extraction is enabled
       if isinstance(element, LTTextContainer):
           line_text = extract_text_and_normalize(element)
           content.append(line_text)

   # Combine and clean up the extracted content
   content = re.sub('\n+', '\n', ''.join(content))
   return content

In [5]:
def extract_text_and_normalize(element):
   # Extract text from line and split it with new lines
   line_texts = element.get_text().split('\n')
   norm_text = ''
   for line_text in line_texts:
       line_text=line_text.strip()
       # empty strings after striping convert to newline character
       if not line_text:
           line_text = '\n'
       else:
           line_text = re.sub('\s+', ' ', line_text)
           # if the last character is not a letter or number,
                                # add newline character to a line
           if not re.search('[\w\d\,\-]', line_text[-1]):
               line_text+='\n'
           else:
               line_text+=' '
       # concatenate into single string
       norm_text+=line_text
   return norm_text

In [6]:
!curl "https://s28.q4cdn.com/781576035/files/doc_financials/2022/ar/PFE-2022-Form-10K-FINAL-(without-Exhibits).pdf" > pfizer-report.pdf

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1565k  100 1565k    0     0  1638k      0 --:--:-- --:--:-- --:--:-- 1637k


In [7]:
pdf_path = 'pfizer-report.pdf'
page2content = process_document(pdf_path, page_ids=[9])

1it [00:00,  1.66it/s]


In [8]:
import spacy
nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])

In [9]:
from spacy import displacy

text = page2content[1]
doc = nlp(text)
displacy.serve(doc, style="ent")




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


# Extract IRB number from all PDFs in a folder

In [11]:
# !pip install PyMuPDF


import os
import spacy
import fitz  # PyMuPDF
from spacy.matcher import Matcher

def extract_text_from_pdf(pdf_path):
    """Extract text from a single PDF file."""
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

def extract_irb_numbers(text, nlp):
    """Extract IRB numbers from the given text using spaCy."""
    matcher = Matcher(nlp.vocab)
    pattern = [{"TEXT": {"REGEX": "([0-9]{2}KHCC[0-9]{2})"}}]
    matcher.add("IRB_NUMBER", [pattern])

    doc = nlp(text)
    matches = matcher(doc)

    irb_numbers = [doc[start:end].text for match_id, start, end in matches]
    return irb_numbers

def process_pdfs_in_folder(folder_path):
    """Process all PDF files in the given folder."""
    nlp = spacy.load("en_core_web_sm")
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            text = extract_text_from_pdf(pdf_path)
            irb_numbers = extract_irb_numbers(text, nlp)
            print(f"IRB Numbers in {filename}: {irb_numbers}")

# Example usage
folder_path = "/content/"
process_pdfs_in_folder(folder_path)



IRB Numbers in pfizer-report.pdf: []
IRB Numbers in sample.pdf: ['18KHCC02']
