# Pdfplumber 

In [None]:
import pandas as pd
import os
import pdfplumber
import re

# Path to local PDF files
pdf_folder = r"C:\Users\emlu\OneDrive - Netcompany\Desktop\Virtual Tech Vestas\RAG_LLM_POC_v1\data\Vestas_RTP\Documents\Documents"

# Create Regex Patterns
dms_pattern = re.compile(r"\b\d{4}-\d{4}\b")
ver_pattern = re.compile(r"\bVER[:\s]+(\d{2})\b", re.IGNORECASE)
date_pattern = re.compile(r"Date[:\s]*([\d]{4}-[\d]{2}-[\d]{2})", re.IGNORECASE)
exported_date_pattern = re.compile(r"Exported from DMS.*?(\d{4}-\d{2}-\d{2})", re.IGNORECASE)
doc_type_pattern = re.compile(r"\bT\d{2}\b")
classification_pattern = re.compile(r"\b(Confidential|Restricted)\b(?![a-zA-Z])", re.IGNORECASE)
approved_pattern = re.compile(r"\bApproved\b", re.IGNORECASE)

# Results list
results = []

for filename in os.listdir(pdf_folder):
    if filename.lower().endswith(".pdf"):
        filepath = os.path.join(pdf_folder, filename)

        with pdfplumber.open(filepath) as pdf:
            text = pdf.pages[0].extract_text() or ""

        # Metadata extraction
        dms_match = dms_pattern.search(text)
        ver_match = ver_pattern.search(text)
        exported_match = exported_date_pattern.search(text)
        doc_type_match = doc_type_pattern.search(text)
        classification_match = classification_pattern.search(text)
        date_match = date_pattern.search(text)
        approved_match = approved_pattern.search(text)


        result = {
            "filename": filename,
            "DMS No.": dms_match.group(0) if dms_match else None,
            "Ver": ver_match.group(1) if ver_match else None,
            "Date (Doc hist)": date_match.group(1) if date_match else None,
            "Date (Exported from DMS)": exported_match.group(1) if exported_match else None,
            "Doc type": doc_type_match.group(0) if doc_type_match else None,
            "Classification": classification_match.group(1).capitalize() if classification_match else None,
            "Approved": "Y" if approved_match else None  
        }

        results.append(result)

# Convert to DataFrame
df_pdfplumber = pd.DataFrame(results)

In [None]:
#Checking Output
df_pdfplumber

# Pdfplumber new 

In [None]:
import pdfplumber
import os
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import pandas as pd
import re

pdf_folder = r"C:\Users\emlu\OneDrive - Netcompany\Desktop\Virtual Tech Vestas\RAG_LLM_POC_v1\data\Vestas_RTP\Documents\Documents"


### Step 1 extract values by means of pdfplumber ###

# Regex patterns
dms_pattern = re.compile(r"\b\d{4}-\d{4}\b")
ver_pattern = re.compile(r"\bVER[:\s]+(\d{2})\b", re.IGNORECASE)
date_pattern = re.compile(r"Date[:\s]*([\d]{4}-[\d]{2}-[\d]{2})", re.IGNORECASE)
exported_date_pattern = re.compile(r"Exported from DMS.*?(\d{4}-\d{2}-\d{2})", re.IGNORECASE)
doc_type_pattern = re.compile(r"\bT(?:\d{2}|[oO]\d)\b")
classification_pattern = re.compile(r"\b(Confidential|Restricted)\b(?![a-zA-Z])", re.IGNORECASE)
approved_pattern = re.compile(r"\bApproved\b", re.IGNORECASE)


def extract_pdf_metadata(pdf_folder: str) -> pd.DataFrame:
    
    results = []

    for filename in os.listdir(pdf_folder):
        if filename.lower().endswith(".pdf"):
            filepath = os.path.join(pdf_folder, filename)

            with pdfplumber.open(filepath) as pdf:
                text = pdf.pages[0].extract_text() or ""

            # Metadata extraction
            dms_match = dms_pattern.search(text)
            ver_match = ver_pattern.search(text)
            exported_match = exported_date_pattern.search(text)
            doc_type_match = doc_type_pattern.search(text)
            classification_match = classification_pattern.search(text)
            date_match = date_pattern.search(text)
            approved_match = approved_pattern.search(text)

            result = {
                "FILENAME": filename,
                "DMS_ID": dms_match.group(0) if dms_match else None,
                "VERSION": ver_match.group(1) if ver_match else None,
                "DATE": date_match.group(1) if date_match else None,
                "DATE_EXPORTED)": exported_match.group(1) if exported_match else None,
                "DOC_TYPE": doc_type_match.group(0) if doc_type_match else None,
                "CONFIDENTIALITY": classification_match.group(1).capitalize() if classification_match else None,
                "APPROVED": bool(approved_pattern.search(text))
            }

            results.append(result)

    return pd.DataFrame(results)

# Create dataframe to store values extracted with pdfplumber
df_pdfplumber = extract_pdf_metadata(pdf_folder)


### Step 2 - extract values by means of OCR (tesseract) ###

def extract_first_page_images(pdf_folder: str) -> list:
    images = []
    for filename in os.listdir(pdf_folder):
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder, filename)
            try:
                doc = fitz.open(pdf_path)
                if doc.page_count == 0:
                    continue
                page = doc.load_page(0)
                pix = page.get_pixmap(dpi=400)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                images.append((filename, img))
                doc.close()
            except Exception as e:
                print(f"Failed to process {filename}: {e}")
    return images


def crop_right_bottom_region(image: Image.Image) -> Image.Image:
    """
    Crops 5% from the right and 50% from the bottom, then rotates the result 90 degrees clockwise.
    """
    width, height = image.size
    left = width * 0.95
    upper = height * 0.5
    right = width
    lower = height
    crop_box = (int(left), int(upper), int(right), int(lower))
    cropped = image.crop(crop_box)

    # Rotate 90 degrees clockwise
    rotated = cropped.transpose(Image.ROTATE_270)  # PIL rotates counter-clockwise, so 270 = clockwise 90 degrees

    return rotated


def extract_text_from_cropped_images(pdf_folder: str) -> pd.DataFrame:
    data = []
    pdf_images = extract_first_page_images(pdf_folder)

    for filename, image in pdf_images:
        cropped_img = crop_right_bottom_region(image)
        text = pytesseract.image_to_string(cropped_img, lang="eng")
        data.append({
            "pdf_filename": filename,
            "extracted_text": text.strip()
        })

    df = pd.DataFrame(data)
    return df

# Create dataframe to store cropped text 
df_cropped_text = extract_text_from_cropped_images(pdf_folder)


def extract_matches(text):
    doc_type_match = doc_type_pattern.search(text)
    if doc_type_match:
        doc_type = doc_type_match.group(0).replace("O", "0")
    else:
        doc_type = None

    return {
        'DMS_ID': dms_pattern.search(text).group(0) if dms_pattern.search(text) else None,
        'VERSION': ver_pattern.search(text).group(1) if ver_pattern.search(text) else None,
        'DATE': date_pattern.search(text).group(1) if date_pattern.search(text) else None,
        'EXPORTED_DATE': exported_date_pattern.search(text).group(1) if exported_date_pattern.search(text) else None,
        'DOC_TYPE': doc_type,
        'CONFIDENTIALITY': classification_pattern.search(text).group(1) if classification_pattern.search(text) else None,
        'APPROVED': bool(approved_pattern.search(text))
    }

# Create dataframe from cropped text 
df_ocr = df_cropped_text['extracted_text'].apply(extract_matches).apply(pd.Series)


df_combined = df_pdfplumber.where(df_pdfplumber.notna(), df_ocr)

df_combined

# OCR - Tesseract

In [None]:
import pandas as pd 
import os
import fitz  # PyMuPDF
import pytesseract  # OCR
from PIL import Image
import io
import re

# Path to local PDF files
pdf_folder = r"C:\Users\emlu\OneDrive - Netcompany\Desktop\Virtual Tech Vestas\RAG_LLM_POC_v1\data\Vestas_RTP\Documents\Documents"

# Create Regex Patterns
dms_pattern = re.compile(r"\b\d{4}-\d{4}\b")
ver_pattern = re.compile(r"\bVER[:\s]+(\d{2})\b", re.IGNORECASE)
date_pattern = re.compile(r"Date[:\s]*([\d]{4}-[\d]{2}-[\d]{2})", re.IGNORECASE)
exported_pattern = re.compile(r"Exported from DMS.*?(\d{4}-\d{2}-\d{2})", re.IGNORECASE)
doc_type_pattern = re.compile(r"\bT\d{2}\b")
classification_pattern = re.compile(r"\b(Confidential|Restricted)\b(?![a-zA-Z])", re.IGNORECASE)
approved_pattern = re.compile(r"\bApproved\b", re.IGNORECASE)

# Results list
results = []

for filename in os.listdir(pdf_folder):
    if filename.lower().endswith(".pdf"):
        filepath = os.path.join(pdf_folder, filename)
        doc = fitz.open(filepath)

        # Take the first page as an image
        pix = doc[0].get_pixmap(dpi=500)
        img_data = Image.open(io.BytesIO(pix.tobytes("png")))

        # OCR with pytesseract
        ocr_text = pytesseract.image_to_string(img_data)

        # Metadata extraction
        dms_match = dms_pattern.search(ocr_text)
        ver_match = ver_pattern.search(ocr_text)
        date_match = date_pattern.search(ocr_text)
        exported_match = exported_pattern.search(ocr_text)
        doc_type_match = doc_type_pattern.search(ocr_text)
        classification_match = classification_pattern.search(ocr_text)
        approved_match = approved_pattern.search(ocr_text)

        result = {
            "filename": filename,
            "DMS No.": dms_match.group(0) if dms_match else None,
            "Ver": ver_match.group(1) if ver_match else None,
            "Date (Doc hist)": date_match.group(1) if date_match else None,
            "Date (Exported from DMS)": exported_match.group(1) if exported_match else None,
            "Doc type": doc_type_match.group(0) if doc_type_match else None,
            "Classification": classification_match.group(1).capitalize() if classification_match else None,
            "Approved": "Y" if approved_match else "N" 
        }

        results.append(result)

# Convert to DataFrame
df_ocr = pd.DataFrame(results)


In [49]:
df_combined = df_pdfplumber.where(df_pdfplumber.notna(), df_ocr)

df_combined

Unnamed: 0,filename,DMS No.,Ver,Date (Doc hist),Date (Exported from DMS),Doc type,Classification,Approved
0,0078-6200_V07 - 0078-6200_4MW Mk3E Setting and...,0078-6200,7,,2024-12-02,T07,Confidential,Y
1,0079-6646 09. SII-Nacelle (yearly).pdf,0079-6646,10,2024-07-25,2024-10-24,T09,Confidential,Y
2,0109-7505_V05 - TIS_4MW Mk3 Ready to Protect C...,0109-7505,5,,2024-10-15,T12,Confidential,Y
3,0138-1002_V04 - Battery Cell Impedance Testing...,0138-1002,4,2023-09-05,2024-12-02,T09,Confidential,Y
4,Firmware revision 5.2 in the 230 V UPS.pdf,0088-1273,0,2019-12-12,2024-12-11,T09,Restricted,Y
5,Installation and operation - Smart-UP RT.PDF,0030-8011,0,,2024-12-11,,Confidential,Y
6,Rotor locking system.pdf,0077-2293,3,2022-12-03,2025-03-03,T09,Restricted,Y
7,"SM,CIM4449, repl. battery and cable.pdf",0109-6919,1,2021-11-16,2024-12-12,,Confidential,Y
8,TIS_4MW RtoP Recovery Flow Chart (Mk3A-B-E).pdf,0109-7505,5,,2024-10-15,,Confidential,Y
9,User guide for the ready-to_x0002_protect (Rto...,0079-9386,0,2019-02-28,2024-10-15,T09,Restricted,Y


# Tesseract w functions

In [None]:
import pandas as pd
import os
import fitz  # PyMuPDF
import pytesseract  # OCR
from PIL import Image
import io
import re

def extract_pdf_data(pdf_folder):
    # Create Regex Patterns
    dms_pattern = re.compile(r"\b\d{4}-\d{4}\b")
    ver_pattern = re.compile(r"\bVER[:\s]+(\d{2})\b", re.IGNORECASE)
    date_pattern = re.compile(r"Date[:\s]*([\d]{4}-[\d]{2}-[\d]{2})", re.IGNORECASE)
    exported_pattern = re.compile(r"Exported from DMS.*?(\d{4}-\d{2}-\d{2})", re.IGNORECASE)
    doc_type_pattern = re.compile(r"\bT\d{2}\b")
    classification_pattern = re.compile(r"\b(Confidential|Restricted)\b(?![a-zA-Z])", re.IGNORECASE)
    approved_pattern = re.compile(r"\bApproved\b", re.IGNORECASE)

    # Results list
    results = []

    # Process each PDF file in the folder
    for filename in os.listdir(pdf_folder):
        if filename.lower().endswith(".pdf"):
            filepath = os.path.join(pdf_folder, filename)
            doc = fitz.open(filepath)

            # Metadata extraction
            dms_match = dms_pattern.search(ocr_text)
            ver_match = ver_pattern.search(ocr_text)
            date_match = date_pattern.search(ocr_text)
            exported_match = exported_pattern.search(ocr_text)
            doc_type_match = doc_type_pattern.search(ocr_text)
            classification_match = classification_pattern.search(ocr_text)
            approved_match = approved_pattern.search(ocr_text)

            result = {
                "filename": filename,
                "DMS No.": dms_match.group(0) if dms_match else None,
                "Ver": ver_match.group(1) if ver_match else None,
                "Date (Doc hist)": date_match.group(1) if date_match else None,
                "Date (Exported from DMS)": exported_match.group(1) if exported_match else None,
                "Doc type": doc_type_match.group(0) if doc_type_match else None,
                "Classification": classification_match.group(1).capitalize() if classification_match else None,
                "Approved": "Y" if approved_match else "N"
            }

            results.append(result)

    # Convert results to DataFrame
    df_ocr = pd.DataFrame(results)
    return df_ocr

In [None]:
pdf_folder = r"C:\Users\emlu\OneDrive - Netcompany\Desktop\Virtual Tech Vestas\RAG_LLM_POC_v1\data\Vestas_RTP\Documents\Documents"
df_ocr = extract_pdf_data(pdf_folder)

In [53]:
# Check Outout
df_ocr

Unnamed: 0,filename,DMS No.,Ver,Date (Doc hist),Date (Exported from DMS),Doc type,Classification,Approved
0,0078-6200_V07 - 0078-6200_4MW Mk3E Setting and...,0078-6200,7,,2024-12-02,T07,Confidential,Y
1,0079-6646 09. SII-Nacelle (yearly).pdf,0079-6646,10,2024-07-25,2024-10-24,,Confidential,Y
2,0109-7505_V05 - TIS_4MW Mk3 Ready to Protect C...,0109-7505,5,,2024-10-15,T09,Confidential,Y
3,0138-1002_V04 - Battery Cell Impedance Testing...,0138-1002,4,2023-09-05,2024-12-02,T09,Confidential,Y
4,Firmware revision 5.2 in the 230 V UPS.pdf,0088-1273,0,2019-12-12,2024-12-11,,Restricted,Y
5,Installation and operation - Smart-UP RT.PDF,0030-8011,0,,2024-12-11,,Restricted,Y
6,Rotor locking system.pdf,0077-2293,3,2022-12-03,2025-03-03,T09,Restricted,Y
7,"SM,CIM4449, repl. battery and cable.pdf",0109-6919,1,2021-11-16,2024-12-12,,Confidential,Y
8,TIS_4MW RtoP Recovery Flow Chart (Mk3A-B-E).pdf,0109-7505,5,,2024-10-15,,Restricted,Y
9,User guide for the ready-to_x0002_protect (Rto...,0079-9386,0,2019-02-28,2024-10-15,,Restricted,Y


# -------------------------------------------------------------

# Final 

In [None]:
import os
import re
import pdfplumber
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import pandas as pd

# Path to the folder containing PDFs
pdf_folder = r"C:\Users\emlu\OneDrive - Netcompany\Desktop\Virtual Tech Vestas\RAG_LLM_POC_v1\data\Vestas_RTP\Documents\Documents"

# Regex patterns
patterns = {
    "DMS_ID": re.compile(r"\b\d{4}-\d{4}\b"),
    "VERSION": re.compile(r"\bVER[:\s]+(\d{2})\b", re.IGNORECASE),
    "DATE": re.compile(r"Date[:\s]*([\d]{4}-[\d]{2}-[\d]{2})", re.IGNORECASE),
    "EXPORTED_DATE": re.compile(r"Exported from DMS.*?(\d{4}-\d{2}-\d{2})", re.IGNORECASE),
    "DOC_TYPE": re.compile(r"\bT(?:\d{2}|[oO]\d)\b"),
    "CONFIDENTIALITY": re.compile(r"\b(Confidential|Restricted)\b(?![a-zA-Z])", re.IGNORECASE),
    "APPROVED": re.compile(r"\bApproved\b", re.IGNORECASE)
}

def extract_fields(text: str) -> dict:
    """Extract metadata fields from text using regex patterns."""
    doc_type_match = patterns["DOC_TYPE"].search(text)
    doc_type = doc_type_match.group(0).replace("O", "0") if doc_type_match else None
    return {
        "DMS_ID": match(patterns["DMS_ID"], text),
        "VERSION": match(patterns["VERSION"], text, 1),
        "DATE": match(patterns["DATE"], text, 1),
        "EXPORTED_DATE": match(patterns["EXPORTED_DATE"], text, 1),
        "DOC_TYPE": doc_type,
        "CONFIDENTIALITY": match(patterns["CONFIDENTIALITY"], text, 1, capitalize=True),
        "APPROVED": "Y" if patterns["APPROVED"].search(text) else None
    }


def match(pattern, text, group_index=0, capitalize=False):
    """Searches for a regex pattern in the given text and returns the matched group."""
    match = pattern.search(text)
    if match:
        value = match.group(group_index)
        return value.capitalize() if capitalize else value
    return None


def extract_text_pdfplumber(folder: str) -> pd.DataFrame:
    """Extract text metadata using pdfplumber."""
    data = []
    for file in os.listdir(folder):
        if file.lower().endswith(".pdf"):
            with pdfplumber.open(os.path.join(folder, file)) as pdf:
                text = pdf.pages[0].extract_text() or ""
            fields = {"FILENAME": file}  
            fields.update(extract_fields(text))
            data.append(fields)
    return pd.DataFrame(data)


def extract_text_from_images(folder: str) -> pd.DataFrame:
    """Extract text metadata from cropped image regions using OCR."""
    data = []
    for file in os.listdir(folder):
        if file.lower().endswith(".pdf"):
            try:
                doc = fitz.open(os.path.join(folder, file))
                if doc.page_count == 0:
                    continue
                page = doc.load_page(0)
                pix = page.get_pixmap(dpi=400)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                cropped = crop_bottom_right(img)
                text = pytesseract.image_to_string(cropped, lang="eng")
                fields = {"FILENAME": file}  
                fields.update(extract_fields(text)) 
                data.append(fields)
                doc.close()
            except Exception as e:
                print(f"Failed to process {file}: {e}")
    return pd.DataFrame(data)


def crop_bottom_right(image: Image.Image) -> Image.Image:
    """Crop bottom-right 5% x 50% of the image and rotate it 90° clockwise."""
    w, h = image.size
    crop_box = (int(w * 0.95), int(h * 0.5), w, h)
    return image.crop(crop_box).transpose(Image.ROTATE_270)

# Step 1: Extract metadata using pdfplumber
df_pdfplumber = extract_text_pdfplumber(pdf_folder)

# Step 2: Extract metadata using OCR
df_ocr = extract_text_from_images(pdf_folder)

# Step 3: Combine both dataframes, preferring pdfplumber values
df_combined = df_pdfplumber.where(df_pdfplumber.notna(), df_ocr)

# Check result
df_combined

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Unnamed: 0,FILENAME,DMS_ID,VERSION,DATE,EXPORTED_DATE,DOC_TYPE,CONFIDENTIALITY,APPROVED
0,0078-6200_V07 - 0078-6200_4MW Mk3E Setting and...,0078-6200,7,,2024-12-02,T07,Confidential,Y
1,0079-6646 09. SII-Nacelle (yearly).pdf,0079-6646,10,2024-07-25,2024-10-24,T09,Confidential,Y
2,0109-7505_V05 - TIS_4MW Mk3 Ready to Protect C...,0109-7505,5,,2024-10-15,T12,Confidential,Y
3,0138-1002_V04 - Battery Cell Impedance Testing...,0138-1002,4,2023-09-05,2024-12-02,T09,Confidential,Y
4,Firmware revision 5.2 in the 230 V UPS.pdf,0088-1273,0,2019-12-12,2024-12-11,T09,Restricted,Y
5,Installation and operation - Smart-UP RT.PDF,0030-8011,0,,2024-12-11,T09,Confidential,Y
6,Rotor locking system.pdf,0077-2293,3,2022-12-03,2025-03-03,T09,Restricted,Y
7,"SM,CIM4449, repl. battery and cable.pdf",0109-6919,1,2021-11-16,2024-12-12,T09,Confidential,Y
8,TIS_4MW RtoP Recovery Flow Chart (Mk3A-B-E).pdf,0109-7505,5,,2024-10-15,T09,Confidential,Y
9,User guide for the ready-to_x0002_protect (Rto...,0079-9386,0,2019-02-28,2024-10-15,T09,Restricted,Y


# ------------------------------------------------------------

# New

# OCR - EasyOCR

In [3]:
import pandas as pd
import os
import fitz  # PyMuPDF
import easyocr  # OCR - Erstat pytesseract med easyocr
from PIL import Image
import io
import numpy as np  # Importér numpy
import re

# Path to local PDF files
pdf_folder = r"C:\Users\emlu\OneDrive - Netcompany\Desktop\Virtual Tech Vestas\RAG_LLM_POC_v1\data\Vestas_RTP\Documents\Documents"

# Create Regex Patterns
dms_pattern = re.compile(r"\b\d{4}-\d{4}\b")
ver_pattern = re.compile(r"\bVER[:\s]+(\d{2})\b", re.IGNORECASE)
date_pattern = re.compile(r"Date[:\s]*([\d]{4}-[\d]{2}-[\d]{2})", re.IGNORECASE)
exported_pattern = re.compile(r"Exported from DMS.*?(\d{4}-\d{2}-\d{2})", re.IGNORECASE)
doc_type_pattern = re.compile(r"\bT\d{2}\b")
classification_pattern = re.compile(r"\b(Confidential|Restricted)\b(?![a-zA-Z])", re.IGNORECASE)
approved_pattern = re.compile(r"\bApproved\b", re.IGNORECASE)

# Initialize easyocr Reader
reader = easyocr.Reader(['en'])  # This tells easyocr to use the English language model

# Results list
results = []

for filename in os.listdir(pdf_folder):
    if filename.lower().endswith(".pdf"):
        filepath = os.path.join(pdf_folder, filename)
        doc = fitz.open(filepath)

        # Take the first page as an image
        pix = doc[0].get_pixmap(dpi=500)
        img_data = Image.open(io.BytesIO(pix.tobytes("png")))

        # Convert the PIL image to a numpy array
        img_array = np.array(img_data)

        # OCR with easyocr
        ocr_result = reader.readtext(img_array)  # easyocr accepts numpy array

        # Extract the text from easyocr results
        ocr_text = " ".join([text[1] for text in ocr_result])  # Join all the OCR text fragments

        # Metadata extraction
        dms_match = dms_pattern.search(ocr_text)
        ver_match = ver_pattern.search(ocr_text)
        date_match = date_pattern.search(ocr_text)
        exported_match = exported_pattern.search(ocr_text)
        doc_type_match = doc_type_pattern.search(ocr_text)
        classification_match = classification_pattern.search(ocr_text)
        approved_match = approved_pattern.search(ocr_text)

        result = {
            "filename": filename,
            "DMS No.": dms_match.group(0) if dms_match else None,
            "Ver": ver_match.group(1) if ver_match else None,
            "Date (Doc hist)": date_match.group(1) if date_match else None,
            "Date (Exported from DMS)": exported_match.group(1) if exported_match else None,
            "Doc type": doc_type_match.group(0) if doc_type_match else None,
            "Classification": classification_match.group(1).capitalize() if classification_match else None,
            "Approved": "Y" if approved_match else "N" 
        }

        results.append(result)

# Convert to DataFrame
df_ocr = pd.DataFrame(results)


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [None]:
df_ocr

'RESTRICTED 8 Vestas g 1 2 User guide for the ready-to- [ protect (RtoP) system 3 Document no:: 0079-9386 VOO Class: RESTRICTED To9 Date: 2019-02-28 3 2 2 2 8 3 2 ] 1 8 2 1 TM Wind: It means the world to us: 2 VESTAS PROPRIETARY NOTICE: This document contains valuable confidential information of Vestas Wind Systems A/S_ It is protected by copyright law as an unpublished work. Vestas reserves all patent, copyright, trade secret, and other proprietary rights to it. The information in this document may not be used_ reproduced, or disclosed except if and to the extent rights are expressly granted by Vestas in and subject to applicable conditions Vestas disclaims all warranties except as expressly granted by written agreement and not responsible for unauthorized uses, for which it may pursue legal remedies against responsible parties. Type: writing'

# OCR - Crop pic + EasyOCR 

# Hybrid - pdfplumber + Tesseract

In [3]:
import pandas as pd
import os
import fitz  # PyMuPDF
import pytesseract  # OCR
from PIL import Image
import io
import re
import pdfplumber

# Path to local PDF files
pdf_folder = r"C:\Users\emlu\OneDrive - Netcompany\Desktop\Virtual Tech Vestas\RAG_LLM_POC_v1\data\Vestas_RTP\Documents\Documents"

# Create Regex Patterns
dms_pattern = re.compile(r"\b\d{4}-\d{4}\b")
ver_pattern = re.compile(r"\bVER[:\s]+(\d{2})\b", re.IGNORECASE)
date_pattern = re.compile(r"Date[:\s]*([\d]{4}-[\d]{2}-[\d]{2})", re.IGNORECASE)
exported_pattern = re.compile(r"Exported from DMS.*?(\d{4}-\d{2}-\d{2})", re.IGNORECASE)
doc_type_pattern = re.compile(r"\bT\d{2}\b")
classification_pattern = re.compile(r"\b(Confidential|Restricted)\b(?![a-zA-Z])", re.IGNORECASE)
approved_pattern = re.compile(r"\bApproved\b", re.IGNORECASE)

# Results list
results = []

for filename in os.listdir(pdf_folder):
    if filename.lower().endswith(".pdf"):
        filepath = os.path.join(pdf_folder, filename)

        # Step 1: Try extracting "Doc type" using pdfplumber
        doc_type_value = None
        try:
            with pdfplumber.open(filepath) as pdf:
                plumber_text = pdf.pages[0].extract_text() or ""
                doc_type_match_pdf = doc_type_pattern.search(plumber_text)
                if doc_type_match_pdf:
                    doc_type_value = doc_type_match_pdf.group(0)
        except Exception as e:
            print(f"pdfplumber failed on {filename}: {e}")

        # Step 2: Use OCR for everything (including Doc type if not already found)
        doc = fitz.open(filepath)
        pix = doc[0].get_pixmap(dpi=500)
        img_data = Image.open(io.BytesIO(pix.tobytes("png")))
        ocr_text = pytesseract.image_to_string(img_data)

        # Extract metadata
        dms_match = dms_pattern.search(ocr_text)
        ver_match = ver_pattern.search(ocr_text)
        date_match = date_pattern.search(ocr_text)
        exported_match = exported_pattern.search(ocr_text)
        classification_match = classification_pattern.search(ocr_text)
        approved_match = approved_pattern.search(ocr_text)

        # Only fallback to OCR for doc_type if not found by pdfplumber
        if not doc_type_value:
            doc_type_match_ocr = doc_type_pattern.search(ocr_text)
            if doc_type_match_ocr:
                doc_type_value = doc_type_match_ocr.group(0)

        result = {
            "filename": filename,
            "DMS No.": dms_match.group(0) if dms_match else None,
            "Ver": ver_match.group(1) if ver_match else None,
            "Date (Doc hist)": date_match.group(1) if date_match else None,
            "Date (Exported from DMS)": exported_match.group(1) if exported_match else None,
            "Doc type": doc_type_value,
            "Classification": classification_match.group(1).capitalize() if classification_match else None,
            "Approved": "Y" if approved_match else "N"
        }

        results.append(result)

# Convert to DataFrame
df_hybrid = pd.DataFrame(results)


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

KeyboardInterrupt: 

In [19]:
# Check Output
df_hybrid

Unnamed: 0,filename,DMS No.,Ver,Date (Doc hist),Date (Exported from DMS),Doc type,Classification,Approved
0,0078-6200_V07 - 0078-6200_4MW Mk3E Setting and...,0078-6200,7,,2024-12-02,T07,Confidential,Y
1,0079-6646 09. SII-Nacelle (yearly).pdf,0079-6646,10,2024-07-25,2024-10-24,T09,Confidential,Y
2,0109-7505_V05 - TIS_4MW Mk3 Ready to Protect C...,0109-7505,5,,2024-10-15,T12,Confidential,Y
3,0138-1002_V04 - Battery Cell Impedance Testing...,0138-1002,4,2023-09-05,2024-12-02,T09,Confidential,Y
4,Firmware revision 5.2 in the 230 V UPS.pdf,0088-1273,0,2019-12-12,2024-12-11,T09,Restricted,Y
5,Installation and operation - Smart-UP RT.PDF,0030-8011,0,,2024-12-11,,Restricted,Y
6,Rotor locking system.pdf,0077-2293,3,2022-12-03,2025-03-03,T09,Restricted,Y
7,"SM,CIM4449, repl. battery and cable.pdf",0109-6919,1,2021-11-16,2024-12-12,,Confidential,Y
8,TIS_4MW RtoP Recovery Flow Chart (Mk3A-B-E).pdf,0109-7505,5,,2024-10-15,,Restricted,Y
9,User guide for the ready-to_x0002_protect (Rto...,0079-9386,0,2019-02-28,2024-10-15,T09,Restricted,Y


# Hybrid - pdfplumber + EasyOCR

In [3]:
df_hybrid

Unnamed: 0,filename,DMS No.,Ver,Date (Doc hist),Date (Exported from DMS),Doc type,Classification,Approved
0,0078-6200_V07 - 0078-6200_4MW Mk3E Setting and...,0078-6200,7.0,,,,Confidential,N
1,0079-6646 09. SII-Nacelle (yearly).pdf,0079-6646,,2024-07-25,,T09,Confidential,N
2,0109-7505_V05 - TIS_4MW Mk3 Ready to Protect C...,0109-7505,,,,T12,Confidential,N
3,0138-1002_V04 - Battery Cell Impedance Testing...,0138-1002,,2023-09-05,,T09,Confidential,N
4,Firmware revision 5.2 in the 230 V UPS.pdf,0088-1273,,2019-12-12,,T09,Restricted,N
5,Installation and operation - Smart-UP RT.PDF,,,,,,Confidential,N
6,Rotor locking system.pdf,0077-2293,,2022-12-03,,T09,Restricted,N
7,"SM,CIM4449, repl. battery and cable.pdf",0109-6919,,2021-11-16,,,Confidential,N
8,TIS_4MW RtoP Recovery Flow Chart (Mk3A-B-E).pdf,,,,,,Confidential,N
9,User guide for the ready-to_x0002_protect (Rto...,0079-9386,,2019-02-28,,T09,Restricted,N


In [47]:
import re
from typing import Optional, Dict

# Define regex patterns
REGEX_PATTERNS = {
    "dms": re.compile(r"\b\d{4}-\d{4}\b"),
    "ver": re.compile(r"\bVER[:\s]+(\d{2})\b", re.IGNORECASE),
    "date": re.compile(r"Date[:\s]*([\d]{4}-[\d]{2}-[\d]{2})", re.IGNORECASE),
    "exported": re.compile(r"Exported from DMS.*?(\d{4}-\d{2}-\d{2})", re.IGNORECASE),
    "doc_type": re.compile(r"\bT\d{2}\b"),
    "classification": re.compile(r"\b(Confidential|Restricted)\b(?![a-zA-Z])", re.IGNORECASE),
    "approved": re.compile(r"\bApproved\b", re.IGNORECASE),
}

def search_pattern(pattern: re.Pattern, primary_text: str, fallback_text: str) -> Optional[re.Match]:
    """
    Search for a regex pattern in primary text and fallback to secondary if not found.
    """
    match = pattern.search(primary_text)
    if not match:
        match = pattern.search(fallback_text)
    return match

def extract_metadata(pdf_text: str, ocr_text: str) -> Dict[str, Optional[str]]:
    """
    Extracts metadata fields from either PDF text or OCR text.
    """
    return {
        "dms": search_pattern(REGEX_PATTERNS["dms"], pdf_text, ocr_text).group(0) if search_pattern(REGEX_PATTERNS["dms"], pdf_text, ocr_text) else None,
        "ver": search_pattern(REGEX_PATTERNS["ver"], pdf_text, ocr_text).group(1) if search_pattern(REGEX_PATTERNS["ver"], pdf_text, ocr_text) else None,
        "date": search_pattern(REGEX_PATTERNS["date"], pdf_text, ocr_text).group(1) if search_pattern(REGEX_PATTERNS["date"], pdf_text, ocr_text) else None,
        "exported": search_pattern(REGEX_PATTERNS["exported"], pdf_text, ocr_text).group(1) if search_pattern(REGEX_PATTERNS["exported"], pdf_text, ocr_text) else None,
        "doc_type": search_pattern(REGEX_PATTERNS["doc_type"], pdf_text, ocr_text).group(0) if search_pattern(REGEX_PATTERNS["doc_type"], pdf_text, ocr_text) else None,
        "classification": search_pattern(REGEX_PATTERNS["classification"], pdf_text, ocr_text).group(1) if search_pattern(REGEX_PATTERNS["classification"], pdf_text, ocr_text) else None,
        "approved": "Yes" if search_pattern(REGEX_PATTERNS["approved"], pdf_text, ocr_text) else "No"
    }



In [48]:
pdf_folder = r"C:\Users\emlu\OneDrive - Netcompany\Desktop\Virtual Tech Vestas\RAG_LLM_POC_v1\data\Vestas_RTP\Documents\Documents"

df_ocr_fast = extract_metadata(pdf_folder)

TypeError: extract_metadata() missing 1 required positional argument: 'ocr_text'