In [3]:
import fitz  # PyMuPDF
import re
import pandas as pd
from collections import OrderedDict
import json

In [4]:
pdf_path = "2.ArcBiox™ BGF30–HA - ABMcomposite.pdf"
def extract_text_from_pdf(pdf_path):
    # Open the PDF file
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()

    return text

In [7]:
def extract_text_from_pdf(pdf_path):
    # Open the PDF
    document = fitz.open(pdf_path)
    text = ""
    
    # Extract text from each page
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text("text")  
    
    return text

# Path to your PDF
pdf_path = 'SpecialChem/2.ArcBiox™ BGF30–HA - ABMcomposite.pdf'
pdf_text = extract_text_from_pdf(pdf_path)
 
print("Extracted Text:\n", pdf_text)

Extracted Text:
 ArcBiox™ BGF30–HA
 Technical DataSheet | Supplied by ABMcomposite
 
ArcBiox™ BGF30-HA by ABMcomposite is a high temperature resistant, biodegradable, bio-polyester blend reinforced
with long glass fiber. It contains 25% bio-based content. It offers good flowability, high stiffness & strength, excellent
flatness & dimensional stability and high temperature resistance. It can be processed by injection molding. ArcBiox™
BGF30-HA is recommended for automotive, consumer electronics and furniture industry.
 
 
 
ArcBiox™ BGF30–HA Properties
 
 
Product Type
Polyester > Polyester, Bio-based
Product Status
COMMERCIAL
Applications/ Recommended for
Furnitures
Electronics / Computers
Automotive
Injection molding - thermoplastics
Biodegradable
Yes
Bio Based
Yes
Bio Based Content (%)
25
Key Features
Biodegradable
Dimensional stability, Good
Filled, Glass Fiber
Flow, Good
Renewable Resource Content
Stiffness, High
Strength, High
Physical
Value & Unit
Test Condition
Test Method
Densi

In [8]:
def parse_pdf_text(pdf_text):
 
    structured_data = {
        "Product Details": {
            "Product Name": "",
            "Product Type": "",
            "Product Status":"",
            "Applications/ Recommended for":"",
            "Biodegradable":"",
            "Bio Based":"",
            "Bio Based Content (%)":"",
            "Key Features":""
        },
        
        "Product Properties": {
            "Physical": {},
            "Mechanical": {},
            "Thermal": {},
            "Injection Molding": {}
        }
    }
    
    # Extract and categorize the information
    lines = pdf_text.splitlines()

   # Patterns to extract Value, Test Condition, and Test Method
    value_unit_pattern = re.compile(r"([\d\.]+\s*[\w/°]+)")
    test_condition_pattern = re.compile(r"(Test Condition:.*)")
    test_method_pattern = re.compile(r"(ISO \d{3,4})")
    
    for i, line in enumerate(lines):

# For the product properties
        if "Technical DataSheet" in line:
            structured_data["Product Details"]["Product Name"] = lines[i-1].strip()
        
        elif "Product Type" in line:
            structured_data["Product Details"]["Product Type"] = lines[i+1].strip()
            
        elif "Product Status" in line:
            structured_data["Product Details"]["Product Status"] = lines[i+1].strip()    
        
        elif "Applications/ Recommended for" in line:
            applications = []
            i += 1  # Move to the next line
            while i < len(lines) and not any(keyword in lines[i] for keyword in ["Biodegradable", "Bio Based", "Key Features"]):
                applications.append(lines[i].strip())
                i += 1
            structured_data["Product Details"]["Applications/ Recommended for"] = " | ".join(applications)
            continue   
            
        elif "Biodegradable" in line:
            structured_data["Product Details"]["Biodegradable"] = lines[i+1].strip()
        
        elif "Bio Based" in line:
            structured_data["Product Details"]["Bio Based"] = lines[i+1].strip()
        
        elif "Key Features" in line:
            structured_data["Product Details"]["Key Features"] = lines[i+1].strip()
            
            
            #--------------------------------------------------------
            #Product Properties
        
# For Product Properties
        elif "Density" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            test_method = test_method_pattern.search(lines[i+2])
            structured_data["Product Properties"]["Physical"]["Density"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "",  
                "Test Method": test_method.group() if test_method else ""
            }
        
        elif "Tensile Strength at Break" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            test_method = test_method_pattern.search(lines[i+2])
            structured_data["Product Properties"]["Mechanical"]["Tensile Strength at Break"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "", 
                "Test Method": test_method.group() if test_method else ""
            }
            
        elif "Flexural Strength" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            test_method = test_method_pattern.search(lines[i+2])
            structured_data["Product Properties"]["Mechanical"]["Flexural Strength"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "",  
                "Test Method": test_method.group() if test_method else ""
            }
        
        elif "Impact Strength, Izod" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            test_method = test_method_pattern.search(lines[i+2])
            structured_data["Product Properties"]["Mechanical"]["Impact Strength, Izod"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "",  
                "Test Method": test_method.group() if test_method else ""
            }
        
        elif "Impact Strength, NotchedIzod" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            test_method = test_method_pattern.search(lines[i+2])
            structured_data["Product Properties"]["Mechanical"]["Impact Strength, NotchedIzod"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "",  
                "Test Method": test_method.group() if test_method else ""
            }
            
        elif "Flexural Modulus" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            test_method = test_method_pattern.search(lines[i+2])
            structured_data["Product Properties"]["Mechanical"]["Flexural Modulus"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "",  
                "Test Method": test_method.group() if test_method else ""
            }
            
        elif "Vicat Softening Temperature" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            test_condition = test_condition_pattern.search(lines[i+1])
            test_method = test_method_pattern.search(lines[i+2])
            structured_data["Product Properties"]["Thermal"]["Vicat Softening Temperature"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": test_condition.group() if test_condition else "",
                "Test Method": test_method.group() if test_method else ""
            }
        
        elif "Heat Deflection Temperature (HDT)" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            test_condition = test_condition_pattern.search(lines[i+1])
            test_method = test_method_pattern.search(lines[i+2])
            structured_data["Product Properties"]["Thermal"]["Heat Deflection Temperature (HDT)"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": test_condition.group() if test_condition else "",
                "Test Method": test_method.group() if test_method else ""
            }
            
        # For Injection Molding Properties
        elif "Melt Temperature" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            structured_data["Product Properties"]["Injection Molding"]["Melt Temperature"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "",  
                "Test Method": "" 
            }
        
        elif "Nozzle Temperature" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            structured_data["Product Properties"]["Injection Molding"]["Nozzle Temperature"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "", 
                "Test Method": ""   
            }
        
        elif "Feed Temperature" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            structured_data["Product Properties"]["Injection Molding"]["Feed Temperature"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "",   
                "Test Method": ""  
            }
            
        elif "Compression Section" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            structured_data["Product Properties"]["Injection Molding"]["Compression Section"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "",  
                "Test Method": ""  
            }
        
        elif "Metering Section" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            structured_data["Product Properties"]["Injection Molding"]["Metering Section"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "",   
                "Test Method": ""   
            }
        
        elif "Holding Pressure" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            structured_data["Product Properties"]["Injection Molding"]["Holding Pressure"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "",  
                "Test Method": ""  
            }
            
        elif "Mold Temperature" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            structured_data["Product Properties"]["Injection Molding"]["Mold Temperature"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "",   
                "Test Method": ""  
            }
            
        elif "Feed Throat Temperature" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            structured_data["Product Properties"]["Injection Molding"]["Feed Throat Temperature"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "",  
                "Test Method": ""  
            }
        
        elif "Drying Temperature,Dehumidifying Dryer" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            structured_data["Product Properties"]["Injection Molding"]["Drying Temperature,Dehumidifying Dryer"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "", 
                "Test Method": ""  
            }
        
        elif "Screw Speed" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            structured_data["Product Properties"]["Injection Molding"]["Screw Speed"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "",  
                "Test Method": ""  
            }
        
        elif "Back Pressure" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            structured_data["Product Properties"]["Injection Molding"]["Back Pressure"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "", 
                "Test Method": ""   
            }
        
        elif "Drying Time,Dehumidifying Dryer" in line:
            value_unit = value_unit_pattern.search(lines[i+1])
            structured_data["Product Properties"]["Injection Molding"]["Drying Time,Dehumidifying Dryer"] = {
                "Value & Unit": value_unit.group() if value_unit else "",
                "Test Condition": "",  
                "Test Method": ""  
            }
        
    return structured_data

# Example usage
pdf_text = extract_text_from_pdf(pdf_path)
structured_data = parse_pdf_text(pdf_text)
 
import pprint
pprint.pprint(structured_data)

{'Product Details': {'Applications/ Recommended for': 'Furnitures | '
                                                      'Electronics / Computers '
                                                      '| Automotive | '
                                                      'Injection molding - '
                                                      'thermoplastics',
                     'Bio Based': '25',
                     'Bio Based Content (%)': '',
                     'Biodegradable': 'Dimensional stability, Good',
                     'Key Features': 'Biodegradable',
                     'Product Name': 'ArcBiox™ BGF30–HA',
                     'Product Status': 'COMMERCIAL',
                     'Product Type': 'Polyester > Polyester, Bio-based'},
 'Product Properties': {'Injection Molding': {'Back Pressure': {'Test Condition': '',
                                                                'Test Method': '',
                                                             