# Information Extraction technique: Pure Regular Expression method
ROQUETTE notebook

In [None]:
# Install the PyPDF2 package if it is not already available in your environment.
# The leading "!" is notebook shell syntax; from a terminal run `pip install PyPDF2`.
!pip install PyPDF2

Import the necessary packages and define the functions

In [None]:
# import necessary packages
import re # regular expression(re) to formulate patterns.
import PyPDF2 # PyPDF2 to extract information from PDF files to plain text.
import pandas as pd # pandas to store extracted information to data frame data type

# Regular-expression patterns for each attribute to extract, keyed by the
# attribute (output column) name. For every attribute the patterns are tried in
# list order by search_patterns and the first one that matches wins; capture
# group 1 of that pattern is the extracted value.
# When new files are added, check that their layout is covered by one of the
# patterns below and extend the relevant list if it is not.
PATTERNS = {
    "Product name": [
        r"1\.1 Product identifier\s*:\s*Product name\s*:\s*([^\n]+)",
        r"Product name : ([^P].+?)(?=  Product sheet|$)",
        r"Product name\s*:\s*(\w+\s*®\s*\w+)",
        r"Product name\s*:\s*(.+)(?=\s+Product No)"
        ], # Patterns for PRODUCT NAME
    "Chemical": [
        r"Chemical name\s*:\s*(.+)",
        # Fallback: a capitalised word (optionally followed by lower-case
        # words) that is immediately followed by ">=".
        r"\b([A-Z][a-z]+(?:\s[a-z]+)*)\b(?=\s+>=)"
        ], # Patterns for CHEMICAL
    "CAS-No.": [
        # NOTE(review): the "." after "No" is unescaped, so it matches any
        # character, not only a literal dot - confirm this is intended.
        r"CAS\s*-No.\s*:\s*(.+)",
        # Fallback: a bare number shaped like 4-5 digits, 2 digits, 1 digit,
        # separated by hyphens with optional surrounding whitespace.
        r"\b(\d{4,5}\s*-\s*\d{2}\s*-\s*\d{1})\b"
        ], # Patterns for CAS-NO.
    "EC-No.": [
        # NOTE(review): unescaped "." after "No" matches any character.
        r"EC No.\s*:\s*(.+)",
        # Fallback: the literal word "Polymer" anywhere in the text.
        r"(Polymer)"], # Patterns for EC-NO.
    "Market Segment": [
        r"7\.3 Specific end use\(s\) : (.+?)\s+SECTION",
        r"Identified uses\s*:\s*Uses advised against\s*:\s*(.*?)(?=\s*No data available\.)"
        ], # Patterns for MARKET SEGMENT
    "Supplier ID Description": [
        r"Supplier\s*:\s*(.+)"
        ], # Pattern for SUPPLIER
    "UN Code": [
        r"14\.1 UN number : (.+?)\.",
        # NOTE(review): the parentheses around "IMDG, IATA, ADR/RID" are not
        # escaped here, so they form a nested capture group instead of matching
        # literal "(" and ")". Text containing the literal parentheses is only
        # matched by the escaped variant on the next line.
        r"(The product is not covered by international regulation on the transport of dangerous goods (IMDG, IATA, ADR/RID)..)",
        r"(The product is not covered by international regulation on the transport of\s+dangerous goods \(IMDG, IATA, ADR/RID\)\. \.)"
        ], # Patterns for UN CODE
    "Hazard": [
        r"(The product has not been classified as dangerous)",
        # The variants below contain a space inside a word ("dange rous",
        # "bee n"); presumably they cover stray spaces introduced by PDF text
        # extraction - confirm against the source files.
        r"(The product has not been classified as dange rous according to GHS.)",
        r"(The product has not bee n classified as dangerous)"
        ] # Patterns for HAZARD (binary attribute: extract_info_from_text maps a match to 0 and no match to 1)
}

def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of every page of a PDF file as a single string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text extracted from each page, in page order, joined with
        newline characters.
    """
    # The pages are read lazily from the open file, so the text has to be
    # extracted before the ``with`` block closes it.
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # ``or ""`` keeps the join below safe if a page yields no text.
        all_text = [page.extract_text() or "" for page in pdf_reader.pages]
    return "\n".join(all_text)

def search_patterns(patterns: list, text: str) -> str:
    """Return the value captured by the first pattern that matches *text*.

    The patterns are tried in the given order; the search stops at the first
    one that matches anywhere in the text.

    Args:
        patterns: Regular-expression strings to try, in priority order.
        text: The text to search.

    Returns:
        Capture group 1 of the first matching pattern (or the whole match if
        that pattern defines no capture group), with newlines replaced by
        spaces and surrounding whitespace stripped. Returns the string
        "No information found" when no pattern matches.
    """
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            # A pattern written without parentheses has no group 1; use the
            # whole match instead of raising IndexError.
            captured = match.group(1) if match.re.groups else match.group(0)
            return captured.replace("\n", " ").strip()
    return "No information found"

def extract_info_from_text(pdf_text: str) -> dict:
    """Extract every attribute listed in PATTERNS from the text of one PDF.

    Args:
        pdf_text: The plain text of a PDF document.

    Returns:
        A dict with one entry per key of PATTERNS. Each value is the string
        returned by search_patterns for that attribute, except "Hazard",
        which is converted to an integer flag (see below).
    """
    extracted_info = {
        attribute: search_patterns(patterns, pdf_text)
        for attribute, patterns in PATTERNS.items()
    }

    # The Hazard patterns look for a "not classified as dangerous" statement,
    # so a match means 0 and no match means 1.
    hazard_found = extracted_info["Hazard"] != "No information found"
    extracted_info["Hazard"] = 0 if hazard_found else 1
    return extracted_info

def extract_info_from_pdfs(file_paths: list) -> list:
    """Extract the attributes from each PDF file in *file_paths*.

    Args:
        file_paths: Paths of the PDF files to process. A single path given
            as a plain string is treated as a one-element list.

    Returns:
        A list with one dict per file, in input order. Each dict holds the
        attributes returned by extract_info_from_text plus a "File Path"
        entry with the path the values were read from.
    """
    # Iterating over a bare string would try to open each character as a file.
    if isinstance(file_paths, str):
        file_paths = [file_paths]

    data = []
    for file_path in file_paths:
        info = extract_info_from_text(extract_text_from_pdf(file_path))
        info["File Path"] = file_path
        data.append(info)
    return data


Execution Code Block

In [None]:
# PDF files to process. The paths are relative, so they are resolved against
# the notebook's current working directory. Replace them with your actual PDF
# file paths.
file_paths = [
    "Roquette_SDS_BE_MICROCEL® MC-101_000000201661_EN.PDF",
    "Roquette_SDS_CH_XYLISORB® XTAB 240_000000200370_EN.PDF",
    "Roquette_SDS_DE_CLEARGUM® CK 2020_000000200980_EN.PDF",
    "Roquette_SDS_DE_PEARLITOL® BioPharma_000000201568_EN.PDF",
    "Roquette_SDS_FI_NEOSORB® 70-70 B_000000200314_EN.PDF",
    "Roquette_SDS_FR_NEOSORB® P200 SD_000000202155_EN.PDF",
    "Roquette_SDS_FR_PEARLITOL® 50C_000000200967_EN.PDF",
    "Roquette_SDS_FR_SODIUM GLUCONATE CRYSTALLINE F_000000200049_EN.PDF",
    "Roquette_SDS_GB_NEOSORB® 70-02 B_000000200323_EN.PDF",
    "Roquette_SDS_GB_ReadiLYCOAT® D Clear 110.01_000000200803_EN.PDF",
    "Roquette_SDS_GHS_TABULOSE SC® 611_000000202042_EN.PDF",
    "Roquette_SDS_MY_NEOSORB® PF_000000200349_EN.PDF",
    "Roquette_SDS_NL_CLEARAM® CI 20 00_000000200211_EN.PDF",
    "Roquette_SDS_NL_GLUTALYS® HQ - CORN GLUTEN_000000201374_EN.PDF",
    "Roquette_SDS_RU_NUTRALYS® TP-C_000000201939_EN.PDF",
    "Roquette_SDS_SE_POTATO STARCH SUPRA NP BACTERIOLOGICAL_000000200446_EN.PDF",
    "Roquette_SDS_SG_CLEARGUM® CO 03_000000200289_EN.PDF",
    "Roquette_SDS_SG_GLUCIDEX® 39_000000200678_EN.PDF",
    "Roquette_SDS_TW_VITAL WHEAT GLUTEN_000000201059_EN.PDF",
    "Roquette_SDS_TW_MANNITOL 60_000000200337_EN.PDF",
    "Roquette_SDS_TW_复配增稠稳定水分保持剂 - LYCAGEL VS 720 PMX_000000202290_EN.PDF",
    "Roquette_SDS_SG_NUTRIOSE® FM 15S_000000202079_EN.PDF",
    "Roquette_SDS_SK_GLUCOSE SYRUP 6080_000000200523_EN.PDF"
]

# Extract the attributes from every PDF file: one dict per file.
data = extract_info_from_pdfs(file_paths)

# Convert the list of dicts into a DataFrame (one row per file); the bare
# expression on the last line makes the notebook display it.
df = pd.DataFrame(data)
df

## DataFrame to MS Excel

In [None]:
# Write the extracted table to an Excel workbook in the working directory;
# index=False leaves the DataFrame's row index out of the sheet.
df.to_excel("extracted_roquette.xlsx", index = False)

## DataFrame to SQL

In [None]:
from sqlalchemy import create_engine
# "sqlite://" with no file path creates an in-memory SQLite database, so its
# contents are not written to disk. echo=False turns off SQL statement logging.
engine = create_engine("sqlite://", echo = False)

In [None]:
# Store the DataFrame as the table "product roquette". The name contains a
# space, so it has to be quoted as an identifier in any hand-written SQL.
# NOTE: to_sql's default if_exists="fail" raises ValueError if the table
# already exists, e.g. when this cell is run twice against the same engine.
df.to_sql(name = "product roquette", con = engine)

In [None]:
from sqlalchemy import text
# The table name contains a space, so it must be quoted as an identifier.
# Unquoted, SQLite parses "product roquette" as a table named "product" with
# the alias "roquette" and the query fails with "no such table".
with engine.connect() as conn:
    rows = conn.execute(text('SELECT * FROM "product roquette"')).fetchall()
# Keep the fetched rows and show them as the cell output.
rows

In [None]:
# The table name contains a space, so it must be quoted as an identifier.
# Unquoted, SQLite reads "roquette" as an alias for a table named "product".
query = 'SELECT * FROM "product roquette"'

In [None]:
# Run the SQL held in `query` against the engine and load the result set into
# a new DataFrame; the bare expression on the last line displays it.
df_filtered = pd.read_sql_query(query, con = engine)
df_filtered