In [1]:
import os

import pandas as pd
import pdfplumber

In [2]:
def extract_table_lines(lines):
    """Collect the data rows of the profile table from a quote's text lines.

    The table window opens at the first line that contains "Profil nr" (or
    that starts with "Profil" and contains "kg/m") and closes just before the
    first line starting with "Verktygskostnad:". Inside that window, data
    begins after the last header-like line, i.e. a line that contains
    "Årsvolym", equals "SEK", or ends with "g:".

    Args:
        lines: Text lines of the page, in reading order.

    Returns:
        A list of 7-element lists
        ``[profile name, Vikt, Längd, Kap+truml, Årsvolym, Pris, Legering]``
        holding the raw string tokens. The list is empty when either table
        boundary cannot be found. Lines with fewer than seven
        whitespace-separated tokens are ignored.
    """
    header_at = None
    footer_at = None
    for pos, raw in enumerate(lines):
        trimmed = raw.strip()
        if header_at is None:
            looks_like_header = "Profil nr" in raw or (
                trimmed.startswith("Profil") and "kg/m" in raw
            )
            if looks_like_header:
                header_at = pos
        if trimmed.startswith("Verktygskostnad:"):
            footer_at = pos
            break

    if header_at is None or footer_at is None:
        return []

    # The header may span several lines: keep moving the start of the data
    # forward so it ends up right after the LAST header-like line.
    first_data_at = header_at
    for pos in range(header_at, footer_at):
        trimmed = lines[pos].strip()
        if "Årsvolym" in lines[pos] or trimmed == "SEK" or trimmed.endswith("g:"):
            first_data_at = pos + 1

    skipped_prefixes = ("Profil", "Kund ref.", "Vikt", "Pris/st", "Kap", "SEK")
    table_rows = []
    for raw in lines[first_data_at:footer_at]:
        trimmed = raw.strip()
        if not trimmed or trimmed.startswith(skipped_prefixes):
            continue
        tokens = trimmed.split()
        if len(tokens) < 7:
            continue
        # The last six tokens are the fixed columns; whatever precedes them
        # (possibly several words) is the profile name.
        table_rows.append([" ".join(tokens[:-6]), *tokens[-6:]])
    return table_rows


In [3]:

def extract_general_customer_info(lines):
    """Read the quote's header fields from "Label: value" lines.

    Recognised labels are "Datum", "Vår referens", "Er referens" and "Kund".
    A line matches when, after stripping surrounding whitespace, it starts
    with the label followed by a colon.

    Args:
        lines: Text lines to scan.

    Returns:
        A dict mapping each label that was found to the stripped text after
        the first colon on its line. If a label occurs more than once, the
        last occurrence wins.
    """
    labels = ("Datum", "Vår referens", "Er referens", "Kund")
    info = {}
    for raw in lines:
        text = raw.strip()
        for label in labels:
            if text.startswith(label + ":"):
                # Cut at the first colon only, so the value may contain colons.
                info[label] = text.partition(":")[2].strip()
                break
    return info


In [4]:

def extract_product_extra_details(lines):
    """Read the quote's terms and conditions from "Label: value" lines.

    Each stripped line is compared against a fixed list of label prefixes;
    the first prefix that matches decides the output key. Two labels are
    stored under a different key than their label text: "Legering:" becomes
    "Legering_all" and "Råvara:" becomes "Råvara (euro/kg)".

    Args:
        lines: Text lines to scan.

    Returns:
        A dict mapping the output key of each label that was found to the
        stripped text after the first colon on its line. If a label occurs
        more than once, the last occurrence wins.
    """
    # (line prefix, output key) pairs, checked in this order.
    prefix_to_key = (
        ("Verktygskostnad:", "Verktygskostnad"),
        ("Legering:", "Legering_all"),
        ("Toleranser:", "Toleranser"),
        ("Ytbehandling:", "Ytbehandling"),
        ("Lev. längd:", "Lev. längd"),
        ("Lev. villkor:", "Lev. villkor"),
        ("Lev. tid:", "Lev. tid"),
        ("NOT:", "NOT"),
        ("Betalningsvillkor:", "Betalningsvillkor"),
        ("Giltighet:", "Giltighet"),
        ("Allmänna villkor:", "Allmänna villkor"),
        ("Råvara:", "Råvara (euro/kg)"),
    )
    details = {}
    for raw in lines:
        text = raw.strip()
        for prefix, key in prefix_to_key:
            if text.startswith(prefix):
                # Cut at the first colon only, so the value may contain colons.
                details[key] = text.partition(":")[2].strip()
                break
    return details


In [5]:

def combine_rows(data_lines, general_info, extra_details):
    """Turn table rows into dicts and attach the quote-level metadata to each.

    Args:
        data_lines: Iterable of sequences ordered as
            ``[profile, weight, length, cut/tumble, volume, price, alloy]``.
            Shorter sequences are allowed; missing trailing values become "".
        general_info: Dict of header fields.
        extra_details: Dict of terms; on a key clash it overrides
            ``general_info``.

    Returns:
        A list with one dict per input row. Each dict holds the seven table
        columns followed by every metadata entry except "Legering_all".
        "Legering_all" is only used as the fallback for a row that has no
        alloy value of its own.
    """
    merged = dict(general_info)
    merged.update(extra_details)
    # Metadata copied onto every row; the quote-wide alloy is a fallback only.
    shared = {key: value for key, value in merged.items() if key != "Legering_all"}
    fallback_alloy = merged.get("Legering_all", "")

    columns = (
        "Profil_namn",
        "Vikt (kg/m)",
        "Längd (m)",
        "Kap + truml (Pris/st)",
        "ca antal (Årsvolym st)",
        "Pris (kr/st) SEK",
        "Legering",
    )
    result = []
    for fields in data_lines:
        available = len(fields)
        values = [fields[i] if i < available else "" for i in range(6)]
        values.append(fields[6] if available > 6 else fallback_alloy)
        record = dict(zip(columns, values))
        record.update(shared)
        result.append(record)
    return result


In [6]:

def parse_quote_pdf(pdf_path):
    """Parse the first page of a quote PDF into one row dict per table line.

    Only the first page is read; any further pages are ignored.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        The list of row dicts produced by ``combine_rows``. The list is empty
        when the PDF has no pages, the first page yields no text, or no table
        rows are found in it.
    """
    with pdfplumber.open(pdf_path) as pdf:
        if not pdf.pages:
            return []
        # A page without a text layer (e.g. a scan) may give None or "",
        # depending on the pdfplumber version; normalise to "" so the
        # splitlines() call below cannot fail.
        text = pdf.pages[0].extract_text() or ""
    lines = text.splitlines()
    data_lines = extract_table_lines(lines)
    general_info = extract_general_customer_info(lines)
    extra_details = extract_product_extra_details(lines)
    return combine_rows(data_lines, general_info, extra_details)

In [None]:
def extract_all_quotes_to_csv(
    pdf_folder,
    file_prefix="PdfNAP",
    file_count=50,
    csv_path="all_quotes_extracted.csv"
):
    """Parse a numbered series of quote PDFs and save all rows to one CSV.

    Files are expected to be named ``"<file_prefix> (<n>).pdf"`` for
    ``n = 1 .. file_count`` inside ``pdf_folder``. A file that does not exist
    is skipped and reported, so that a gap in the numbering does not abort
    the batch and discard the rows already parsed.

    Args:
        pdf_folder: Directory that holds the PDFs.
        file_prefix: Common file-name prefix placed before the number.
        file_count: Highest file number to look for (inclusive).
        csv_path: Where to write the combined CSV.

    Returns:
        A DataFrame with one row per parsed table line across all files that
        were found. It is empty if no file was found or no rows were parsed.
    """
    all_quotes = []
    missing_files = []
    for file_num in range(1, file_count + 1):
        file_name = os.path.join(pdf_folder, f"{file_prefix} ({file_num}).pdf")
        if not os.path.isfile(file_name):
            missing_files.append(file_name)
            continue
        all_quotes.extend(parse_quote_pdf(file_name))
    if missing_files:
        # Make the gap visible instead of silently producing a shorter CSV.
        print(f"Warning: skipped {len(missing_files)} missing file(s): {missing_files}")
    df_all_quotes = pd.DataFrame(all_quotes)
    df_all_quotes.to_csv(csv_path, index=False)
    print(f"✅ Saved {len(df_all_quotes)} rows to {csv_path}")
    return df_all_quotes

# Usage example: parse the sample quote PDFs and preview the combined table.
# NOTE(review): this is an absolute path on one specific machine; point it at
# your own folder of quote PDFs before running.
pdf_folder = "/Users/mageshbabu/Desktop/Projects/project_odens/data_preparation/sample_data_files"
# With the default arguments this looks for "PdfNAP (1).pdf" through
# "PdfNAP (50).pdf" in pdf_folder and writes all_quotes_extracted.csv
# relative to the current working directory.
df_all_quotes = extract_all_quotes_to_csv(pdf_folder)
# Last expression of the cell: display the first 10 rows.
df_all_quotes.head(10)