In [1]:
import os

import pandas as pd
import pdfplumber

In [2]:
def extract_table_lines(lines):
    """
    Pull the product rows out of the text lines of a quote page.

    The table is taken to span from the first line containing "Profil nr"
    up to (but not including) the first line that, once stripped, starts with
    "Verktygskostnad:". Inside that span, blank lines and lines beginning with
    a known header prefix are ignored; every remaining line with at least
    seven whitespace-separated tokens becomes one row.

    Note that the header filter is prefix-based, so a data row whose profile
    name starts with one of the prefixes (e.g. "Profil", "Kap") is dropped too.

    Args:
        lines (list of str): Lines of text extracted from a PDF page.

    Returns:
        list of list of str: One 7-item list per row:
        [profile name, weight, length, cutting cost, annual volume, price, alloy].
        Empty when either table marker is missing.
    """
    header_prefixes = ("Profil", "Kund ref.", "ref.", "Vikt", "Pris/st", "Kap", "SEK")
    end_marker = "Verktygskostnad:"

    # The end marker is located first; the start marker only counts when it
    # appears on or before that line.
    table_end = next(
        (pos for pos, text in enumerate(lines) if text.strip().startswith(end_marker)),
        None,
    )
    if table_end is None:
        return []
    table_start = next(
        (pos for pos, text in enumerate(lines[: table_end + 1]) if "Profil nr" in text),
        None,
    )
    if table_start is None:
        return []

    parsed_rows = []
    for raw in lines[table_start:table_end]:
        stripped = raw.strip()
        if not stripped or stripped.startswith(header_prefixes):
            continue
        tokens = stripped.split()
        if len(tokens) < 7:
            continue
        # The last six tokens are the fixed columns (Vikt, Längd, Kap+truml,
        # Årsvolym, Pris, Legering); everything before them is the profile name.
        name_tokens, value_tokens = tokens[:-6], tokens[-6:]
        parsed_rows.append([" ".join(name_tokens), *value_tokens])
    return parsed_rows


In [3]:

def extract_general_customer_info(lines):
    """
    Collect the general quote header fields from PDF text lines.

    Every line is inspected (not only those above the product table). A line
    contributes a field when, after stripping, it starts with one of the
    labels "Datum", "Vår referens", "Er referens" or "Kund" immediately
    followed by a colon. The value is the stripped text after that first
    colon. If a label occurs more than once, the last occurrence wins.

    Args:
        lines (list of str): Lines of text extracted from a PDF page.

    Returns:
        dict: Mapping of the labels found ('Datum', 'Vår referens',
        'Er referens', 'Kund') to their string values.
    """
    known_labels = {"Datum", "Vår referens", "Er referens", "Kund"}
    info = {}
    for raw in lines:
        # partition splits on the first colon only, so colons inside the
        # value are preserved.
        label, colon, value = raw.strip().partition(":")
        if colon and label in known_labels:
            info[label] = value.strip()
    return info


In [4]:

def extract_product_extra_details(lines):
    """
    Collect the additional quote terms from PDF text lines.

    Every line is inspected. A line contributes a field when, after
    stripping, it starts with one of the recognised labels immediately
    followed by a colon; the value is the stripped text after that first
    colon. Most labels are stored under their own name; "Legering" is stored
    as "Legering_all" and "Råvara" as "Råvara (euro/kg)". If a label occurs
    more than once, the last occurrence wins.

    Args:
        lines (list of str): Lines of text extracted from a PDF page.

    Returns:
        dict: Mapping of output keys to the string values found.
    """
    # Label as printed in the PDF -> key used in the returned dictionary.
    label_to_key = {
        "Verktygskostnad": "Verktygskostnad",
        "Legering": "Legering_all",
        "Toleranser": "Toleranser",
        "Ytbehandling": "Ytbehandling",
        "Lev. längd": "Lev. längd",
        "Lev. villkor": "Lev. villkor",
        "Lev. tid": "Lev. tid",
        "NOT": "NOT",
        "Betalningsvillkor": "Betalningsvillkor",
        "Giltighet": "Giltighet",
        "Allmänna villkor": "Allmänna villkor",
        "Råvara": "Råvara (euro/kg)",
    }
    details = {}
    for raw in lines:
        label, colon, value = raw.strip().partition(":")
        if not colon:
            continue
        key = label_to_key.get(label)
        if key is not None:
            details[key] = value.strip()
    return details


In [5]:

def combine_rows(data_lines, general_info, extra_details):
    """
    Attach the quote-level metadata to every product row.

    Each row list is mapped positionally onto the seven product columns.
    Missing trailing positions become empty strings, except the alloy column,
    which falls back to the quote-wide "Legering_all" value (or "" when that
    is absent). All metadata entries other than "Legering_all" are then
    appended to the row; on a key clash `extra_details` overrides
    `general_info`.

    Args:
        data_lines (list of list of str): Parsed product table rows.
        general_info (dict): General customer information.
        extra_details (dict): Additional product details.

    Returns:
        list of dict: One dictionary per input row with product and metadata fields.
    """
    column_names = (
        "Profil_namn",
        "Vikt (kg/m)",
        "Längd (m)",
        "Kap + truml (Pris/st)",
        "ca antal (Årsvolym st)",
        "Pris (kr/st) SEK",
        "Legering",
    )
    shared = {**general_info, **extra_details}
    alloy_fallback = shared.get("Legering_all", "")
    # "Legering_all" only serves as the per-row alloy fallback; it is never
    # emitted as a column of its own.
    appended = {name: value for name, value in shared.items() if name != "Legering_all"}

    combined = []
    for cells in data_lines:
        record = {}
        for position, column in enumerate(column_names):
            if position < len(cells):
                record[column] = cells[position]
            elif column == "Legering":
                record[column] = alloy_fallback
            else:
                record[column] = ""
        record.update(appended)
        combined.append(record)
    return combined


In [6]:

def parse_quote_pdf(pdf_path):
    """
    Parses a quote PDF and returns structured product and metadata rows.

    Only the first page of the PDF is read. Its text is split into lines,
    the product table and the two metadata groups are extracted from those
    lines, and the results are merged into one dictionary per product row.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list of dict: List of rows with product and metadata fields. Empty
        when the first page has no extractable text or no product table is found.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[0]
        # extract_text() may yield None for a page without a text layer
        # (e.g. a scanned image) in some pdfplumber versions; treat that as
        # empty text instead of failing on .splitlines().
        text = page.extract_text() or ""
    lines = text.splitlines()
    data_lines = extract_table_lines(lines)
    general_info = extract_general_customer_info(lines)
    extra_details = extract_product_extra_details(lines)
    return combine_rows(data_lines, general_info, extra_details)

In [8]:
def extract_all_quotes_to_csv(
    pdf_folder,
    file_prefix="PdfNAP",
    file_count=50,
    csv_path="csv_files/all_quotes_extracted.csv"
):
    """
    Parses a numbered series of quote PDFs, saves the result to a CSV, and returns the DataFrame.

    Files are expected to be named "<file_prefix> (<n>).pdf" for n from 1 to
    `file_count` inside `pdf_folder`. Each file is parsed with
    `parse_quote_pdf`; an error raised for any file (for example a missing
    file) propagates to the caller. The directory that will hold `csv_path`
    is created if it does not exist yet.

    Args:
        pdf_folder (str): Path to the folder containing PDF files.
        file_prefix (str): Prefix of the PDF file names.
        file_count (int): Number of PDF files to process.
        csv_path (str): Path to save the resulting CSV file.

    Returns:
        pandas.DataFrame: DataFrame containing all extracted quote data.
    """
    all_quotes = []
    for file_num in range(1, file_count + 1):
        file_name = os.path.join(pdf_folder, f"{file_prefix} ({file_num}).pdf")
        all_quotes.extend(parse_quote_pdf(file_name))
    df_all_quotes = pd.DataFrame(all_quotes)

    # to_csv does not create missing directories; without this, all parsing
    # work is lost on a fresh checkout where the output folder is absent.
    output_dir = os.path.dirname(csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df_all_quotes.to_csv(csv_path, index=False)
    print(f"✅ Saved {len(df_all_quotes)} rows to {csv_path}")
    return df_all_quotes

# Usage example: parse the numbered quote PDFs in `pdf_folder` with the default
# prefix/count, write the combined CSV, and preview the first ten rows.
# NOTE(review): this is an absolute, machine-specific path; it has to be edited
# before the notebook can run elsewhere — consider a configurable data directory.
pdf_folder = "/Users/mageshbabu/Desktop/Projects/ML_Powered_Price_Engine/data_preparation/sample_data_files"
df_all_quotes = extract_all_quotes_to_csv(pdf_folder)
df_all_quotes.head(10)

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

✅ Saved 198 rows to csv_files/all_quotes_extracted.csv


Unnamed: 0,Profil_namn,Vikt (kg/m),Längd (m),Kap + truml (Pris/st),ca antal (Årsvolym st),Pris (kr/st) SEK,Legering,Datum,Vår referens,Er referens,...,Toleranser,Ytbehandling,Lev. längd,Lev. villkor,Lev. tid,NOT,Betalningsvillkor,Giltighet,Allmänna villkor,Råvara (euro/kg)
0,Karmlist,1342,238,78,40000,292,Rå,2025-02-25,Erik Svensson,Maria Lindgren,...,EN 755-9,EN-AW-6063-T5,Längder enligt ovan,Ex Works Halmstad,första 8-10 veckor från order därefter 5-6 veckor,"Minst 15000 bitar kapade, ok att blanda längde...","30 dagar netto. Dröjsmålsränta 7,5%",Offererade priser gäller fast för leveranser t...,NAPFV2017,"3,4 Euro / kg"
1,Karmlist,1342,258,78,40000,305,Rå,2025-02-25,Erik Svensson,Maria Lindgren,...,EN 755-9,EN-AW-6063-T5,Längder enligt ovan,Ex Works Halmstad,första 8-10 veckor från order därefter 5-6 veckor,"Minst 15000 bitar kapade, ok att blanda längde...","30 dagar netto. Dröjsmålsränta 7,5%",Offererade priser gäller fast för leveranser t...,NAPFV2017,"3,4 Euro / kg"
2,Karmlist,1342,238,78,85000,288,Rå,2025-02-25,Erik Svensson,Maria Lindgren,...,EN 755-9,EN-AW-6063-T5,Längder enligt ovan,Ex Works Halmstad,första 8-10 veckor från order därefter 5-6 veckor,"Minst 15000 bitar kapade, ok att blanda längde...","30 dagar netto. Dröjsmålsränta 7,5%",Offererade priser gäller fast för leveranser t...,NAPFV2017,"3,4 Euro / kg"
3,Karmlist,1342,258,78,85000,301,Rå,2025-02-25,Erik Svensson,Maria Lindgren,...,EN 755-9,EN-AW-6063-T5,Längder enligt ovan,Ex Works Halmstad,första 8-10 veckor från order därefter 5-6 veckor,"Minst 15000 bitar kapade, ok att blanda längde...","30 dagar netto. Dröjsmålsränta 7,5%",Offererade priser gäller fast för leveranser t...,NAPFV2017,"3,4 Euro / kg"
4,Bottenlist,1156,215,72,65000,265,Rå,2025-03-01,Gustav Bergström,Anders Johansson,...,EN 755-9,EN-AW-6063-T5,Längder enligt ovan,Ex Works Markaryd,första 7-9 veckor från order därefter 4-5 veckor,"Minst 18000 bitar kapade, ok att blanda längde...","30 dagar netto. Dröjsmålsränta 7,5%",Offererade priser gäller fast för leveranser t...,NAPFV2017,"3,2 Euro / kg"
5,Bottenlist,1156,235,72,65000,272,Rå,2025-03-01,Gustav Bergström,Anders Johansson,...,EN 755-9,EN-AW-6063-T5,Längder enligt ovan,Ex Works Markaryd,första 7-9 veckor från order därefter 4-5 veckor,"Minst 18000 bitar kapade, ok att blanda längde...","30 dagar netto. Dröjsmålsränta 7,5%",Offererade priser gäller fast för leveranser t...,NAPFV2017,"3,2 Euro / kg"
6,Bottenlist,1156,215,72,120000,261,Rå,2025-03-01,Gustav Bergström,Anders Johansson,...,EN 755-9,EN-AW-6063-T5,Längder enligt ovan,Ex Works Markaryd,första 7-9 veckor från order därefter 4-5 veckor,"Minst 18000 bitar kapade, ok att blanda längde...","30 dagar netto. Dröjsmålsränta 7,5%",Offererade priser gäller fast för leveranser t...,NAPFV2017,"3,2 Euro / kg"
7,Bottenlist,1156,235,72,120000,268,Rå,2025-03-01,Gustav Bergström,Anders Johansson,...,EN 755-9,EN-AW-6063-T5,Längder enligt ovan,Ex Works Markaryd,första 7-9 veckor från order därefter 4-5 veckor,"Minst 18000 bitar kapade, ok att blanda längde...","30 dagar netto. Dröjsmålsränta 7,5%",Offererade priser gäller fast för leveranser t...,NAPFV2017,"3,2 Euro / kg"
8,Täckprofil,1425,262,82,45000,315,Rå,2025-03-03,Sofia Ekholm,Johan Karlsson,...,EN 755-9,EN-AW-6063-T5,Längder enligt ovan,Ex Works Halmstad,första 8-10 veckor från order därefter 5-6 veckor,"Minst 15000 bitar kapade, ok att blanda längde...","30 dagar netto. Dröjsmålsränta 7,5%",Offererade priser gäller fast för leveranser t...,NAPFV2017,"3,5 Euro / kg"
9,Täckprofil,1425,285,82,45000,328,Rå,2025-03-03,Sofia Ekholm,Johan Karlsson,...,EN 755-9,EN-AW-6063-T5,Längder enligt ovan,Ex Works Halmstad,första 8-10 veckor från order därefter 5-6 veckor,"Minst 15000 bitar kapade, ok att blanda längde...","30 dagar netto. Dröjsmålsränta 7,5%",Offererade priser gäller fast för leveranser t...,NAPFV2017,"3,5 Euro / kg"
