In [None]:
import pdfplumber
import re

# Output key for each "Label:" line prefix that is recognised as quote metadata.
# Every prefix ends with its only colon, so the value is whatever follows it.
_METADATA_KEYS_BY_PREFIX = {
    "Datum:": "Datum",
    "Vår referens:": "Vår referens",
    "Er referens:": "Er referens",
    "Kund:": "Kund",
    "Verktygskostnad:": "Verktygskostnad",
    # Document-level alloy: collected, but never copied into the output rows.
    "Legering:": "Legering_all",
    "Toleranser:": "Toleranser",
    "Ytbehandling:": "Ytbehandling",
    "Lev. längd:": "Lev. längd",
    "Lev. villkor:": "Lev. villkor",
    "Lev. tid:": "Lev. tid",
    "NOT:": "NOT",
    "Betalningsvillkor:": "Betalningsvillkor",
    "Giltighet:": "Giltighet",
    "Allmänna villkor:": "Allmänna villkor",
    "Råvara:": "Råvara (euro/kg)",
}

# Lines inside the table region that start with one of these are header
# fragments, not data rows.
_TABLE_HEADER_PREFIXES = ("Profil", "Kund ref.", "Vikt", "Pris/st", "Kap", "SEK")

# Output keys, in column order, for one table row: the profile name followed by
# the six trailing whitespace-separated tokens of the line.
_ROW_KEYS = (
    "Profil_namn",
    "Vikt (kg/m)",
    "Längd (m)",
    "Kap + truml (Pris/st)",
    "ca antal (Årsvolym st)",
    "Pris (kr/st) SEK",
    "Legering",
)


def _find_table_bounds(lines):
    """Return ``(start, end)`` line indices of the quote table, or ``None``.

    ``start`` is the first header line (contains "Profil nr", or starts with
    "Profil" and contains "kg/m"); ``end`` is the first line starting with
    "Verktygskostnad:". ``None`` means one of the two markers was not found
    before the scan stopped.
    """
    start = None
    for i, line in enumerate(lines):
        stripped = line.strip()
        if start is None and (
            "Profil nr" in line or (stripped.startswith("Profil") and "kg/m" in line)
        ):
            start = i
        if stripped.startswith("Verktygskostnad:"):
            # The scan stops at the first tooling-cost line even if no header
            # was seen; in that case there is no table to report.
            return None if start is None else (start, i)
    return None


def _first_data_index(lines, start, end):
    """Return the index just past the last header-looking line in ``[start, end)``."""
    data_start = start
    for j in range(start, end):
        stripped = lines[j].strip()
        # The last header line often contains "Årsvolym", is exactly "SEK",
        # or ends with "g:".
        if "Årsvolym" in lines[j] or stripped == "SEK" or stripped.endswith("g:"):
            data_start = j + 1
    return data_start


def _extract_table_rows(lines, data_start, end):
    """Split each data line in ``[data_start, end)`` into 7 column values.

    Blank lines, header fragments and lines with fewer than 7 tokens are
    skipped. The last six tokens are taken as the value columns; everything
    before them is joined back into the (possibly multi-word) profile name.
    """
    table_rows = []
    for k in range(data_start, end):
        stripped = lines[k].strip()
        if not stripped or stripped.startswith(_TABLE_HEADER_PREFIXES):
            continue
        tokens = stripped.split()
        if len(tokens) < 7:
            continue
        table_rows.append([" ".join(tokens[:-6]), *tokens[-6:]])
    return table_rows


def _extract_quote_metadata(lines):
    """Collect "Label: value" metadata from all lines.

    Keys appear in the order the labels are first seen in the text; a label
    that occurs more than once keeps its last value.
    """
    metadata = {}
    for line in lines:
        stripped = line.strip()
        for prefix, key in _METADATA_KEYS_BY_PREFIX.items():
            if stripped.startswith(prefix):
                metadata[key] = stripped[len(prefix):].strip()
                break
    return metadata


def parse_quote_pdf(pdf_path):
    """Parse a quote PDF and return a list of dicts, one per quote line.

    Only the first page of the PDF is read. Each returned dict holds the table
    columns of one row (see ``_ROW_KEYS``) followed by the document-level
    metadata fields found anywhere in the page text. All values are the raw
    strings from the PDF; no numeric conversion is done.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list of row dicts. The list is empty when the PDF has no pages, the
        first page has no extractable text, or the table markers
        ("Profil ..." header and "Verktygskostnad:" line) are not found.

    Raises:
        Whatever ``pdfplumber.open`` raises for a missing or unreadable file.
    """
    with pdfplumber.open(pdf_path) as pdf:
        if not pdf.pages:
            return []
        # extract_text() may give None for a page without a text layer
        # (e.g. a scanned image); treat that like an empty page.
        text = pdf.pages[0].extract_text() or ""
    lines = text.splitlines()

    bounds = _find_table_bounds(lines)
    if bounds is None:
        return []  # return empty if parsing failed
    table_start, table_end = bounds

    data_start = _first_data_index(lines, table_start, table_end)
    table_rows = _extract_table_rows(lines, data_start, table_end)

    # NOTE(review): "Legering_all" looks intended as a fallback for rows that
    # lack an alloy column, but such rows cannot be told apart from rows with a
    # longer profile name, so every kept row already carries its own alloy
    # token. The document-level value is therefore left out of the output.
    shared_fields = {
        key: value
        for key, value in _extract_quote_metadata(lines).items()
        if key != "Legering_all"
    }

    return [{**dict(zip(_ROW_KEYS, values)), **shared_fields} for values in table_rows]


In [None]:
# Example usage: parse every numbered quote PDF and pool the extracted rows.
# The input location and number range are kept together here so they can be
# changed without touching the loop.
QUOTE_DIR = "/Users/mageshbabu/Desktop/Projects/project_odens/Test files"
FIRST_QUOTE_NUM = 1
LAST_QUOTE_NUM = 50

all_quotes = []
missing_files = []  # expected paths that do not exist on disk
empty_files = []    # files that opened but produced no quote rows
for file_num in range(FIRST_QUOTE_NUM, LAST_QUOTE_NUM + 1):
    file_name = f"{QUOTE_DIR}/PdfNAP ({file_num}).pdf"
    try:
        rows = parse_quote_pdf(file_name)
    except FileNotFoundError:
        # A gap in the numbered series should not discard the rest of the batch.
        missing_files.append(file_name)
        continue
    if not rows:
        empty_files.append(file_name)
    all_quotes.extend(rows)

# Surface anything that did not contribute rows instead of dropping it silently.
if missing_files:
    print(f"Skipped {len(missing_files)} missing file(s): {missing_files}")
if empty_files:
    print(f"No quote rows found in {len(empty_files)} file(s): {empty_files}")

# `all_quotes` now contains a list of dictionaries, each representing one quote line with metadata.
# You can further convert this to CSV or DataFrame as needed.

In [None]:
import pandas as pd

# Destination of the combined export (relative to the working directory).
csv_path = "all_quotes_extracted.csv"

# One DataFrame row per parsed quote line; columns come from the dict keys.
df_all_quotes = pd.DataFrame(data=all_quotes)

# Write the table without the positional index column.
df_all_quotes.to_csv(path_or_buf=csv_path, index=False)

# Report how many rows ended up in the file.
print(f"✅ Saved {len(df_all_quotes)} rows to {csv_path}")
