In [None]:
import pdfplumber
import re

# Ordered (line prefix, output key) pairs for the "Label: value" lines of a quote.
# The first matching prefix wins, mirroring an if/elif chain.
_QUOTE_METADATA_FIELDS = (
    ("Datum:", "Datum"),
    ("Vår referens:", "Vår referens"),
    ("Er referens:", "Er referens"),
    ("Kund:", "Kund"),
    ("Verktygskostnad:", "Verktygskostnad"),
    # Document-level alloy; collected but deliberately not copied onto the rows.
    ("Legering:", "Legering_all"),
    ("Toleranser:", "Toleranser"),
    ("Ytbehandling:", "Ytbehandling"),
    ("Lev. längd:", "Lev. längd"),
    ("Lev. villkor:", "Lev. villkor"),
    ("Lev. tid:", "Lev. tid"),
    ("NOT:", "NOT"),
    ("Betalningsvillkor:", "Betalningsvillkor"),
    ("Giltighet:", "Giltighet"),
    ("Allmänna villkor:", "Allmänna villkor"),
    ("Råvara:", "Råvara (euro/kg)"),
)

# Lines inside the table region that start with one of these are header residue.
_QUOTE_TABLE_HEADER_PREFIXES = ("Profil", "Kund ref.", "Vikt", "Pris/st", "Kap", "SEK")

# Output keys for the seven table fields, in column order.
_QUOTE_TABLE_COLUMNS = (
    "Profil_namn",
    "Vikt (kg/m)",
    "Längd (m)",
    "Kap + truml (Pris/st)",
    "ca antal (Årsvolym st)",
    "Pris (kr/st) SEK",
    "Legering",
)


def _extract_table_rows(lines):
    """Return the quote table as a list of 7-item lists (name + 6 value tokens)."""
    # The table starts at the first header-looking line and ends at the
    # "Verktygskostnad:" line; scanning stops as soon as that line is found.
    table_start = None
    table_end = None
    for i, line in enumerate(lines):
        stripped = line.strip()
        if table_start is None and (
            "Profil nr" in line or (stripped.startswith("Profil") and "kg/m" in line)
        ):
            table_start = i
        if stripped.startswith("Verktygskostnad:"):
            table_end = i
            break

    if table_start is None or table_end is None:
        return []

    # Data begins after the LAST header-like line in the region: one that
    # mentions "Årsvolym", is exactly "SEK", or ends with "g:".
    data_start = table_start
    for j in range(table_start, table_end):
        stripped = lines[j].strip()
        if "Årsvolym" in stripped or stripped == "SEK" or stripped.endswith("g:"):
            data_start = j + 1

    table_rows = []
    for raw in lines[data_start:table_end]:
        stripped = raw.strip()
        # NOTE(review): this also discards a data row whose profile name happens
        # to begin with one of the header prefixes (e.g. "Profil ..." or "Kap ...").
        if not stripped or stripped.startswith(_QUOTE_TABLE_HEADER_PREFIXES):
            continue
        tokens = stripped.split()
        # A row needs a name plus six value tokens; shorter lines are ignored.
        if len(tokens) < 7:
            continue
        # The last six whitespace-separated tokens are the value columns;
        # everything before them is the (possibly multi-word) profile name.
        table_rows.append([" ".join(tokens[:-6]), *tokens[-6:]])
    return table_rows


def _extract_metadata(lines):
    """Collect the "Label: value" fields; a later line overrides an earlier one."""
    metadata = {}
    for line in lines:
        stripped = line.strip()
        for prefix, key in _QUOTE_METADATA_FIELDS:
            if stripped.startswith(prefix):
                metadata[key] = stripped.split(":", 1)[1].strip()
                break
    return metadata


def parse_quote_pdf(pdf_path):
    """Parse a quote PDF and return a list of dicts, one per quote line.

    Only the first page is read. Each dict holds the seven table fields
    (profile name, weight, length, cutting price, annual volume, price, alloy)
    as strings exactly as they appear in the PDF text, followed by the
    document-level metadata fields found on the page.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A list of dicts. The list is empty when the PDF has no pages, the first
        page has no extractable text, or the table boundaries are not found.

    Raises:
        Whatever ``pdfplumber.open`` raises for a missing or unreadable file.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A PDF may have no pages, and a page without a text layer may yield
        # None (older pdfplumber) or "" instead of text.
        text = pdf.pages[0].extract_text() if pdf.pages else None
    if not text:
        return []

    lines = text.splitlines()
    table_rows = _extract_table_rows(lines)
    if not table_rows:
        return []

    # "Legering_all" is the document-wide alloy; every row already carries its
    # own alloy token, so the document-wide value is not emitted.
    shared = {
        key: value
        for key, value in _extract_metadata(lines).items()
        if key != "Legering_all"
    }
    return [{**dict(zip(_QUOTE_TABLE_COLUMNS, fields)), **shared} for fields in table_rows]


In [2]:
# Example usage: parse every sample quote PDF and pool the resulting rows.
# The sample location and count live in two constants so that pointing the
# notebook at another machine or data set is a one-line change.
QUOTE_PDF_DIR = "/Users/mageshbabu/Desktop/Projects/project_odens/Test files"
QUOTE_PDF_COUNT = 50  # files are named "PdfNAP (1).pdf" ... "PdfNAP (50).pdf"

all_quotes = []
for file_num in range(1, QUOTE_PDF_COUNT + 1):
    file_name = f"{QUOTE_PDF_DIR}/PdfNAP ({file_num}).pdf"
    rows = parse_quote_pdf(file_name)
    all_quotes.extend(rows)

# `all_quotes` now contains a list of dictionaries, each representing one quote line with metadata.
# You can further convert this to CSV or DataFrame as needed.

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

In [3]:
import pandas as pd

# Convert list of dictionaries to DataFrame: one row per parsed quote line,
# one column per dictionary key.
df_all_quotes = pd.DataFrame(all_quotes)

# Save to CSV in the current working directory; index=False leaves the
# DataFrame's row index out of the file.
csv_path = "all_quotes_extracted.csv"
df_all_quotes.to_csv(csv_path, index=False)

# Report how many rows were written, as a quick sanity check.
print(f"✅ Saved {len(df_all_quotes)} rows to {csv_path}")


✅ Saved 198 rows to all_quotes_extracted.csv


In [8]:
# Preview the first rows of the combined quote table.
df_all_quotes.head()

Unnamed: 0,Profil_namn,Vikt (kg/m),Längd (m),Kap + truml (Pris/st),ca antal (Årsvolym st),Pris (kr/st) SEK,Legering,Datum,Vår referens,Er referens,...,Toleranser,Ytbehandling,Lev. längd,Lev. villkor,Lev. tid,NOT,Betalningsvillkor,Giltighet,Allmänna villkor,Råvara (euro/kg)
0,Karmlist,1342,238,78,40000,292,Rå,2025-02-25,Erik Svensson,Maria Lindgren,...,EN 755-9,EN-AW-6063-T5,Längder enligt ovan,Ex Works Halmstad,första 8-10 veckor från order därefter 5-6 veckor,"Minst 15000 bitar kapade, ok att blanda längde...","30 dagar netto. Dröjsmålsränta 7,5%",Offererade priser gäller fast för leveranser t...,NAPFV2017,"3,4 Euro / kg"
1,Karmlist,1342,258,78,40000,305,Rå,2025-02-25,Erik Svensson,Maria Lindgren,...,EN 755-9,EN-AW-6063-T5,Längder enligt ovan,Ex Works Halmstad,första 8-10 veckor från order därefter 5-6 veckor,"Minst 15000 bitar kapade, ok att blanda längde...","30 dagar netto. Dröjsmålsränta 7,5%",Offererade priser gäller fast för leveranser t...,NAPFV2017,"3,4 Euro / kg"
2,Karmlist,1342,238,78,85000,288,Rå,2025-02-25,Erik Svensson,Maria Lindgren,...,EN 755-9,EN-AW-6063-T5,Längder enligt ovan,Ex Works Halmstad,första 8-10 veckor från order därefter 5-6 veckor,"Minst 15000 bitar kapade, ok att blanda längde...","30 dagar netto. Dröjsmålsränta 7,5%",Offererade priser gäller fast för leveranser t...,NAPFV2017,"3,4 Euro / kg"
3,Karmlist,1342,258,78,85000,301,Rå,2025-02-25,Erik Svensson,Maria Lindgren,...,EN 755-9,EN-AW-6063-T5,Längder enligt ovan,Ex Works Halmstad,första 8-10 veckor från order därefter 5-6 veckor,"Minst 15000 bitar kapade, ok att blanda längde...","30 dagar netto. Dröjsmålsränta 7,5%",Offererade priser gäller fast för leveranser t...,NAPFV2017,"3,4 Euro / kg"
4,Bottenlist,1156,215,72,65000,265,Rå,2025-03-01,Gustav Bergström,Anders Johansson,...,EN 755-9,EN-AW-6063-T5,Längder enligt ovan,Ex Works Markaryd,första 7-9 veckor från order därefter 4-5 veckor,"Minst 18000 bitar kapade, ok att blanda längde...","30 dagar netto. Dröjsmålsränta 7,5%",Offererade priser gäller fast för leveranser t...,NAPFV2017,"3,2 Euro / kg"


In [9]:
# Per-column summary (count / unique / top / freq); include='all' makes
# describe() cover the text columns as well as any numeric ones.
df_all_quotes.describe(include='all')

Unnamed: 0,Profil_namn,Vikt (kg/m),Längd (m),Kap + truml (Pris/st),ca antal (Årsvolym st),Pris (kr/st) SEK,Legering,Datum,Vår referens,Er referens,...,Toleranser,Ytbehandling,Lev. längd,Lev. villkor,Lev. tid,NOT,Betalningsvillkor,Giltighet,Allmänna villkor,Råvara (euro/kg)
count,198,198,198,198,198,198,198,198,198,198,...,198,198,198,198,198,198,198,198,198,198
unique,50,46,52,18,48,76,1,50,50,50,...,1,1,1,2,4,6,1,21,1,8
top,Karmlist,1265,242,75,85000,278,Rå,2025-02-25,Erik Svensson,Maria Lindgren,...,EN 755-9,EN-AW-6063-T5,Längder enligt ovan,Ex Works Markaryd,första 8-10 veckor från order därefter 5-6 veckor,"Minst 15000 bitar kapade, ok att blanda längde...","30 dagar netto. Dröjsmålsränta 7,5%",Offererade priser gäller fast för leveranser t...,NAPFV2017,"3,3 Euro / kg"
freq,4,8,12,24,8,7,198,4,4,4,...,198,198,198,100,146,80,198,20,198,80


In [10]:
# Count missing values per column.
df_all_quotes.isnull().sum()

Profil_namn               0
Vikt (kg/m)               0
Längd (m)                 0
Kap + truml (Pris/st)     0
ca antal (Årsvolym st)    0
Pris (kr/st) SEK          0
Legering                  0
Datum                     0
Vår referens              0
Er referens               0
Kund                      0
Verktygskostnad           0
Toleranser                0
Ytbehandling              0
Lev. längd                0
Lev. villkor              0
Lev. tid                  0
NOT                       0
Betalningsvillkor         0
Giltighet                 0
Allmänna villkor          0
Råvara (euro/kg)          0
dtype: int64

In [None]:
# Distinct values of the "Verktygskostnad" column across all rows.
# Note: despite the df_ prefix, .unique() yields an array, not a DataFrame.
df_verktygskostnad = df_all_quotes["Verktygskostnad"].unique()
df_verktygskostnad

array(['14500kr', '12800kr', '15200kr', '13500kr', '16800kr', '12400kr',
       '14800kr', '16500kr', '13900kr', '11500kr', '12900kr', '14200kr',
       '15600kr', '13600kr', '13700kr', '14000kr', '14900kr', '13400kr',
       '14600kr', '13100kr', '12600kr', '15100kr', '14100kr', '13800kr',
       '12700kr', '14300kr', '13300kr', '14700kr', '15400kr', '13000kr',
       '13200kr', '15300kr', '12200kr'], dtype=object)

In [24]:
# Distinct values of the "Kund" column across all rows.
# Note: despite the df_ prefix, .unique() yields an array, not a DataFrame.
df_kund = df_all_quotes["Kund"].unique()
df_kund

array(['Sapa Fönster AB, Vetlanda', 'Elitfönster, Växjö',
       'Traryd Fönster, Lenhovda', 'Inwido Windows, Åstorp',
       'SP Fönster, Edsbyn', 'Norrfönster AB, Skellefteå',
       'Fönsterspecialisten, Kalmar', 'Westcoast Windows, Falkenberg',
       'Alufönster, Malmö', 'Glaskonst Fönster, Jönköping',
       'Väderskydd AB, Sundsvall', 'Nordiska Fönster, Umeå',
       'Moderna Fönster, Linköping', 'Kvalitetsfönster, Borås',
       'Nordform, Göteborg', 'Svealand Fönster, Örebro',
       'Glaspartier AB, Stockholm', 'Söderport Fönster, Helsingborg',
       'Prisma Fönster, Västerås', 'Ljungby Fönster, Ljungby',
       'Allglas AB, Norrköping', 'Arkitekturaluminium AB, Uppsala',
       'Energifönster, Falun', 'Kronobergs Fönster, Älmhult',
       'Norrlandsfönster, Luleå', 'Glas & Fasad AB, Karlstad',
       'Växjöfönster, Växjö', 'Smålandsfönster, Vetlanda',
       'Örnsköldsviks Fönster, Örnsköldsvik', 'Glasmästarna AB, Gävle',
       'Fönsterdesign, Uddevalla', 'Arkitektglas, Es

In [19]:
# NOTE(review): no visible cell defines `df_date`, so this cell fails with a
# NameError on a fresh kernel. The definition below is reconstructed from the
# recorded output (the "Datum" column in ascending order, original row index
# kept, dtype object) — confirm it matches the intended definition.
df_date = df_all_quotes["Datum"].sort_values()
df_date.head(20)

197    2025-02-24
195    2025-02-24
194    2025-02-24
196    2025-02-24
0      2025-02-25
2      2025-02-25
1      2025-02-25
3      2025-02-25
5      2025-03-01
4      2025-03-01
7      2025-03-01
6      2025-03-01
8      2025-03-03
9      2025-03-03
10     2025-03-03
11     2025-03-03
15     2025-03-05
13     2025-03-05
12     2025-03-05
14     2025-03-05
Name: Datum, dtype: object

In [20]:
# Show the last 20 entries of `df_date`.
# NOTE(review): `df_date` is not defined in any cell visible here — presumably
# the "Datum" column sorted ascending, judging by the recorded output; confirm
# it is defined earlier so the notebook survives Restart & Run All.
df_date.tail(20)

177    2025-06-18
175    2025-06-18
174    2025-06-18
176    2025-06-18
178    2025-06-20
179    2025-06-20
181    2025-06-20
180    2025-06-20
184    2025-06-22
185    2025-06-22
182    2025-06-22
183    2025-06-22
186    2025-06-25
187    2025-06-25
189    2025-06-25
188    2025-06-25
190    2025-06-28
191    2025-06-28
192    2025-06-28
193    2025-06-28
Name: Datum, dtype: object