bax db load


In [6]:
import json
import sqlite3
from pathlib import Path
import pandas as pd

# All paths are resolved relative to the notebook's working directory; change
# REPO_ROOT if the notebook is not started from the repository root.
REPO_ROOT = Path(".")
DB_PATH = REPO_ROOT / "db" / "odm.sqlite"
JSONL_PATH = REPO_ROOT / "data" / "raw" / "bax" / "bax_products.json"

# sqlite3.connect() creates a new, empty database file when the path does not
# exist, which would only surface later as "no such table" errors. Fail fast
# with a clear message instead.
if not DB_PATH.is_file():
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH.resolve()}")

conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row  # lets later cells read columns by name, e.g. row["name"]
conn.execute("PRAGMA foreign_keys = ON;")  # SQLite only enforces FKs on connections that enable them

print("DB:", DB_PATH.resolve())
print("JSONL:", JSONL_PATH.resolve())

DB: /Users/luukhoogeveen/Desktop/python/Online-Data-Mining/db/odm.sqlite
JSONL: /Users/luukhoogeveen/Desktop/python/Online-Data-Mining/data/raw/bax/bax_products.json


In [7]:
# Register Bax as competitor 1, reusing the connection opened in the setup
# cell (row factory and foreign-key enforcement are already configured there)
# instead of opening a second connection and abandoning the first one.
#
# INSERT OR REPLACE is deliberately avoided: REPLACE resolves a key conflict by
# deleting the existing row before inserting the new one, and with foreign keys
# enabled that delete can trigger ON DELETE actions on rows that reference the
# competitor. Inserting only when missing and then updating in place keeps the
# existing row (and everything pointing at it) intact.
with conn:  # commits on success, rolls back if either statement fails
    conn.execute("""
    INSERT OR IGNORE INTO competitor (competitor_id, name, country, base_url)
    VALUES (1, 'bax-shop.nl', 'NL', 'https://www.bax-shop.nl')
    """)
    conn.execute("""
    UPDATE competitor
    SET name = 'bax-shop.nl', country = 'NL', base_url = 'https://www.bax-shop.nl'
    WHERE competitor_id = 1
    """)

In [12]:
import json
from pathlib import Path

# Import competitor-level support records (one JSON object per line) into the
# customer_service and expert_support tables. Only records whose "type" is
# CUSTOMER_SERVICE or EXPERT_SUPPORT are used; every other line is ignored.
#
# NOTE(review): both statements are plain INSERTs, so re-running this cell adds
# the same rows again unless the schema has a uniqueness constraint — confirm.
SUPPORT_JSON = Path("data/raw/bax/bax_products.json")
BAX_ID = 1

if SUPPORT_JSON.exists():
    # Rows written per record type, so a file without any support records is
    # visible in the output instead of being reported as a successful import.
    support_rows_imported = {"CUSTOMER_SERVICE": 0, "EXPERT_SUPPORT": 0}

    # `with conn` makes the import atomic: everything is committed when the
    # file was read completely and rolled back if any line fails (for example
    # on malformed JSON), so a failed run leaves no half-imported rows behind.
    with conn, SUPPORT_JSON.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            r = json.loads(line)
            t = r.get("type")

            if t == "CUSTOMER_SERVICE":
                conn.execute("""
                    INSERT INTO customer_service (
                        competitor_id, listing_id, scraped_at,
                        shipping_included, free_shipping_threshold_amt,
                        pickup_point_available, delivery_shipping_available,
                        delivery_courier_available, cooling_off_days,
                        free_returns, warranty_provider, warranty_duration_months,
                        customer_service_url
                    )
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    BAX_ID,
                    None,  # listing_id optional; set if support is per-listing
                    r.get("scraped_at"),
                    r.get("shipping_included"),
                    r.get("free_shipping_threshold_amt"),
                    r.get("pickup_point_available"),
                    r.get("delivery_shipping_available"),
                    r.get("delivery_courier_available"),
                    r.get("cooling_off_days"),
                    r.get("free_returns"),
                    r.get("warranty_provider"),
                    r.get("warranty_duration_months"),
                    r.get("customer_service_url"),
                ))
                support_rows_imported[t] += 1

            elif t == "EXPERT_SUPPORT":
                conn.execute("""
                    INSERT INTO expert_support (
                        competitor_id, scraped_at, source_url,
                        expert_chat_available, phone_support_available,
                        email_support_available, in_store_support,
                        expert_support_text
                    )
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    BAX_ID,
                    r.get("scraped_at"),
                    r.get("source_url"),
                    r.get("expert_chat_available"),
                    r.get("phone_support_available"),
                    r.get("email_support_available"),
                    r.get("in_store_support"),
                    r.get("expert_support_text"),
                ))
                support_rows_imported[t] += 1

    print("Imported support JSON:", SUPPORT_JSON)
    print("Rows imported per type:", support_rows_imported)
else:
    print("Missing file:", SUPPORT_JSON)

Imported support JSON: data/raw/bax/bax_products.json


In [13]:
def has_col(table, col):
    """Report whether `table` currently has a column named exactly `col`.

    Runs ``PRAGMA table_info`` for `table` on the module-level ``conn`` and
    compares `col` against the "name" field of every returned row. Rows are
    read by key, so the connection's rows must support name-based access.

    Both arguments are interpolated into the SQL text as-is; pass trusted
    identifiers only.
    """
    column_rows = conn.execute(f"PRAGMA table_info({table})").fetchall()
    existing_names = set(info["name"] for info in column_rows)
    return col in existing_names

def add_col(table, col, coltype):
    """Add column `col` with declared type `coltype` to `table` if it is missing.

    Does nothing when ``has_col`` reports that the column already exists, so
    the call is safe to repeat. The statement is executed on the module-level
    ``conn`` and is not committed here.

    All arguments are interpolated into the SQL text as-is; pass trusted
    identifiers only.
    """
    if has_col(table, col):
        return
    ddl = f"ALTER TABLE {table} ADD COLUMN {col} {coltype}"
    conn.execute(ddl)

def uniq_idx(table, col):
    """Create the unique index ``ux_<table>_<col>`` on `table`(`col`) if absent.

    Uses ``CREATE UNIQUE INDEX IF NOT EXISTS`` on the module-level ``conn``, so
    repeating the call is harmless. The statement is not committed here.

    Both arguments are interpolated into the SQL text as-is; pass trusted
    identifiers only.
    """
    ddl = f"CREATE UNIQUE INDEX IF NOT EXISTS ux_{table}_{col} ON {table}({col})"
    conn.execute(ddl)

In [14]:
# Natural-key columns used to map scraped JSON records onto database rows.
#
# product_key and listing_key are declared TEXT because the loader stores the
# record's source_url (a string) in them; scrape_run_key and category_key keep
# their INTEGER declaration. add_col() is a no-op when a column already
# exists, so databases that already have these columns are left unchanged.
add_col("scraperun", "scrape_run_key", "INTEGER")
add_col("product", "product_key", "TEXT")
add_col("productlisting", "listing_key", "TEXT")
add_col("category", "category_key", "INTEGER")

# Unique indexes let INSERT OR IGNORE skip records whose key is already loaded.
uniq_idx("scraperun", "scrape_run_key")
uniq_idx("product", "product_key")
uniq_idx("productlisting", "listing_key")
uniq_idx("category", "category_key")

conn.commit()

In [None]:
# Load "run" and "product" records from the Bax JSON-lines export into the
# database. Each product record produces a product, a product listing, and —
# where the required ids can be resolved — a price snapshot, an aggregate
# review, a customer_service row and a product match.
#
# NOTE(review): pricesnapshot, review, customer_service and productmatch use
# plain INSERTs, so re-running this cell adds those rows again unless the
# schema has uniqueness constraints on them — confirm.
BAX_ID = 1

# key -> id maps
scrape_run_id_by_key = {}
product_id_by_key = {}
listing_id_by_key = {}

# Records that could not be linked are counted and reported at the end rather
# than being dropped without a trace.
skipped_snapshots = 0
skipped_reviews = 0

def get_id(sql, params):
    """Return the first column of the first row matched by `sql`, or None.

    `sql` is executed on the module-level ``conn`` with `params` bound as query
    parameters.
    """
    row = conn.execute(sql, params).fetchone()
    return row[0] if row else None

# One transaction for the whole file: `with conn` commits when every line was
# processed and rolls back on any error, so a failed run leaves nothing behind
# that a re-run would duplicate. It also avoids a commit per record. Lookups
# on the same connection see the rows inserted earlier in the transaction.
with conn, JSONL_PATH.open("r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        r = json.loads(line)
        t = r.get("type")

        # SCRAPERUN
        if t == "run":
            srk = r.get("scrape_run_id")
            # NOTE(review): the record's git_commit_hash is written to the
            # gpt_context_hash column — confirm this mapping is intended.
            conn.execute("""
                INSERT OR IGNORE INTO scraperun (started_at, gpt_context_hash, crawler_version, notes, scrape_run_key)
                VALUES (?, ?, ?, ?, ?)
            """, (
                r.get("started_at"),
                r.get("git_commit_hash"),
                r.get("crawler_version"),
                r.get("notes"),
                srk,
            ))
            # Look the id up instead of relying on the insert, because
            # INSERT OR IGNORE may have skipped an already-loaded run.
            scrape_run_id_by_key[srk] = get_id(
                "SELECT scrape_run_id FROM scraperun WHERE scrape_run_key=?",
                (srk,)
            )

        # PRODUCT
        elif t == "product":
            # Use source_url as keys
            pk = r.get("source_url")
            lk = r.get("source_url")
            srk = r.get("scrape_run_id")

            # Insert product
            conn.execute("""
                INSERT OR IGNORE INTO product (canonical_name, brand, model, product_key)
                VALUES (?, ?, ?, ?)
            """, (
                r.get("canonical_name"),
                r.get("brand"),
                r.get("model"),
                pk
            ))
            product_id_by_key[pk] = get_id(
                "SELECT product_id FROM product WHERE product_key=?",
                (pk,)
            )

            # Insert productlisting
            conn.execute("""
                INSERT OR IGNORE INTO productlisting (
                    competitor_id, category_id, product_url, title_on_page,
                    image_url_src, img_url_cdn, gtin_ean_upc_on_page,
                    description_clean, listing_key
                )
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                BAX_ID,
                None,  # category_id
                r.get("source_url"),
                r.get("title"),
                r.get("image_url"),
                None,
                r.get("gtin"),
                r.get("description"),
                lk
            ))
            listing_id_by_key[lk] = get_id(
                "SELECT listing_id FROM productlisting WHERE listing_key=?",
                (lk,)
            )

            # Both ids are None when the lookup found no row, e.g. when the
            # record has no source_url or no "run" record with this
            # scrape_run_id has been read from the file yet.
            listing_id = listing_id_by_key.get(lk)
            scrape_run_id = scrape_run_id_by_key.get(srk)

            # Insert pricesnapshot
            if listing_id is not None and scrape_run_id is not None:
                conn.execute("""
                    INSERT INTO pricesnapshot (
                        listing_id, scrape_run_id, scraped_at, currency,
                        current_price, base_price, discount_amount, discount_percent,
                        price_text, in_stock, stock_status_text
                    )
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    listing_id,
                    scrape_run_id,
                    r.get("scraped_at"),
                    r.get("currency"),
                    r.get("current_price"),
                    r.get("base_price"),
                    r.get("discount_amount"),
                    r.get("discount_percent"),
                    r.get("price_text"),
                    r.get("in_stock"),
                    r.get("stock_status_text"),
                ))
            else:
                skipped_snapshots += 1

            # Insert review (aggregate) — only when it can be attached to a
            # listing, consistent with the pricesnapshot and productmatch guards.
            if r.get("rating_value") is not None:
                if listing_id is not None:
                    conn.execute("""
                        INSERT INTO review (
                            listing_id, rating_value, rating_scale, review_count
                        )
                        VALUES (?, ?, ?, ?)
                    """, (
                        listing_id,
                        r.get("rating_value"),
                        r.get("rating_scale"),
                        r.get("review_count"),
                    ))
                else:
                    skipped_reviews += 1

            # Insert customer_service (listing_id may be None; the row is then
            # only linked to the competitor)
            conn.execute("""
                INSERT INTO customer_service (
                    competitor_id, listing_id, scraped_at,
                    shipping_included, free_shipping_threshold_amt,
                    pickup_point_available, delivery_shipping_available,
                    delivery_courier_available, cooling_off_days,
                    free_returns, warranty_provider, warranty_duration_months,
                    customer_service_url
                )
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                BAX_ID,
                listing_id,
                r.get("scraped_at"),
                r.get("shipping_included"),
                r.get("free_shipping_threshold_amt"),
                r.get("pickup_point_available"),
                r.get("delivery_shipping_available"),
                r.get("delivery_courier_available"),
                r.get("cooling_off_days"),
                r.get("free_returns"),
                r.get("warranty_provider"),
                r.get("warranty_duration_months"),
                r.get("customer_service_url"),
            ))

            # Insert productmatch
            product_id = product_id_by_key.get(pk)
            if product_id is not None and listing_id is not None:
                conn.execute("""
                    INSERT INTO productmatch (
                        product_id, listing_id, match_method, match_score, matched_at
                    )
                    VALUES (?, ?, ?, ?, ?)
                """, (
                    product_id,
                    listing_id,
                    "auto",
                    1.0,
                    r.get("scraped_at"),
                ))

print("Loaded Bax data")
if skipped_snapshots or skipped_reviews:
    print(
        "Skipped (unresolved listing or scrape run):",
        skipped_snapshots, "price snapshots,",
        skipped_reviews, "reviews",
    )

NameError: name 'JSON_PATH' is not defined