In [27]:
from pathlib import Path
import json
import sqlite3
import pandas as pd
from IPython.display import display

# Repo root = current working directory (run the notebook from the repo root!)
REPO_ROOT = Path.cwd()
DB_PATH = REPO_ROOT / "db" / "odm.sqlite"

# MaxiAxi JSONL path (repo-relative)
MAXIAXI_DIR = REPO_ROOT / "data" / "raw" / "maxiaxi"
JSONL_PATH = MAXIAXI_DIR / "maxiaxi_items_20260121_093233.jsonl"

# Report the paths BEFORE connecting: sqlite3.connect() creates the database
# file when it is missing, so an existence check made after connecting would
# always print True and hide a wrong working directory.
print("DB:", DB_PATH, "exists:", DB_PATH.exists())
print("JSONL:", JSONL_PATH, "exists:", JSONL_PATH.exists())

# Fail fast instead of silently creating an empty database in the wrong place.
if not DB_PATH.exists():
    raise FileNotFoundError(
        f"Missing database: {DB_PATH} (run the notebook from the repo root)"
    )

conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row  # rows support access by column name, e.g. r["name"]
conn.execute("PRAGMA foreign_keys = ON;")


DB: /Users/feddekoster/Desktop/Assignment/Online-Data-Mining/db/odm.sqlite exists: True
JSONL: /Users/feddekoster/Desktop/Assignment/Online-Data-Mining/data/raw/maxiaxi/maxiaxi_items_20260121_093233.jsonl exists: True


In [28]:
def table_exists(table: str) -> bool:
    """Return True when sqlite_master lists a table named `table`."""
    lookup = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
        (table,)
    )
    return lookup.fetchone() is not None

def has_col(table, col):
    """Return True when `table` exists and has a column named `col`."""
    if table_exists(table):
        # PRAGMA table_info yields one row per column; "name" is the column name.
        names = [info["name"] for info in conn.execute(f"PRAGMA table_info({table})")]
        return col in names
    return False

def add_col(table, col, coltype):
    """Add column `col` of type `coltype` to `table`; no-op if the table is missing or the column exists."""
    if not table_exists(table):
        return
    if has_col(table, col):
        return
    conn.execute(f"ALTER TABLE {table} ADD COLUMN {col} {coltype}")

def uniq_idx(table, col):
    """Create the unique index ux_<table>_<col> on `table`(`col`) if the table exists."""
    if not table_exists(table):
        return
    ddl = f"CREATE UNIQUE INDEX IF NOT EXISTS ux_{table}_{col} ON {table}({col})"
    conn.execute(ddl)

def get_id(sql, params):
    """Run `sql` with `params` and return the first column of the first row, or None if there is no row."""
    first = conn.execute(sql, params).fetchone()
    if not first:
        return None
    return first[0]

def stable_int_key(s: str) -> int:
    """
    Stable positive integer key from a string (URL/category name/run_id),
    so it fits 'INTEGER' columns even if the source key is text.

    Returns None when `s` is None.

    The built-in hash() is NOT used: string hashes are salted per Python
    process, so keys would differ between kernel restarts. The key is the
    first 12 hex digits of the MD5 of the UTF-8 encoded string, which is the
    same formula as the stable_int_key defined in the JSONL import cell.
    """
    import hashlib  # local import: this cell runs before the one importing hashlib

    if s is None:
        return None
    digest = hashlib.md5(s.encode("utf-8")).hexdigest()
    # 12 hex digits = 48 bits, which fits in SQLite's signed 64-bit INTEGER.
    return int(digest[:12], 16)


In [29]:
# (table, key column) pairs that receive an INTEGER surrogate key.
_key_columns = (
    ("scraperun", "scrape_run_key"),
    ("product", "product_key"),
    ("productlisting", "listing_key"),
    ("category", "category_key"),
)

# Pass 1: add each column only if it is missing (safe to re-run).
for _table, _column in _key_columns:
    add_col(_table, _column, "INTEGER")

# Pass 2: make sure every key column carries a unique index.
for _table, _column in _key_columns:
    uniq_idx(_table, _column)

conn.commit()
print("Table updates done.")


Table updates done.


In [31]:
# Known competitors as (competitor_id, name, country, base_url).
COMPETITORS = [
    (1, "Bax-shop", "NL", "https://www.bax-shop.nl"),
    (2, "bol.com",  "NL", "https://www.bol.com"),
    (3, "MaxiAxi",  "NL", "https://www.maxiaxi.com"),
    (4, "Thomann",  "DE", "https://www.thomann.nl"),
]

MAXIAXI_ID = 3
MAXIAXI_COMPETITOR = next(c for c in COMPETITORS if c[0] == MAXIAXI_ID)

# UPSERT instead of INSERT OR REPLACE: REPLACE deletes the existing row before
# inserting the new one, which with foreign_keys = ON can fire ON DELETE actions
# on rows that reference this competitor. DO UPDATE changes the row in place.
# NOTE(review): assumes competitor.competitor_id is the PRIMARY KEY (or UNIQUE);
# confirm against the schema. Requires SQLite >= 3.24.
conn.execute(
    """
    INSERT INTO competitor (competitor_id, name, country, base_url)
    VALUES (?, ?, ?, ?)
    ON CONFLICT(competitor_id) DO UPDATE SET
        name = excluded.name,
        country = excluded.country,
        base_url = excluded.base_url
    """,
    MAXIAXI_COMPETITOR,
)
conn.commit()

print("Competitor inserted/updated:", MAXIAXI_COMPETITOR)


KeyboardInterrupt: 

In [None]:
# =========================
# Cell 5 — MaxiAxi JSONL import (COPY/PASTE)
# =========================

import json
import hashlib

MAXIAXI_ID = 3  # competitor_id for MaxiAxi (same value as in the competitor cell)

def table_cols(table: str) -> set[str]:
    """Return the set of column names reported by PRAGMA table_info for `table`."""
    names = set()
    for info in conn.execute(f"PRAGMA table_info({table})"):
        names.add(info["name"])
    return names

def insert_filtered(table: str, data: dict, or_ignore: bool = False):
    """
    Insert one row into `table`, keeping only the keys of `data` that are
    real columns of that table.

    With or_ignore=True the statement is INSERT OR IGNORE, so a row that
    violates a uniqueness constraint is skipped instead of raising.
    Nothing is executed when no key of `data` matches a column.
    """
    existing = table_cols(table)
    keys = [k for k in data if k in existing]
    if not keys:
        return

    values = tuple(data[k] for k in keys)
    placeholders = ",".join("?" for _ in keys)

    verb = "INSERT"
    if or_ignore:
        verb = "INSERT OR IGNORE"

    sql = f"{verb} INTO {table} ({','.join(keys)}) VALUES ({placeholders})"
    conn.execute(sql, values)

def stable_int_key(s: str) -> int:
    """
    Deterministic integer key derived from the MD5 of `s`
    (Python's hash() is avoided because it changes per run).

    Returns None for None; otherwise the first 6 digest bytes
    (= first 12 hex digits) read as a big-endian integer.
    """
    if s is None:
        return None
    raw = hashlib.md5(s.encode("utf-8")).digest()
    return int.from_bytes(raw[:6], "big")

def get_id(sql, params):
    """Execute `sql` with `params`; return column 0 of the first row, or None when nothing matches."""
    hit = conn.execute(sql, params).fetchone()
    if hit:
        return hit[0]
    return None

# Lookup caches filled during the import
listing_id_by_url = dict()    # product_url  -> listing_id
category_id_by_key = dict()   # category_key -> category_id

# Stop early when the JSONL export is not where we expect it.
if not JSONL_PATH.exists():
    raise FileNotFoundError(f"Missing file: {JSONL_PATH.resolve()}")

# Read each table's column set once up front instead of inside the loop.
(
    cols_category,
    cols_listing,
    cols_product,
    cols_pageraw,
    cols_price,
    cols_review,
) = [
    table_cols(_name)
    for _name in (
        "category",
        "productlisting",
        "product",
        "pageraw",
        "pricesnapshot",
        "review",
    )
]

# Import every record of the JSONL file into the database.
#
# `with conn` turns the whole import into one unit: it commits when the loop
# finishes and rolls back when any statement raises, so a half-imported file
# is never left pending for a later cell's commit to persist.
#
# NOTE(review): pricesnapshot and review rows use a plain INSERT, so re-running
# this cell adds them again unless the schema has a uniqueness constraint that
# rejects duplicates — confirm against the schema.
with conn, JSONL_PATH.open("r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue

        # Only malformed JSON is tolerated; any other error should surface.
        try:
            r = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"[WARN] JSON parse error at line {line_no}: {e}")
            continue

        # Valid JSON that is not an object (list, number, ...) has no .get().
        if not isinstance(r, dict):
            print(f"[WARN] Skipping non-object JSON at line {line_no}")
            continue

        t = r.get("type")

        # -----------------------------
        # PAGERAW (JSONL: type=pageraw)
        # -----------------------------
        if t == "pageraw":
            # Both "url" and "source_url" are offered; insert_filtered keeps
            # whichever of them is an actual column of pageraw.
            insert_filtered("pageraw", {
                "scraped_at": r.get("scraped_at"),
                "competitor_id": MAXIAXI_ID,
                "competitor_key": r.get("competitor_key"),
                "url": r.get("url"),
                "source_url": r.get("url"),
            }, or_ignore=True)
            continue

        # -----------------------------
        # PRODUCTLISTING (JSONL: type=productlisting)
        # -----------------------------
        if t == "productlisting":
            product_url = r.get("product_url")
            if not product_url:
                continue

            # CATEGORY (derived from category_name, falling back to category)
            category_id = None
            cat_name = r.get("category_name") or r.get("category")
            if cat_name and ("category_id" in cols_category):
                ck = stable_int_key(f"{MAXIAXI_ID}|{cat_name}")

                if ck not in category_id_by_key:
                    insert_filtered("category", {
                        "competitor_id": MAXIAXI_ID,
                        "name": cat_name,
                        "url": None,
                        "parent_category_id": None,
                        "category_key": ck,  # inserted only if col exists
                    }, or_ignore=True)

                    # Resolve category_id by key when the column exists,
                    # otherwise by (competitor_id, name).
                    if "category_key" in cols_category:
                        category_id_by_key[ck] = get_id(
                            "SELECT category_id FROM category WHERE category_key=?",
                            (ck,)
                        )
                    else:
                        category_id_by_key[ck] = get_id(
                            "SELECT category_id FROM category WHERE competitor_id=? AND name=?",
                            (MAXIAXI_ID, cat_name)
                        )

                category_id = category_id_by_key.get(ck)

            # Stable numeric listing_key (used only if the schema has the column)
            lk = stable_int_key(product_url)

            # 1) INSERT PRODUCTLISTING FIRST (so listing_id can be looked up)
            insert_filtered("productlisting", {
                "competitor_id": MAXIAXI_ID,
                "category_id": category_id,
                "product_url": product_url,
                "title_on_page": r.get("product_name"),
                "image_url_src": r.get("image_url_on_pdp") or r.get("image_url"),
                "img_url_cdn": None,
                "gtin_ean_upc_on_page": r.get("ean"),
                "description_clean": r.get("description_clean"),
                "listing_key": lk,
                "sku": r.get("sku"),
                "ean": r.get("ean"),
            }, or_ignore=True)

            # 2) FETCH listing_id (cache first, then the database)
            listing_id = listing_id_by_url.get(product_url)
            if listing_id is None and ("listing_id" in cols_listing):
                if "listing_key" in cols_listing:
                    listing_id = get_id(
                        "SELECT listing_id FROM productlisting WHERE listing_key=?",
                        (lk,)
                    )
                else:
                    listing_id = get_id(
                        "SELECT listing_id FROM productlisting WHERE competitor_id=? AND product_url=?",
                        (MAXIAXI_ID, product_url)
                    )
                listing_id_by_url[product_url] = listing_id

            # Skip the record when product needs a listing_id we could not resolve.
            if listing_id is None and ("listing_id" in cols_product):
                print(f"[WARN] Could not resolve listing_id for url at line {line_no}: {product_url}")
                continue

            # 3) INSERT PRODUCT (original author's note: product.listing_id is NOT NULL)
            pk = stable_int_key(product_url)
            insert_filtered("product", {
                "listing_id": listing_id,
                "canonical_name": r.get("product_name"),
                "brand": r.get("brand"),
                "model": r.get("model"),
                "product_key": pk,
            }, or_ignore=True)

            # -----------------------------
            # PRICESNAPSHOT (only if at least one price/stock key is present)
            # -----------------------------
            price_fields_present = any(
                k in r for k in [
                    "current_price", "base_price", "discount_amount",
                    "discount_percent", "currency", "price_text", "in_stock",
                    "stock_status_text"
                ]
            )

            if price_fields_present and listing_id is not None:
                insert_filtered("pricesnapshot", {
                    "listing_id": listing_id,
                    "scraped_at": r.get("scraped_at"),
                    "currency": r.get("currency"),
                    "current_price": r.get("current_price"),
                    "base_price": r.get("base_price"),
                    "discount_amount": r.get("discount_amount"),
                    "discount_percent": r.get("discount_percent"),
                    "price_text": r.get("price_text"),
                    "in_stock": r.get("in_stock"),
                    "stock_status_text": r.get("stock_status_text"),
                    "competitor_id": MAXIAXI_ID,
                }, or_ignore=False)

            # -----------------------------
            # REVIEW (aggregate; only if a rating or a review count is present)
            # -----------------------------
            if listing_id is not None and (r.get("rating_value") is not None or r.get("review_count") is not None):
                insert_filtered("review", {
                    "listing_id": listing_id,
                    "created_at": r.get("scraped_at"),
                    "rating_value": r.get("rating_value"),
                    "rating_scale": r.get("rating_scale"),
                    "review_count": r.get("review_count"),
                    "review_url": product_url,
                }, or_ignore=False)

            continue

        # Ignore other types (e.g., "run")

# Reaching this line means the `with conn` block committed the import.
print("Imported MaxiAxi JSONL:", JSONL_PATH.resolve())


In [None]:
def show(table, n=5):
    """Display the first `n` rows of `table`, or print a notice when the table does not exist."""
    if table_exists(table):
        query = f"SELECT * FROM {table} LIMIT {n}"
        preview = pd.read_sql_query(query, conn)
        print(f"\n=== {table} ({len(preview)} rows shown) ===")
        display(preview)
    else:
        print(f"\n=== {table} (missing table) ===")

# Preview every table of interest, in this fixed order.
_tables_to_preview = (
    "competitor", "scraperun", "category", "product", "productlisting",
    "pricesnapshot", "review", "productmatch", "pagelink", "page",
)
for t in _tables_to_preview:
    show(t)


In [None]:
# competitor_id of MaxiAxi; show_maxiaxi filters on it for tables that have a competitor_id column.
MAXIAXI_ID = 3

def show_maxiaxi(table, n=5):
    """
    Display up to `n` rows of `table`, restricted to MaxiAxi when the table
    has a competitor_id column; print a notice when the table is missing.
    """
    if not table_exists(table):
        print(f"\n=== {table} (missing table) ===")
        return

    # Build the query in pieces: the competitor filter only applies to tables
    # that carry competitor_id directly.
    query = f"SELECT * FROM {table}"
    if has_col(table, "competitor_id"):
        query += f" WHERE competitor_id = {MAXIAXI_ID}"
    query += f" LIMIT {n}"

    rows = pd.read_sql_query(query, conn)

    print(f"\n=== {table} (MaxiAxi only where applicable) ===")
    display(rows)

# Tables shown directly (filtered on competitor_id where that column exists).
for t in ["category", "productlisting", "page"]:
    show_maxiaxi(t)

# pricesnapshot is filtered through productlisting, which carries competitor_id.
# The competitor id is bound as a parameter from MAXIAXI_ID rather than repeated
# as a literal in the SQL, so it cannot drift from the constant.
if table_exists("pricesnapshot") and table_exists("productlisting"):
    df = pd.read_sql_query(
        """
        SELECT ps.*
        FROM pricesnapshot ps
        JOIN productlisting pl ON pl.listing_id = ps.listing_id
        WHERE pl.competitor_id = ?
        LIMIT 5
        """,
        conn,
        params=(MAXIAXI_ID,),
    )
    print("\n=== pricesnapshot (MaxiAxi only) ===")
    display(df)
