In [1]:
from urllib.parse import urljoin, urlencode, urlparse, parse_qsl, urlunparse
import pandas as pd
import re, time, requests, json
from typing import List, Dict, Tuple, Optional, Any
import warnings
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning)
import time
from urllib.parse import quote

# helpers

In [11]:
CHROME_HEADERS = { # macOS Chrome UA
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/129.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    # "Connection": "keep-alive",
}

def http_request(url, base_headers = CHROME_HEADERS, 
                 timeout = (5, 15), retries: int = 2, sleep: float = 8,
                 method: str = 'GET', json_data: dict = None, data: Any = None):
    """
    Unified HTTP request function supporting both GET and POST.
    """

    # meta-charset sniffing for HTML/XML
    _META_CHARSET_RE = re.compile(
        br"""(?ix)
        (?:<meta[^>]+charset=["']?\s*([a-z0-9_\-]+)\s*["']?[^>]*>)|
        (?:<meta[^>]+http-equiv=["']?content-type["']?[^>]+content=["'][^"']*;\s*charset=([a-z0-9_\-]+)[^"']*["'][^>]*>)|
        (?:^<\?xml[^>]*encoding=["']\s*([a-z0-9_\-]+)\s*["'])
        """
    )
    def _sniff_charset(raw_head: bytes) -> Optional[str]:
        m = _META_CHARSET_RE.search(raw_head)
        if not m:
            return None
        enc = next((g for g in m.groups() if g), None)
        if not enc:
            return None
        enc = enc.decode("ascii", "ignore").lower()
        if enc in ("gbk", "cp936"):
            return "gb18030"
        if enc == "utf8":
            return "utf-8"
        return enc

    def _attempt(verify: bool) -> Tuple[str, Dict[str, str]]:
        def auto_parse(text):
            try:
                if text.lstrip("\ufeff \t\r\n").startswith(("{", "[")):
                    return json.loads(text)
                if "html" in text[:1000].lower():
                    return BeautifulSoup(text, "html.parser")
                    # return BeautifulSoup(text, "lxml")
                else:
                    return text
            except Exception as e:
                print(f'[ERROR] auto_parse (json or html) failed')

        # Prepare request kwargs
        req_kwargs = {
            'headers': base_headers,
            'timeout': timeout,
            'verify': verify
        }
        
        # Add POST data if provided
        if method.upper() == 'POST':
            if json_data is not None:
                req_kwargs['json'] = json_data
            elif data is not None:
                req_kwargs['data'] = data
        
        # Make request
        if method.upper() == 'POST':
            r = requests.post(url, **req_kwargs)
        else:
            r = requests.get(url, **req_kwargs)

        r.raise_for_status() # raises on 400–599 -> requests.HTTPError

        heads = {k.lower(): v for k, v in r.headers.items()}
        ct = heads.get("content-type", "")

        # JSON is UTF-8 by spec; don't overthink it unless server lies badly
        if "application/json" in ct:
            r.encoding = r.encoding or "utf-8"
            return auto_parse(r.text), heads

        # otherwise (HTML/...), choose best encoding BEFORE using .text
        head_bytes = r.content[:32768]
        sniff = _sniff_charset(head_bytes)
        if sniff:
            r.encoding = sniff
        else:
            if not r.encoding or r.encoding.lower() in ("iso-8859-1", "ascii"):
                if getattr(r, "apparent_encoding", None):
                    r.encoding = r.apparent_encoding
                else:
                    r.encoding = "utf-8"

            if r.encoding.lower().startswith("utf-8"):
                hb = head_bytes.lower()
                if b"charset=gb" in hb or b"charset = gb" in hb:
                    r.encoding = "gb18030"

        return auto_parse(r.text), heads

    for _ in range(max(1, retries)):
        try:
            return _attempt(verify=True)
        except requests.exceptions.SSLError as e:
            # try with SSL verification disabled (public pages only!)
            try:
                return _attempt(verify=False)
            except Exception as e:
                pass
        except Exception as e:
            pass
        # retry
        time.sleep(sleep)

    # raise last_err or RuntimeError(f'request failed for {url}')
    raise RuntimeError(f'request failed for {url}')


def _turn_page(url: str, page: int) -> str:
    if page <= 1:
        return url
    u = urlparse(url)
    q = dict(parse_qsl(u.query, keep_blank_values=True))
    q["page"] = str(page)
    new_q = urlencode(q)
    return urlunparse((u.scheme, u.netloc, u.path, u.params, new_q, u.fragment))

def get_num(text: str):
    try:
        m = re.compile(r"-?\d{1,3}(?:,\d{3})*(?:\.\d+)?|-?\d+(?:\.\d+)?").search(text)
        num = m.group(0).replace(",", "")
        return float(num)
    except Exception:
        return None
    
def _tokens(title: str):
    if not isinstance(title, str):
        return set()
    return set(re.compile(r"[a-z0-9]+").findall(title.lower()))

def _jaccard(a: set, b: set) -> float:
    if not a or not b:
        return 0.0
    inter = len(a & b)
    union = len(a | b)
    return inter / union if union else 0.0

# scrapers

## iconic

In [4]:

def scrape_iconic(brand="misha-collection", threshold=0):
    start_url = f"https://www.theiconic.com.au/{brand}"

    rows = []
    page = 1

    while True:
        url = _turn_page(start_url, page)
        soup, _ = http_request(url)

        items = soup.select("a.product-details")
        if not items:
            break

        for a in items:

            price_final = get_num(a.select_one("span.price.final").get_text(" ", strip=True) 
                                       if a.select_one("span.price.final") else None)
            price_original = get_num(a.select_one("span.price.original").get_text(" ", strip=True) 
                                          if a.select_one("span.price.original") else None)
            # price_plain = get_num(a.select_one("span.price").get_text(" ", strip=True) 
            #                            if a.select_one("span.price") else None)
            
            if price_final and price_original:
                price_diff = price_original-price_final
                if price_diff>=threshold:

                    brand = (a.select_one("span.brand").get_text(strip=True)
                            if a.select_one("span.brand") else "").strip()
                    name = (a.select_one("span.name").get_text(strip=True)
                            if a.select_one("span.name") else "").strip()
                    title = (f"{brand} {name}").strip()
                    
                    href = a.get("href", "").strip()
                    link = urljoin("https://www.theiconic.com.au", href) if href else ""

                    # print(title, " ", price, " ", link)
                    rows.append({"title": title, "price": price_final, "was": price_original, "diff": price_diff, "link": link})

        page += 1

    df = pd.DataFrame(
        rows, columns=["title", "price", "was", "diff", "link"]
    ).sort_values("diff", ascending=False).reset_index(drop=True)

    return df

scrape_iconic("misha-collection")


Unnamed: 0,title,price,was,diff,link
0,MISHA Irina LS Dress,105.0,420.0,315.0,https://www.theiconic.com.au/irina-ls-dress-25...
1,MISHA The Kendall LS Jersey Dress,138.6,440.0,301.4,https://www.theiconic.com.au/the-kendall-ls-je...
2,MISHA Echo Knit Dress,126.0,420.0,294.0,https://www.theiconic.com.au/echo-knit-dress-2...
3,MISHA Aubree Cotton Sateen Midi Dress - ICONIC...,72.0,360.0,288.0,https://www.theiconic.com.au/aubree-cotton-sat...
4,MISHA The Horizon Satin Dress,177.1,460.0,282.9,https://www.theiconic.com.au/the-horizon-satin...
...,...,...,...,...,...
98,MISHA The 'Alusia' Dress,252.0,360.0,108.0,https://www.theiconic.com.au/the-alusia-dress-...
99,MISHA Nyra Mesh Gown,252.0,360.0,108.0,https://www.theiconic.com.au/nyra-mesh-gown-25...
100,MISHA Lilia Gown,252.0,360.0,108.0,https://www.theiconic.com.au/lilia-gown-256278...
101,MISHA Alusia Maxi Dress,238.0,340.0,102.0,https://www.theiconic.com.au/alusia-maxi-dress...


## VIKTORIA & WOODS

In [5]:

def scrape_vw(start_url="https://gqj2zz.a.searchspring.io/api/search/search.json?domain=https%3A%2F%2Fviktoriaandwoods.com.au%2Fcollections%2Fshop-sale&resultsFormat=native&redirectResponse=full&bgfilter.collection_handle=shop-sale&resultsPerPage=24&siteId=gqj2zz", threshold=0):

    rows = []
    page = 1

    while True:
        url = _turn_page(start_url, page)
        json, _ = http_request(url)

        items = json.get("results")
        if not items:
            break

        for i in items:
            # print(i.get("name"))

            price = float(i.get("price"))
            was = float(i.get("msrp") or 0)
            diff = was-price

            if diff>=threshold:

                brand = "Viktoria & Woods"
                name = (i.get("name") if i.get("name") else "").strip()
                title = (f"{brand} {name}").strip()

                href = i.get("handle")
                link = urljoin("https://viktoriaandwoods.com.au/products/", href) if href else ""
                
                rows.append({"title": title, "price": price, "was": was, "diff": diff, "link": link})

        page += 1

    df = pd.DataFrame(
        rows, columns=["title", "price", "was", "diff", "link"]
    ).sort_values("diff", ascending=False).reset_index(drop=True)    
    return df

print(scrape_vw().to_string(index=False, line_width=None))

                                           title  price    was  diff                                                                                        link
            Viktoria & Woods Camaro Leather Pant  499.0 1400.0 901.0                      https://viktoriaandwoods.com.au/products/camaro-leather-pant-chocolate
                  Viktoria & Woods Elitist Skirt  399.0 1100.0 701.0                                https://viktoriaandwoods.com.au/products/elitist-skirt-black
                Viktoria & Woods Fairytale Dress  299.0  850.0 551.0                              https://viktoriaandwoods.com.au/products/fairytale-dress-black
           Viktoria & Woods Opioid Leather Skirt  699.0 1200.0 501.0                       https://viktoriaandwoods.com.au/products/opioid-leather-skirt-sangria
             Viktoria & Woods Brotherhood Trench  249.0  750.0 501.0                    https://viktoriaandwoods.com.au/products/brotherhood-trench-smoke-grey-1
                 Viktoria & Woods 

## dj

In [35]:
def get_product_info(item):
    brand_elem = item.select_one("p.ProductCard_brand__SYBe7")
    brand = brand_elem.get_text(strip=True) if brand_elem else ""
    name_elem = item.select_one("h2.ProductCard_name__p_7X2")
    name = name_elem.get_text(strip=True) if name_elem else ""
    title = f"{brand} {name}".strip()

    link = item.select_one("div.yotpo-widget-instance")['data-yotpo-url']
    link = urljoin("https://www.davidjones.com", link)    
    
    # Extract product ID from link for special offers API
    product_id = None
    id_match = re.search(r'-(\d+)(?:\?|$)', link)
    product_id = id_match.group(1)
    
    # Price extraction from Price_root__y8UOm
    price_root = item.select_one("div.Price_root__y8UOm")
    price_plain = None
    price_now = None
    
    # Check the accessibility text for price info
    # Pattern: "Price is now $220.00, it was $443.00" or "Price $399.00"
    accessibility_text = price_root.select_one("span[style*='position:absolute']")
    if accessibility_text:
        text = accessibility_text.get_text(strip=True)
        # Check if it's a sale price
        if "it was" in text.lower():
            # Pattern: "Price is now $220.00, it was $443.00"
            now_match = re.search(r'now\s+\$([0-9,]+\.?\d*)', text, re.IGNORECASE)
            was_match = re.search(r'was\s+\$([0-9,]+\.?\d*)', text, re.IGNORECASE)
            if now_match:
                price_now = float(now_match.group(1).replace(',', ''))
            if was_match:
                price_plain = float(was_match.group(1).replace(',', ''))
        else:
            # Pattern: "Price $399.00"
            price_match = re.search(r'Price\s+\$([0-9,]+\.?\d*)', text, re.IGNORECASE)
            if price_match:
                price_plain = float(price_match.group(1).replace(',', ''))
    
    # Determine final price and was price
    candidates = [p for p in (price_plain, price_now) if p is not None]
    price = min(candidates) if candidates else None
    was = max(candidates) if candidates else None
    
    return title, price, was, link, product_id

def apply_offer_discount(price_plain, price_now, offer_text):
    if not offer_text:
        return None
    
    discount = get_num(offer_text)
    if not discount:
        return None
    
    price_offer = None
    
    if offer_text.startswith("EXTRA") and "%" in offer_text:
        if price_now:
            price_offer = round(price_now * (1 - discount / 100), 2)
        elif price_plain:
            price_offer = round(price_plain * (1 - discount / 100), 2)
    elif offer_text.startswith("SAVE") and "%" in offer_text:
        if price_plain:
            price_offer = round(price_plain * (1 - discount / 100), 2)
    elif offer_text.startswith("SAVE") and "$" in offer_text:
        if price_plain:
            price_offer = round(price_plain - discount, 2)
    elif offer_text.startswith("BUY"):
        m_buy = re.search(r'\bBUY\s+(\d+)\s+FOR\s*\$?\s*([0-9]+(?:\.[0-9]{1,2})?)\b', offer_text)
        if m_buy:
            qty = int(m_buy.group(1))
            total = float(m_buy.group(2))
            if qty > 0:
                price_offer = round(total / qty, 2)
    elif "GIFT CARD" in offer_text:
        if price_plain >= 600:
            price_offer = price_plain-150
        elif price_plain >= 300:
            price_offer = price_plain-50
        elif price_plain >= 150:
            price_offer = price_plain-20
    
    return price_offer

def scrape_davidjones(brand: str = "misha"):
    start_url = f"https://www.davidjones.com/brand/{brand}"

    rows = []
    page = 1

    while True:
        url = _turn_page(start_url, page)
        try:
            soup, _ = http_request(url)
        except Exception as e:
            break
        
        items = soup.select("ul#products-grid > li")
        if not items:
            break
        
        # Collect product info and IDs
        products_data = []
        product_ids = []
        
        for item in items:
            title, price, was, link, product_id = get_product_info(item)
            if title:
                products_data.append({
                    "title": title,
                    "price": price,
                    "was": was,
                    "link": link,
                    "product_id": product_id
                })
                if product_id:
                    product_ids.append(product_id)
        
        # Fetch special offers using http_request
        offers_map = {}
        try:
            offers_url = "https://www.davidjones.com/routes/special-offers"
            offers_data, _ = http_request(offers_url, method='POST', json_data={"ids": product_ids})
            
            for offer in offers_data:
                offer_id = offer.get("id")
                short_desc = offer.get("shortDescription", "")
                if offer_id and short_desc:
                    offers_map[offer_id] = short_desc
        except Exception as e:
            print(f"[WARN] Failed to fetch special offers: {e}")
        
        # Apply offers and finalize prices
        for product in products_data:
            product_id = product.get("product_id")
            offer_text = offers_map.get(product_id, "") if product_id else ""
            
            if offer_text:
                # Apply offer discount
                price_offer = apply_offer_discount(product["was"] or product["price"], 
                                                   product["price"], 
                                                   offer_text)
                if price_offer:
                    # Update price with offer discount
                    candidates = [p for p in (product["was"] or product["price"], product["price"], price_offer) if p is not None]
                    product["price"] = min(candidates)
                    product["was"] = max(candidates)
            
            # Remove product_id from final output
            rows.append({
                "title": product["title"],
                "price": product["price"],
                "was": product["was"],
                "link": product["link"]
            })

        page += 1

    df = pd.DataFrame(rows, columns=["title", "price", "was", "link"])
    return df

scrape_davidjones("misha")

Unnamed: 0,title,price,was,link
0,Misha THE 'SISAL' MESH DRESS,266.0,380.0,https://www.davidjones.com/product/misha-the-s...
1,Misha THE 'LUSI' CREPE DRESS,266.0,380.0,https://www.davidjones.com/product/misha-the-l...
2,Misha THE 'CAMERON' SATIN DRESS,378.0,540.0,https://www.davidjones.com/product/misha-the-c...
3,Misha THE 'GISELE' JERSEY DRESS,266.0,380.0,https://www.davidjones.com/product/misha-the-g...
4,Misha GABRIETTE JERSEY GOWN,249.0,360.0,https://www.davidjones.com/product/misha-gabri...
...,...,...,...,...
80,Misha Twisted One Shoulder Midi Dress Vermilli...,125.3,279.0,https://www.davidjones.com/product/misha-twist...
81,Misha MONIKA TOP,83.3,200.0,https://www.davidjones.com/product/misha-monik...
82,Misha TILLIE MESH MINI DRESS,125.3,300.0,https://www.davidjones.com/product/misha-tilli...
83,Misha ROSY SATIN TOP,90.3,179.0,https://www.davidjones.com/product/misha-rosy-...


# compare

In [22]:


def compare_price(df_dj: pd.DataFrame, df_pm: pd.DataFrame, sim_thresh: float = 0.8) -> pd.DataFrame:
    dj = df_dj.copy()
    pm = df_pm.copy()

    for df in (dj, pm):
        # Standardize expected columns presence
        if "title" not in df.columns or "price" not in df.columns or "link" not in df.columns:
            raise ValueError("Input DataFrames must have columns: title, price, link")
        df["__tokens"] = df["title"].map(_tokens)

    # Greedy best-match per DJ item (no pm reuse)
    used_pm_idx = set()
    matches = []

    for _, row_dj in dj.iterrows():
        best = (-1.0, None)  # (sim, pm_idx)
        tok_dj = row_dj["__tokens"]
        if not tok_dj:
            continue

        for i_pm, row_pm in pm.iterrows():
            if i_pm in used_pm_idx:
                continue
            sim = _jaccard(tok_dj, row_pm["__tokens"])
            if sim > best[0]:
                best = (sim, i_pm)

        sim, i_pm_best = best
        if i_pm_best is not None and sim >= sim_thresh:
            used_pm_idx.add(i_pm_best)
            row_pm = pm.loc[i_pm_best]

            price_dj = row_dj["price"]
            price_pm = row_pm["price"]
            # Skip if price missing or non-numeric
            try:
                price_dj = float(price_dj)
                price_pm = float(price_pm)
            except Exception:
                continue

            matches.append({
                "title_dj": row_dj["title"],
                "title_pm": row_pm["title"],
                "price_diff": price_dj - price_pm,
                "price_dj": price_dj,
                "price_pm": price_pm,
                "link_dj": row_dj["link"],
                "link_pm": row_pm["link"],
            })

    compare_out = pd.DataFrame(
        matches,
        columns=["title_dj", "title_pm", "price_diff", "price_dj", "price_pm", "link_dj", "link_pm"]
    ).sort_values("price_diff", ascending=False).reset_index(drop=True)

    return compare_out

## run by brand

In [25]:
# misha
compare_out = compare_price(scrape_davidjones("misha"), scrape_iconic("misha-collection"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

                    title_dj                   title_pm  price_diff  price_dj  price_pm                                                                                             link_dj                                                        link_pm
     Misha ABELIA MESH DRESS    MISHA Abelia Mesh Dress       102.6     285.0     182.4    https://www.davidjones.com/product/misha-abelia-mesh-dress-27495399?navId=953674&colorId=2903330    https://www.theiconic.com.au/abelia-mesh-dress-2504686.html
    Misha MAYBEL JERSEY GOWN   MISHA Maybel Jersey Gown        72.0     270.0     198.0   https://www.davidjones.com/product/misha-maybel-jersey-gown-27373212?navId=953674&colorId=2878311   https://www.theiconic.com.au/maybel-jersey-gown-2504658.html
Misha THE 'SISAL' MESH DRESS MISHA The Sisal Mesh Dress        57.0     285.0     228.0 https://www.davidjones.com/product/misha-the-sisal-mesh-dress-27940871?navId=953674&colorId=2994526 https://www.theiconic.com.au/the-sisal-mesh-dress-2562774.h

In [277]:
# Lioness
compare_out = compare_price(scrape_davidjones("lioness"), scrape_iconic("lioness"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

                       title_dj                        title_pm  price_diff  price_dj  price_pm                                                                                link_dj                                                           link_pm
    Lioness MIDNIGHT MAXI DRESS     Lioness Midnight Maxi Dress       59.00      89.0     30.00     https://www.davidjones.com/product/lioness-midnight-maxi-dress-27611495?nav=958119     https://www.theiconic.com.au/midnight-maxi-dress-2591496.html
      Lioness REBELS MINI DRESS       Lioness Rebels Mini Dress       43.45      55.3     11.85       https://www.davidjones.com/product/lioness-rebels-mini-dress-27551025?nav=958119       https://www.theiconic.com.au/rebels-mini-dress-2564638.html
           Lioness ENDLESS COAT            Lioness Endless Coat       36.45      99.0     62.55            https://www.davidjones.com/product/lioness-endless-coat-27467024?nav=958119            https://www.theiconic.com.au/endless-coat-2628204.html
Lion

In [278]:
# Calvin Klein
compare_out = compare_price(scrape_davidjones("calvin-klein"), scrape_iconic("calvin-klein"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

Empty DataFrame
Columns: [title_dj, title_pm, price_diff, price_dj, price_pm, link_dj, link_pm]
Index: []


In [279]:
# Assembly Label
compare_out = compare_price(scrape_davidjones("assembly-label"), scrape_iconic("assembly"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

                                title_dj                                 title_pm  price_diff  price_dj  price_pm                                                                                         link_dj                                                             link_pm
       ASSEMBLY LABEL Osanna Knit Jumper        Assembly Label Osanna Knit Jumper       50.00     140.0     90.00        https://www.davidjones.com/product/assembly-label-osanna-knit-jumper-27698827?nav=943876        https://www.theiconic.com.au/osanna-knit-jumper-2582738.html
       ASSEMBLY LABEL Ryder Fleece Sweat        Assembly Label Ryder Fleece Sweat       47.50     100.0     52.50        https://www.davidjones.com/product/assembly-label-ryder-fleece-sweat-27482640?nav=943876        https://www.theiconic.com.au/ryder-fleece-sweat-2515352.html
           ASSEMBLY LABEL Blair Car Coat            Assembly Label Blair Car Coat       46.25     140.0     93.75            https://www.davidjones.com/product/assemb

In [None]:
# CAMILLA AND MARC
compare_out = compare_price(scrape_davidjones("camilla-and-marc"), scrape_iconic("camilla-marc"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

                                       title_dj                                        title_pm  price_diff  price_dj  price_pm                                                                                               link_dj                                                                  link_pm
Camilla and Marc Ottilie 2.0 Short Sleeve Shirt CAMILLA AND MARC Ottilie 2.0 Short Sleeve Shirt         0.0     350.0     350.0 https://www.davidjones.com/product/camilla-and-marc-ottilie-20-short-sleeve-shirt-27711441?nav=884776 https://www.theiconic.com.au/ottilie-2-0-short-sleeve-shirt-2665932.html
                  Camilla and Marc Lolani Dress                   CAMILLA AND MARC Lolani Dress         0.0     550.0     550.0                  https://www.davidjones.com/product/camilla-and-marc-lolani-dress-27711447?nav=884776                   https://www.theiconic.com.au/lolani-dress-2665945.html
                    Camilla and Marc Astor Vest                     CAMILLA AND MARC Astor 

In [281]:
# Country Road
compare_out = compare_price(scrape_davidjones("country-road"), scrape_iconic("country-road"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

                                                         title_dj                                                          title_pm  price_diff  price_dj  price_pm                                                                                                                  link_dj                                                                                        link_pm
         Country Road Organically Grown Cotton T-Shirt Logo Dress          Country Road Organically Grown Cotton T-shirt Logo Dress        10.0     69.95     59.95          https://www.davidjones.com/product/country-road-organically-grown-cotton-t-shirt-logo-dress-27655169?nav=883808          https://www.theiconic.com.au/organically-grown-cotton-t-shirt-logo-dress-2675340.html
Country Road Organically Grown Cotton Country Road Pocket T-Shirt Country Road Organically Grown Cotton Country Road Pocket T-shirt         5.0     39.95     34.95 https://www.davidjones.com/product/country-road-organically-grown-cotton-cou

In [282]:
# P.E Nation
compare_out = compare_price(scrape_davidjones("pe-nation"), scrape_iconic("pe-nation"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

                         title_dj                          title_pm  price_diff  price_dj  price_pm                                                                                 link_dj                                                          link_pm
 P.E Nation Adventure Zip Through  P.E Nation Adventure Zip Through       63.60     159.0     95.40  https://www.davidjones.com/product/pe-nation-adventure-zip-through-27944155?nav=895781  https://www.theiconic.com.au/adventure-zip-through-2566515.html
        P.E Nation Shelter Jacket         P.E Nation Shelter Jacket       62.25     249.0    186.75         https://www.davidjones.com/product/pe-nation-shelter-jacket-27781000?nav=895781         https://www.theiconic.com.au/shelter-jacket-2596655.html
            P.E Nation Vita Sweat             P.E Nation Vita Sweat       57.60     129.0     71.40             https://www.davidjones.com/product/pe-nation-vita-sweat-27716426?nav=895781             https://www.theiconic.com.au/vita-sweat-2

In [283]:
# Marc Jacobs
compare_out = compare_price(scrape_davidjones("marc-jacobs"), scrape_iconic("the-marc-jacobs"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

                                title_dj                                 title_pm  price_diff  price_dj  price_pm                                                                                         link_dj                                                                link_pm
          Marc Jacobs The Large Tote Bag           Marc Jacobs The Large Tote Bag         NaN    348.75       NaN           https://www.davidjones.com/product/marc-jacobs-the-large-tote-bag-23533443?nav=949775           https://www.theiconic.com.au/the-large-tote-bag-2662966.html
 Marc Jacobs The Leather Medium Tote Bag  Marc Jacobs The Leather Medium Tote Bag         NaN    693.75       NaN  https://www.davidjones.com/product/marc-jacobs-the-leather-medium-tote-bag-24460791?nav=949775  https://www.theiconic.com.au/the-leather-medium-tote-bag-1537602.html
Marc Jacobs The Jacquard Medium Tote Bag Marc Jacobs The Jacquard Medium Tote Bag         NaN    502.50       NaN https://www.davidjones.com/product/marc-jac

In [None]:
# Tommy Hilfiger
compare_out = compare_price(scrape_davidjones("tommy-hilfiger"), scrape_iconic("tommy-hilfiger"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

In [None]:
# GUESS
compare_out = compare_price(scrape_davidjones("guess"), scrape_iconic("guess"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

Empty DataFrame
Columns: [title_dj, title_pm, price_diff, price_dj, price_pm, link_dj, link_pm]
Index: []


# search by sales

In [23]:
def compare_search(pm, sim_thresh=0.9):
    matches = []
    SEARCH_BASE = "https://www.davidjones.com/search?q="

    pms = pm.to_dict("records")
    total = len(pms)
    for idx, i in enumerate(pms, 1):
        print(f"*{idx}/{total}", flush=True)
        title_pm = i["title"]
        search_url = f"{SEARCH_BASE}{quote(title_pm)}"

        # Retry until we get SSR variant
        try:
            soup, _ = http_request(search_url)
        except Exception as e:
            print("[ERR] ", e)
            print("      title: ", title_pm)
            continue
        
        # NEW: Select items from ul#products-grid > li
        items = soup.select("ul#products-grid > li")
        if not items:
            continue

        item = items[0]
        title, price, was, link, product_id = get_product_info(item)
        
        try:
            offers_url = "https://www.davidjones.com/routes/special-offers"
            offers_data, _ = http_request(offers_url, method='POST', json_data={"ids": [product_id]})
            
            offer_text = ""
            for offer in offers_data:
                if offer.get("id") == product_id:
                    offer_text = offer.get("shortDescription", "")
                    break
            
            if offer_text:
                # Apply offer discount
                price_offer = apply_offer_discount(was or price, price, offer_text)
                if price_offer:
                    candidates = [p for p in (was or price, price, price_offer) if p is not None]
                    price = min(candidates)
                    was = max(candidates)
        except Exception as e:
            print(f"[WARN] Failed to fetch special offer for {product_id}: {e}")

        if was == i["was"] and _jaccard(_tokens(title), _tokens(title_pm)) >= sim_thresh:

            price_pm = i["price"]
            price_diff = price - price_pm
            link_pm = i["link"]

            if price_diff > 100:
                matches.append({
                    "title_dj": title,
                    "title_pm": title_pm,
                    "price_diff": price_diff,
                    "price_dj": price,
                    "price_pm": price_pm,
                    "link_dj": link,
                    "link_pm": link_pm,
                })
            if price_diff > 250:
                print(f"[***]\n{title}\n{price} - {price_pm} = {price_diff}\ndj: {link}\npm: {link_pm}")
            elif price_diff > 200:
                print(f"[**]\n{title}\n{price} - {price_pm} = {price_diff}\ndj: {link}\npm: {link_pm}")
            elif price_diff > 150:
                print(f"[*]\n{title}\n{price} - {price_pm} = {price_diff}\ndj: {link}\npm: {link_pm}")

    compare_out = pd.DataFrame(
        matches,
        columns=["title_dj", "title_pm", "price_diff", "price_dj", "price_pm", "link_dj", "link_pm"]
    ).sort_values("price_diff", ascending=False).reset_index(drop=True)

    return compare_out

In [19]:
ic = scrape_iconic("womens-sale", threshold=200)
print(ic.to_string(index=False, line_width=None))

                                                                   title  price     was   diff                                                                                                  link
                                           Solace London Neva Mini Dress 161.00 1150.00 989.00                                             https://www.theiconic.com.au/neva-mini-dress-2230375.html
                Proenza Schouler White Label Lucia Dress In Souffle Knit 258.30 1230.00 971.70                                 https://www.theiconic.com.au/lucia-dress-in-souffle-knit-2436880.html
              Proenza Schouler White Label Lato Wrap Dress In Linen Slub 311.50 1265.00 953.50                               https://www.theiconic.com.au/lato-wrap-dress-in-linen-slub-2511872.html
                                           Perfect Moment Helen Ski Suit 864.00 1800.00 936.00                                              https://www.theiconic.com.au/helen-ski-suit-2354830.html
               

In [None]:
out = compare_search(ic)
print(out.head(10).to_string(index=False, line_width=None))

*1/1722
*2/1722
*3/1722
*4/1722
*5/1722
*6/1722
*7/1722
*8/1722
*9/1722
*10/1722
*11/1722
*12/1722
*13/1722
*14/1722
*15/1722
*16/1722
*17/1722
*18/1722
*19/1722
*20/1722
*21/1722
*22/1722
*23/1722
*24/1722
*25/1722
*26/1722
*27/1722
*28/1722
*29/1722
*30/1722
*31/1722
*32/1722
[***]
Rachel Gilbert CAMILLE GOWN
1295.0 - 634.2 = 660.8
dj: https://www.davidjones.com/product/rachel-gilbert-camille-gown-27753352?colorId=2955513
pm: https://www.theiconic.com.au/camille-gown-2635576.html
*33/1722
*34/1722
*35/1722
*36/1722
*37/1722
*38/1722
*39/1722
*40/1722
*41/1722
*42/1722
*43/1722
*44/1722
[**]
Camilla and Marc Sullivan Blazer
569.0 - 342.0 = 227.0
dj: https://www.davidjones.com/product/camilla-and-marc-sullivan-blazer-27710787?colorId=2947692
pm: https://www.theiconic.com.au/sullivan-blazer-2588489.html
*45/1722


In [None]:
vw = scrape_vw(threshold=200)
out = compare_search(vw)
print(out.head(10).to_string(index=False, line_width=None))

*1/120
*2/120
*3/120
*4/120
*5/120
*6/120
*7/120
*8/120
*9/120
*10/120
*11/120
*12/120
*13/120
*14/120
*15/120
*16/120
*17/120
*18/120
*19/120
*20/120
*21/120
*22/120
*23/120
*24/120
*25/120
*26/120
*27/120
*28/120
*29/120
*30/120
*31/120
*32/120
*33/120
*34/120
*35/120
*36/120
*37/120
*38/120
*39/120
*40/120
*41/120
*42/120
*43/120
*44/120
*45/120
*46/120
*47/120
*48/120
*49/120
*50/120
*51/120
*52/120
*53/120
*54/120
*55/120
*56/120
*57/120
*58/120
*59/120
*60/120
*61/120
*62/120
*63/120
*64/120
*65/120
*66/120
*67/120
*68/120
*69/120
*70/120
*71/120
*72/120
*73/120
*74/120
*75/120
*76/120
*77/120
*78/120
*79/120
*80/120
*81/120
*82/120
*83/120
*84/120
*85/120
*86/120
*87/120
*88/120
*89/120
*90/120
*91/120
*92/120
*93/120
*94/120
*95/120
*96/120
*97/120
*98/120
*99/120
*100/120
*101/120
*102/120
*103/120
*104/120
*105/120
*106/120
*107/120
*108/120
*109/120
*110/120
*111/120
*112/120
*113/120
*114/120
*115/120
*116/120
*117/120
*118/120
*119/120
*120/120
Empty DataFrame
Columns: [ti