In [1]:

from urllib.parse import urljoin, urlencode, urlparse, parse_qsl, urlunparse
import pandas as pd
import re, time, requests, json
from typing import List, Dict, Tuple, Optional, Any
import warnings
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning)
import time
from urllib.parse import quote_plus


# http

In [2]:
CHROME_HEADERS = { # macOS Chrome UA
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/129.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    # "Connection": "keep-alive",
}

def http_request(url, base_headers = CHROME_HEADERS, 
                 timeout = (5, 15), retries: int = 2, sleep: float = 8):
    """
    """

    # meta-charset sniffing for HTML/XML
    _META_CHARSET_RE = re.compile(
        br"""(?ix)
        (?:<meta[^>]+charset=["']?\s*([a-z0-9_\-]+)\s*["']?[^>]*>)|
        (?:<meta[^>]+http-equiv=["']?content-type["']?[^>]+content=["'][^"']*;\s*charset=([a-z0-9_\-]+)[^"']*["'][^>]*>)|
        (?:^<\?xml[^>]*encoding=["']\s*([a-z0-9_\-]+)\s*["'])
        """
    )
    def _sniff_charset(raw_head: bytes) -> Optional[str]:
        m = _META_CHARSET_RE.search(raw_head)
        if not m:
            return None
        enc = next((g for g in m.groups() if g), None)
        if not enc:
            return None
        enc = enc.decode("ascii", "ignore").lower()
        if enc in ("gbk", "cp936"):
            return "gb18030"
        if enc == "utf8":
            return "utf-8"
        return enc

    def _attempt(verify: bool) -> Tuple[str, Dict[str, str]]:
        def auto_parse(text):
            try:
                if text.lstrip("\ufeff \t\r\n").startswith(("{", "[")):
                    return json.loads(text)
                if "html" in text[:1000].lower():
                    return BeautifulSoup(text, "html.parser")
                    # return BeautifulSoup(text, "lxml")
                else:
                    return text
            except Exception as e:
                print(f'[ERROR] auto_parse (json or html) failed')


        r = requests.get(url, headers=base_headers, timeout=timeout, verify=verify)

        r.raise_for_status() # raises on 400–599 -> requests.HTTPError

        heads = {k.lower(): v for k, v in r.headers.items()}
        ct = heads.get("content-type", "")

        # JSON is UTF-8 by spec; don't overthink it unless server lies badly
        if "application/json" in ct:
            r.encoding = r.encoding or "utf-8"
            return auto_parse(r.text), heads

        # otherwise (HTML/...), choose best encoding BEFORE using .text
        head_bytes = r.content[:32768]
        sniff = _sniff_charset(head_bytes)
        if sniff:
            r.encoding = sniff
        else:
            if not r.encoding or r.encoding.lower() in ("iso-8859-1", "ascii"):
                if getattr(r, "apparent_encoding", None):
                    r.encoding = r.apparent_encoding
                else:
                    r.encoding = "utf-8"

            if r.encoding.lower().startswith("utf-8"):
                hb = head_bytes.lower()
                if b"charset=gb" in hb or b"charset = gb" in hb:
                    r.encoding = "gb18030"

        return auto_parse(r.text), heads

    for _ in range(max(1, retries)):
        try:
            return _attempt(verify=True)
        except requests.exceptions.SSLError as e:
            # try with SSL verification disabled (public pages only!)
            try:
                return _attempt(verify=False)
            except Exception as e:
                pass
        except Exception as e:
            pass
        # retry
        time.sleep(sleep)

    # raise last_err or RuntimeError(f'request failed for {url}')
    raise RuntimeError(f'request failed for {url}')

# scrapers

## iconic

In [384]:

def get_num(text: str):
    try:
        m = re.compile(r"-?\d{1,3}(?:,\d{3})*(?:\.\d+)?|-?\d+(?:\.\d+)?").search(text)
        num = m.group(0).replace(",", "")
        return float(num)
    except Exception:
        return None

def _turn_page(url: str, page: int) -> str:
    if page <= 1:
        return url
    u = urlparse(url)
    q = dict(parse_qsl(u.query, keep_blank_values=True))
    q["page"] = str(page)
    new_q = urlencode(q)
    return urlunparse((u.scheme, u.netloc, u.path, u.params, new_q, u.fragment))

def scrape_iconic(brand="misha-collection", threshold=0):
    start_url = f"https://www.theiconic.com.au/{brand}"

    rows = []
    page = 1

    while True:
        url = _turn_page(start_url, page)
        soup, _ = http_request(url)

        items = soup.select("a.product-details")
        if not items:
            break

        for a in items:

            price_final = get_num(a.select_one("span.price.final").get_text(" ", strip=True) 
                                       if a.select_one("span.price.final") else None)
            price_original = get_num(a.select_one("span.price.original").get_text(" ", strip=True) 
                                          if a.select_one("span.price.original") else None)
            # price_plain = get_num(a.select_one("span.price").get_text(" ", strip=True) 
            #                            if a.select_one("span.price") else None)
            
            if price_final and price_original:
                price_diff = price_original-price_final
                if price_diff>=threshold:

                    brand = (a.select_one("span.brand").get_text(strip=True)
                            if a.select_one("span.brand") else "").strip()
                    name = (a.select_one("span.name").get_text(strip=True)
                            if a.select_one("span.name") else "").strip()
                    title = (f"{brand} {name}").strip()
                    
                    href = a.get("href", "").strip()
                    link = urljoin("https://www.theiconic.com.au", href) if href else ""

                    # print(title, " ", price, " ", link)
                    rows.append({"title": title, "price": price_final, "was": price_original, "diff": price_diff, "link": link})

        page += 1

    df = pd.DataFrame(
        rows, columns=["title", "price", "was", "diff", "link"]
    ).sort_values("diff", ascending=False).reset_index(drop=True)

    return df

scrape_iconic("misha-collection")


Unnamed: 0,title,price,was,diff,link
0,MISHA Aubree Cotton Sateen Midi Dress - ICONIC...,72.0,360.0,288.0,https://www.theiconic.com.au/aubree-cotton-sat...
1,MISHA The Kendall LS Jersey Dress,158.4,440.0,281.6,https://www.theiconic.com.au/the-kendall-ls-je...
2,MISHA Elania Satin Midi Dress,67.8,339.0,271.2,https://www.theiconic.com.au/elania-satin-midi...
3,MISHA Emmie Backless Midi Dress,67.8,339.0,271.2,https://www.theiconic.com.au/emmie-backless-mi...
4,MISHA Kimora Satin & Lace Maxi Dress - ICONIC ...,114.0,380.0,266.0,https://www.theiconic.com.au/kimora-satin-lace...
...,...,...,...,...,...
87,MISHA Aspen Suiting Dress,288.0,360.0,72.0,https://www.theiconic.com.au/aspen-suiting-dre...
88,MISHA Marcie Suiting Mini Dress,272.0,340.0,68.0,https://www.theiconic.com.au/marcie-suiting-mi...
89,MISHA Chroma Two-Tone Dress,272.0,340.0,68.0,https://www.theiconic.com.au/chroma-two-tone-d...
90,MISHA Caspian Corset Top,208.0,260.0,52.0,https://www.theiconic.com.au/caspian-corset-to...


## VIKTORIA & WOODS

In [385]:

def scrape_vw(start_url="https://gqj2zz.a.searchspring.io/api/search/search.json?domain=https%3A%2F%2Fviktoriaandwoods.com.au%2Fcollections%2Fshop-sale&resultsFormat=native&redirectResponse=full&bgfilter.collection_handle=shop-sale&resultsPerPage=24&siteId=gqj2zz", threshold=0):

    rows = []
    page = 1

    while True:
        url = _turn_page(start_url, page)
        json, _ = http_request(url)

        items = json.get("results")
        if not items:
            break

        for i in items:
            # print(i.get("name"))

            price = float(i.get("price"))
            was = float(i.get("msrp") or 0)
            diff = was-price

            if diff>=threshold:

                brand = "Viktoria & Woods"
                name = (i.get("name") if i.get("name") else "").strip()
                title = (f"{brand} {name}").strip()

                href = i.get("handle")
                link = urljoin("https://viktoriaandwoods.com.au/products/", href) if href else ""
                
                rows.append({"title": title, "price": price, "was": was, "diff": diff, "link": link})

        page += 1

    df = pd.DataFrame(
        rows, columns=["title", "price", "was", "diff", "link"]
    ).sort_values("diff", ascending=False).reset_index(drop=True)    
    return df

print(scrape_vw().to_string(index=False, line_width=None))

                                          title  price    was  diff                                                                                        link
           Viktoria & Woods Camaro Leather Pant  499.0 1400.0 901.0                      https://viktoriaandwoods.com.au/products/camaro-leather-pant-chocolate
                 Viktoria & Woods Elitist Skirt  399.0 1100.0 701.0                                https://viktoriaandwoods.com.au/products/elitist-skirt-black
          Viktoria & Woods Opioid Leather Skirt  699.0 1200.0 501.0                       https://viktoriaandwoods.com.au/products/opioid-leather-skirt-sangria
            Viktoria & Woods Brotherhood Trench  249.0  750.0 501.0                    https://viktoriaandwoods.com.au/products/brotherhood-trench-smoke-grey-1
               Viktoria & Woods Impulsive Dress  299.0  790.0 491.0                              https://viktoriaandwoods.com.au/products/impulsive-dress-black
                Viktoria & Woods America

## dj

In [None]:

def _turn_page_dj(base_url: str, size: int, offset: int) -> str:
    u = urlparse(base_url)
    q = dict(parse_qsl(u.query, keep_blank_values=True))
    q["size"] = str(size)
    q["offset"] = str(offset)
    return urlunparse((u.scheme, u.netloc, u.path, u.params, urlencode(q), u.fragment))

def _is_ssr_variant(soup, heads):
    # Easiest & cheapest: header tells you the route
    xmp = (heads or {}).get("x-matched-path", "")
    if xmp == "/api/proxy":
        return True
    if xmp.startswith("/[locale]/"):
        return False
    # HTML heuristics: SSR variant has products markup up-front
    if isinstance(soup, BeautifulSoup):
        if soup.select_one("div.item-detail"):
            return True
        # Next shell usually has Next.js markers but no .item-detail yet
        if soup.select_one('script#__NEXT_DATA__'):
            return False
    return False

def get_product_info(item):
    brand = (item.select_one("div.item-brand").get_text(strip=True)
            if item.select_one("div.item-brand") else "").strip()
    a = item.select_one("a[href]")
    name = a.get_text(strip=True) if a else ""
    title = (f"{brand} {name}").strip()
    link = a.get("href", "").strip() if a else ""

    price_plain = get_num(item.select_one("p.price").get_text(" ", strip=True) 
                                if item.select_one("p.price") else None)
    price_now = get_num(item.select_one("p.price.now").get_text(" ", strip=True) 
                                if item.select_one("p.price.now") else None)
    
    price_offer = None
    offer = item.select_one("p.offer").get_text(" ", strip=True) if item.select_one("p.offer") else None
    if offer:
        # print(offer)
        discount = get_num(offer)
        # print(discount)
        if offer.startswith("EXTRA") and "%" in offer:
            if price_now:
                price_offer = round(price_now * (1-discount/100), 2)
            elif price_plain:
                price_offer = round(price_plain * (1-discount/100), 2)
        elif offer.startswith("SAVE") and "%" in offer:
            if price_plain:
                price_offer = round(price_plain * (1-discount/100), 2)
        # 2) NEW: SAVE $n  (e.g., "SAVE $50" -> price_plain - 50)
        elif offer.startswith("SAVE") and "$" in offer:
            price_offer = round(price_plain - discount, 2)
        # 3) NEW: BUY x FOR $y  (e.g., "BUY 2 FOR $99" -> 99/2)
        elif offer.startswith("BUY"):
            m_buy = re.search(r'\bBUY\s+(\d+)\s+FOR\s*\$?\s*([0-9]+(?:\.[0-9]{1,2})?)\b', offer)
            if m_buy:
                qty = int(m_buy.group(1))
                total = float(m_buy.group(2))
                if qty > 0:
                    price_offer = round(total / qty, 2)

    candidates = [p for p in (price_plain, price_now, price_offer) if p is not None]
    price = min(candidates) if candidates else None
    was = max(candidates) if candidates else None

    return title, price, was, link

def scrape_davidjones(brand: str = "misha"):
    start_url = f"https://www.davidjones.com/brand/{brand}?src=fh&size=90&offset=0"

    rows = []
    page_size = 90

    for i in range(99):
        offset = i * page_size
        url = _turn_page_dj(start_url, page_size, offset)
        # print(url)
        while(True):
            soup, heads = http_request(url)
            if _is_ssr_variant(soup, heads):
                break
            time.sleep(1)

        items = soup.select("div.item-detail")
        if not items:
            break
        
        for item in items:
            # print(item)
            title, price, was, link = get_product_info(item)

            if title:
                # print(title, " ", price, " ", link)
                rows.append({"title": title, "price": price, "was": was, "link": link})

    df = pd.DataFrame(rows, columns=["title", "price", "was", "link"])
    return df

scrape_davidjones("misha")

# compare

In [None]:
def _tokens(title: str):
    if not isinstance(title, str):
        return set()
    return set(re.compile(r"[a-z0-9]+").findall(title.lower()))

def _jaccard(a: set, b: set) -> float:
    if not a or not b:
        return 0.0
    inter = len(a & b)
    union = len(a | b)
    return inter / union if union else 0.0

def compare_price(df_dj: pd.DataFrame, df_pm: pd.DataFrame, sim_thresh: float = 0.8) -> pd.DataFrame:
    dj = df_dj.copy()
    pm = df_pm.copy()

    for df in (dj, pm):
        # Standardize expected columns presence
        if "title" not in df.columns or "price" not in df.columns or "link" not in df.columns:
            raise ValueError("Input DataFrames must have columns: title, price, link")
        df["__tokens"] = df["title"].map(_tokens)

    # Greedy best-match per DJ item (no pm reuse)
    used_pm_idx = set()
    matches = []

    for _, row_dj in dj.iterrows():
        best = (-1.0, None)  # (sim, pm_idx)
        tok_dj = row_dj["__tokens"]
        if not tok_dj:
            continue

        for i_pm, row_pm in pm.iterrows():
            if i_pm in used_pm_idx:
                continue
            sim = _jaccard(tok_dj, row_pm["__tokens"])
            if sim > best[0]:
                best = (sim, i_pm)

        sim, i_pm_best = best
        if i_pm_best is not None and sim >= sim_thresh:
            used_pm_idx.add(i_pm_best)
            row_pm = pm.loc[i_pm_best]

            price_dj = row_dj["price"]
            price_pm = row_pm["price"]
            # Skip if price missing or non-numeric
            try:
                price_dj = float(price_dj)
                price_pm = float(price_pm)
            except Exception:
                continue

            matches.append({
                "title_dj": row_dj["title"],
                "title_pm": row_pm["title"],
                "price_diff": price_dj - price_pm,
                "price_dj": price_dj,
                "price_pm": price_pm,
                "link_dj": row_dj["link"],
                "link_pm": row_pm["link"],
            })

    compare_out = pd.DataFrame(
        matches,
        columns=["title_dj", "title_pm", "price_diff", "price_dj", "price_pm", "link_dj", "link_pm"]
    ).sort_values("price_diff", ascending=False).reset_index(drop=True)

    return compare_out

## run by brand

In [None]:
# misha
compare_out = compare_price(scrape_davidjones("misha"), scrape_iconic("misha-collection"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

In [277]:
# Lioness
compare_out = compare_price(scrape_davidjones("lioness"), scrape_iconic("lioness"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

                       title_dj                        title_pm  price_diff  price_dj  price_pm                                                                                link_dj                                                           link_pm
    Lioness MIDNIGHT MAXI DRESS     Lioness Midnight Maxi Dress       59.00      89.0     30.00     https://www.davidjones.com/product/lioness-midnight-maxi-dress-27611495?nav=958119     https://www.theiconic.com.au/midnight-maxi-dress-2591496.html
      Lioness REBELS MINI DRESS       Lioness Rebels Mini Dress       43.45      55.3     11.85       https://www.davidjones.com/product/lioness-rebels-mini-dress-27551025?nav=958119       https://www.theiconic.com.au/rebels-mini-dress-2564638.html
           Lioness ENDLESS COAT            Lioness Endless Coat       36.45      99.0     62.55            https://www.davidjones.com/product/lioness-endless-coat-27467024?nav=958119            https://www.theiconic.com.au/endless-coat-2628204.html
Lion

In [278]:
# Calvin Klein
compare_out = compare_price(scrape_davidjones("calvin-klein"), scrape_iconic("calvin-klein"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

Empty DataFrame
Columns: [title_dj, title_pm, price_diff, price_dj, price_pm, link_dj, link_pm]
Index: []


In [279]:
# Assembly Label
compare_out = compare_price(scrape_davidjones("assembly-label"), scrape_iconic("assembly"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

                                title_dj                                 title_pm  price_diff  price_dj  price_pm                                                                                         link_dj                                                             link_pm
       ASSEMBLY LABEL Osanna Knit Jumper        Assembly Label Osanna Knit Jumper       50.00     140.0     90.00        https://www.davidjones.com/product/assembly-label-osanna-knit-jumper-27698827?nav=943876        https://www.theiconic.com.au/osanna-knit-jumper-2582738.html
       ASSEMBLY LABEL Ryder Fleece Sweat        Assembly Label Ryder Fleece Sweat       47.50     100.0     52.50        https://www.davidjones.com/product/assembly-label-ryder-fleece-sweat-27482640?nav=943876        https://www.theiconic.com.au/ryder-fleece-sweat-2515352.html
           ASSEMBLY LABEL Blair Car Coat            Assembly Label Blair Car Coat       46.25     140.0     93.75            https://www.davidjones.com/product/assemb

In [None]:
# CAMILLA AND MARC
compare_out = compare_price(scrape_davidjones("camilla-and-marc"), scrape_iconic("camilla-marc"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

                                       title_dj                                        title_pm  price_diff  price_dj  price_pm                                                                                               link_dj                                                                  link_pm
Camilla and Marc Ottilie 2.0 Short Sleeve Shirt CAMILLA AND MARC Ottilie 2.0 Short Sleeve Shirt         0.0     350.0     350.0 https://www.davidjones.com/product/camilla-and-marc-ottilie-20-short-sleeve-shirt-27711441?nav=884776 https://www.theiconic.com.au/ottilie-2-0-short-sleeve-shirt-2665932.html
                  Camilla and Marc Lolani Dress                   CAMILLA AND MARC Lolani Dress         0.0     550.0     550.0                  https://www.davidjones.com/product/camilla-and-marc-lolani-dress-27711447?nav=884776                   https://www.theiconic.com.au/lolani-dress-2665945.html
                    Camilla and Marc Astor Vest                     CAMILLA AND MARC Astor 

In [281]:
# Country Road
compare_out = compare_price(scrape_davidjones("country-road"), scrape_iconic("country-road"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

                                                         title_dj                                                          title_pm  price_diff  price_dj  price_pm                                                                                                                  link_dj                                                                                        link_pm
         Country Road Organically Grown Cotton T-Shirt Logo Dress          Country Road Organically Grown Cotton T-shirt Logo Dress        10.0     69.95     59.95          https://www.davidjones.com/product/country-road-organically-grown-cotton-t-shirt-logo-dress-27655169?nav=883808          https://www.theiconic.com.au/organically-grown-cotton-t-shirt-logo-dress-2675340.html
Country Road Organically Grown Cotton Country Road Pocket T-Shirt Country Road Organically Grown Cotton Country Road Pocket T-shirt         5.0     39.95     34.95 https://www.davidjones.com/product/country-road-organically-grown-cotton-cou

In [282]:
# P.E Nation
compare_out = compare_price(scrape_davidjones("pe-nation"), scrape_iconic("pe-nation"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

                         title_dj                          title_pm  price_diff  price_dj  price_pm                                                                                 link_dj                                                          link_pm
 P.E Nation Adventure Zip Through  P.E Nation Adventure Zip Through       63.60     159.0     95.40  https://www.davidjones.com/product/pe-nation-adventure-zip-through-27944155?nav=895781  https://www.theiconic.com.au/adventure-zip-through-2566515.html
        P.E Nation Shelter Jacket         P.E Nation Shelter Jacket       62.25     249.0    186.75         https://www.davidjones.com/product/pe-nation-shelter-jacket-27781000?nav=895781         https://www.theiconic.com.au/shelter-jacket-2596655.html
            P.E Nation Vita Sweat             P.E Nation Vita Sweat       57.60     129.0     71.40             https://www.davidjones.com/product/pe-nation-vita-sweat-27716426?nav=895781             https://www.theiconic.com.au/vita-sweat-2

In [283]:
# Marc Jacobs
compare_out = compare_price(scrape_davidjones("marc-jacobs"), scrape_iconic("the-marc-jacobs"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

                                title_dj                                 title_pm  price_diff  price_dj  price_pm                                                                                         link_dj                                                                link_pm
          Marc Jacobs The Large Tote Bag           Marc Jacobs The Large Tote Bag         NaN    348.75       NaN           https://www.davidjones.com/product/marc-jacobs-the-large-tote-bag-23533443?nav=949775           https://www.theiconic.com.au/the-large-tote-bag-2662966.html
 Marc Jacobs The Leather Medium Tote Bag  Marc Jacobs The Leather Medium Tote Bag         NaN    693.75       NaN  https://www.davidjones.com/product/marc-jacobs-the-leather-medium-tote-bag-24460791?nav=949775  https://www.theiconic.com.au/the-leather-medium-tote-bag-1537602.html
Marc Jacobs The Jacquard Medium Tote Bag Marc Jacobs The Jacquard Medium Tote Bag         NaN    502.50       NaN https://www.davidjones.com/product/marc-jac

In [None]:
# Tommy Hilfiger
compare_out = compare_price(scrape_davidjones("tommy-hilfiger"), scrape_iconic("tommy-hilfiger"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

In [None]:
# GUESS
compare_out = compare_price(scrape_davidjones("guess"), scrape_iconic("guess"), sim_thresh=0.9)
print(compare_out.head(5).to_string(index=False, line_width=None))

Empty DataFrame
Columns: [title_dj, title_pm, price_diff, price_dj, price_pm, link_dj, link_pm]
Index: []


# search by sales

In [None]:

def compare_search(pm, sim_thresh=0.9):
    matches = []
    SEARCH_BASE = "https://www.davidjones.com/search?q="

    pms = pm.to_dict("records")
    total = len(pms)
    for idx, i in enumerate(pms, 1):
    # for idx, i in enumerate(reversed(pms), 1):
        print(f"*{idx}/{total}", flush=True)
        title_pm = i["title"]
        search_url = f"{SEARCH_BASE}{quote_plus(title_pm)}"

        while(True):
            try:
                soup, heads = http_request(search_url)
                items = soup.select("div.item-detail")
            except Exception as e:
                print("[ERR] ", e)
                print("      title: ", title_pm)
                continue
            if _is_ssr_variant(soup, heads):
                break
            time.sleep(1)

        if not items:
            continue

        item = items[0]
        title, price, was, link = get_product_info(item)

        if was == i["was"] and _jaccard(_tokens(title), _tokens(title_pm)) >= sim_thresh:

            price_pm = i["price"]
            price_diff = price - price_pm
            link_pm = i["link"]

            if price_diff>100:
                matches.append({
                    "title_dj": title,
                    "title_pm": title_pm,
                    "price_diff": price_diff,
                    "price_dj": price,
                    "price_pm": price_pm,
                    "link_dj": link,
                    "link_pm": link_pm,
                })
            if price_diff>250:
                print(f"[***]\n{title}\n{price} - {price_pm} = {price_diff}\ndj: {link}\npm: {link_pm}")
            elif price_diff>200:
                print(f"[**]\n{title}\n{price} - {price_pm} = {price_diff}\ndj: {link}\npm: {link_pm}")
            elif price_diff>150:
                print(f"[*]\n{title}\n{price} - {price_pm} = {price_diff}\ndj: {link}\npm: {link_pm}")

    compare_out = pd.DataFrame(
        matches,
        columns=["title_dj", "title_pm", "price_diff", "price_dj", "price_pm", "link_dj", "link_pm"]
    ).sort_values("price_diff", ascending=False).reset_index(drop=True)

    return compare_out

In [None]:
ic2 = scrape_iconic("womens-sale", threshold=200)
print(ic2.to_string(index=False, line_width=None))

                                                               title   price     was                                                                                                link    diff
                          Karen Walker Rock Garden Ring with Peridot  698.70 2329.00                             https://www.theiconic.com.au/rock-garden-ring-with-peridot-1283115.html 1630.30
                            Rebecca Vallance Madeline Strapless Gown  511.20 1599.00                                   https://www.theiconic.com.au/madeline-strapless-gown-1700249.html 1087.80
                                       Solace London Neva Mini Dress  184.00 1150.00                                           https://www.theiconic.com.au/neva-mini-dress-2230375.html  966.00
          Proenza Schouler White Label Lato Wrap Dress In Linen Slub  356.00 1265.00                             https://www.theiconic.com.au/lato-wrap-dress-in-linen-slub-2511872.html  909.00
                     Polo Ralph Lau

In [377]:
out = compare_search(ic2)
print(out.head(10).to_string(index=False, line_width=None))

*1/1588
*2/1588
*3/1588
*4/1588
*5/1588
*6/1588
*7/1588
*8/1588
*9/1588
*10/1588
*11/1588
*12/1588
*13/1588
*14/1588
*15/1588
*16/1588
*17/1588
*18/1588
*19/1588
*20/1588
*21/1588
*22/1588
*23/1588
*24/1588
*25/1588
*26/1588
*27/1588
*28/1588
*29/1588
*30/1588
*31/1588
*32/1588
*33/1588
*34/1588
*35/1588
*36/1588
*37/1588
*38/1588
*39/1588
*40/1588
*41/1588
*42/1588
*43/1588
*44/1588
*45/1588
*46/1588
*47/1588
*48/1588
*49/1588
*50/1588
*51/1588
*52/1588
*53/1588
*54/1588
*55/1588
*56/1588
*57/1588
*58/1588
*59/1588
*60/1588
*61/1588
*62/1588
*63/1588
*64/1588
*65/1588
*66/1588
*67/1588
*68/1588
*69/1588
*70/1588
*71/1588
*72/1588
*73/1588
*74/1588
*75/1588
*76/1588
*77/1588
*78/1588
*79/1588
*80/1588
*81/1588
*82/1588
*83/1588
*84/1588
*85/1588
*86/1588
*87/1588
*88/1588
*89/1588
*90/1588
*91/1588
*92/1588
*93/1588
*94/1588
*95/1588
*96/1588
*97/1588
*98/1588
*99/1588
*100/1588
*101/1588
*102/1588
*103/1588
*104/1588
*105/1588
*106/1588
*107/1588
*108/1588
*109/1588
*110/1588
*111/158

In [None]:
vw2 = scrape_vw(threshold=200)
out = compare_search(vw2)
print(out.head(10).to_string(index=False, line_width=None))