In [2]:
import json, re
from pathlib import Path

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

In [3]:
def load_json(path: str) -> list:
    """
    Load a JSON or JSON-Lines file and always return **a list**.
    """
    txt = Path(path).read_text(encoding="utf-8").strip()
    if not txt:
        return []
    # JSON-Lines → many objects, one per line
    if "\n" in txt and not txt.lstrip().startswith("["):
        return [json.loads(line) for line in txt.splitlines() if line.strip()]
    # Normal JSON (could be list or single obj)
    obj = json.loads(txt)
    return obj if isinstance(obj, list) else [obj]

# ---- Load customer search history ----
search_raw = load_json("search_history.json")

In [7]:
def extract_query_text(o):
    """Return a plain text query from a raw history record."""
    if isinstance(o, str):
        return o
    if isinstance(o, dict):
        for k in ["query", "title", "search_term", "text", "header"]:
            if k in o and isinstance(o[k], str) and o[k].strip():
                return o[k]
        return " ".join(str(v) for v in o.values() if isinstance(v, str))
    return str(o)

search_history = [extract_query_text(x) for x in search_raw]
search_df = pd.DataFrame({"query": search_history})
search_df.head(10)

Unnamed: 0,query
0,Visited https://www.businessinsider.com/shivon...
1,Visited Elon Musk and Shivon Zilis privately w...
2,Searched for elon musk shivon zilis
3,1 notification
4,Searched for bank station fire alert
5,Searched for bank station fire alert
6,Searched for mukesh ambani house
7,Visited Teens could lose bank accounts and dri...
8,Visited Starmer: Sunak showing 'total lack of ...
9,Visited Sunak looked like a man who was runnin...


In [8]:
# ---- Load fashion catalogue ----
catalog_raw = load_json("fashion_catalog.json")
fashion_df  = pd.DataFrame(catalog_raw)

# ------------------------------------------------------------
# Gracefully determine a product’s display-name once and for all
# ------------------------------------------------------------
def get_product_name(row) -> str:
    return (
        row.get("product_name")
        or row.get("name")
        or row.get("PRODUCT_NAME")
        or row.get("title")
        or "Unnamed item"
    )

fashion_df["product_name"] = fashion_df.apply(get_product_name, axis=1)

print(f"Loaded {len(search_df)} search queries and {len(fashion_df)} catalog items.")
fashion_df.head(3)

Loaded 55383 search queries and 100000 catalog items.


Unnamed: 0,MAIN_IMAGE,SECOND_IMAGE,THIRD_IMAGE,FOURTH_IMAGE,LYST_PRODUCT_URL,GENDER,CATEGORY,SHORT_DESCRIPTION,LONG_DESCRIPTION,product_name
0,https://cdna.lystit.com/photos/wolfandbadger/5...,,,,https://www.lyst.com/shoes/laines-london-class...,F,flats,Classic Laines Slippers With Pearl Beaded Lobs...,Our faux fur cross strap classic slippers offe...,Unnamed item
1,https://cdna.lystit.com/photos/farfetch/d774e2...,https://cdna.lystit.com/photos/farfetch/8b088f...,https://cdna.lystit.com/photos/farfetch/6a4008...,,https://www.lyst.com/clothing/oscar-de-la-rent...,F,skirts,Dahlia Floral-print Midi Skirt,Material:Polyester. This item may not be on sa...,Unnamed item
2,https://cdna.lystit.com/photos/ssense/61ba817f...,https://cdna.lystit.com/photos/ssense/2289aeba...,https://cdna.lystit.com/photos/ssense/161426cf...,https://cdna.lystit.com/photos/ssense/b5c625a9...,https://www.lyst.com/clothing/coperni-green-as...,F,dresses,Asymmetric Minidress,Stretch nylon jersey dress. · Single-shoulder ...,Unnamed item


In [9]:
stopwords = {
    "the", "is", "are", "for", "a", "an", "to", "of", "and",
    "in", "what", "how", "best", "with",
}

def extract_keywords(text: str):
    text = text.lower()
    text = re.sub(r"[^a-z0-9]+", " ", text)
    words = text.split()
    return {w for w in words if w not in stopwords and len(w) > 2}

# ---- User interest keywords ----
interest_keywords = set()
for q in search_history:
    interest_keywords.update(extract_keywords(q))

print("Extracted interest keywords:")
print(sorted(list(interest_keywords))[:25], "...")

Extracted interest keywords:
['000', '0000', '00000000', '000000006445386005', '000000022198587011', '00001', '00002774', '000104746912002169', '0001079114', '000110465921001952', '000110465921072713', '0001193125', '000121390021008694', '0001578860', '0001673977', '0001691421', '0001693552', '0001708567', '0001783879', '0001795586', '00026', '0003192', '00039', '0004', '00044'] ...


In [11]:
# Build combined text and keyword list per item
def product_text(row):
    name = row.get("product_name")  # always set
    desc = (
        row.get("description")
        or row.get("SHORT_DESCRIPTION")
        or row.get("LONG_DESCRIPTION")
        or ""
    )
    tags = (
        row.get("tags")
        or row.get("TAGS")
        or row.get("tag_list")
        or row.get("TAG")
        or []
    )
    tags_str = " ".join(tags) if isinstance(tags, list) else str(tags)
    return f"{name} {desc} {tags_str}"

fashion_df["combined_text"]   = fashion_df.apply(product_text, axis=1)
fashion_df["product_keywords"] = fashion_df["combined_text"].apply(extract_keywords)
fashion_df["product_keywords"].head(10)

0        {classic, laines, lobster, brooches, shell, it...
1        {dahlia, midi, skirt, item, floral, print, unn...
2                   {asymmetric, unnamed, minidress, item}
3                  {billfold, slim, item, unnamed, wallet}
4                           {tees, unnamed, graphic, item}
                               ...                        
99995       {bomber, hoodie, style, item, jacket, unnamed}
99996                     {shorts, unnamed, bermuda, item}
99997                   {cloud, bag, item, carry, unnamed}
99998             {rwb, item, unnamed, stripe, sweatshirt}
99999                        {print, unnamed, shirt, item}
Name: product_keywords, Length: 100000, dtype: object

In [None]:
from tqdm.auto import tqdm
tqdm.pandas()                                   # turns .progress_apply on

fashion_df["product_keywords"] = (
    fashion_df["combined_text"]
    .progress_apply(extract_keywords)           # shows % and ETA
)

In [1]:
G = nx.Graph()

# Add nodes
for kw in interest_keywords:
    G.add_node(f"Interest:{kw}", type="interest", label=kw)

for _, prod in fashion_df.iterrows():
    prod_id = (
        prod.get("id")
        or prod.get("product_id")
        or prod.get("PRODUCT_ID")
        or f"idx_{prod.name}"
    )
    G.add_node(f"Product:{prod_id}", type="product", label=prod["product_name"])

# Add edges where keyword overlaps
for kw in interest_keywords:
    for _, prod in fashion_df.iterrows():
        prod_id = (
            prod.get("id")
            or prod.get("product_id")
            or prod.get("PRODUCT_ID")
            or f"idx_{prod.name}"
        )
        if kw in prod["product_keywords"]:
            G.add_edge(f"Interest:{kw}", f"Product:{prod_id}")

some_interests = list(interest_keywords)[:5]  # first five interests
viz_nodes = set()
for kw in some_interests:
    inode = f"Interest:{kw}"
    viz_nodes.add(inode)
    viz_nodes.update([v for u, v in G.edges(inode)])

subG = G.subgraph(viz_nodes)
pos  = nx.spring_layout(subG, seed=42)

plt.figure(figsize=(6,5))
nx.draw_networkx_nodes(
    subG, pos,
    node_color=["#8ecae6" if d["type"]=="interest" else "#90be6d" for _,d in subG.nodes(data=True)],
    node_size=600, alpha=0.9
)
nx.draw_networkx_labels(subG, pos, labels={n:d["label"] for n,d in subG.nodes(data=True)}, font_size=8)
nx.draw_networkx_edges(subG, pos, alpha=0.4, width=1.2)
plt.title("User-interest ↔ Product graph (subset)")
plt.axis("off")
plt.show()


Loaded 55383 search queries and 100000 catalog items.


Unnamed: 0,MAIN_IMAGE,SECOND_IMAGE,THIRD_IMAGE,FOURTH_IMAGE,LYST_PRODUCT_URL,GENDER,CATEGORY,SHORT_DESCRIPTION,LONG_DESCRIPTION,product_name
0,https://cdna.lystit.com/photos/wolfandbadger/5...,,,,https://www.lyst.com/shoes/laines-london-class...,F,flats,Classic Laines Slippers With Pearl Beaded Lobs...,Our faux fur cross strap classic slippers offe...,Unnamed item
1,https://cdna.lystit.com/photos/farfetch/d774e2...,https://cdna.lystit.com/photos/farfetch/8b088f...,https://cdna.lystit.com/photos/farfetch/6a4008...,,https://www.lyst.com/clothing/oscar-de-la-rent...,F,skirts,Dahlia Floral-print Midi Skirt,Material:Polyester. This item may not be on sa...,Unnamed item
2,https://cdna.lystit.com/photos/ssense/61ba817f...,https://cdna.lystit.com/photos/ssense/2289aeba...,https://cdna.lystit.com/photos/ssense/161426cf...,https://cdna.lystit.com/photos/ssense/b5c625a9...,https://www.lyst.com/clothing/coperni-green-as...,F,dresses,Asymmetric Minidress,Stretch nylon jersey dress. · Single-shoulder ...,Unnamed item


Extracted interest keywords:
['000', '0000', '00000000', '000000006445386005', '000000022198587011', '00001', '00002774', '000104746912002169', '0001079114', '000110465921001952', '000110465921072713', '0001193125', '000121390021008694', '0001578860', '0001673977', '0001691421', '0001693552', '0001708567', '0001783879', '0001795586', '00026', '0003192', '00039', '0004', '00044'] ...

Sample product keywords:
Unnamed item: ['beaded', 'brooches', 'classic', 'item', 'laines', 'lobster', 'pearl', 'shell', 'slippers', 'unnamed']
Unnamed item: ['dahlia', 'floral', 'item', 'midi', 'print', 'skirt', 'unnamed']
Unnamed item: ['asymmetric', 'item', 'minidress', 'unnamed']


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x0000014FBE62E1D0>>
Traceback (most recent call last):
  File "C:\Users\horva\miniconda3\envs\onfabric\Lib\site-packages\ipykernel\ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 

KeyboardInterrupt


KeyboardInterrupt

