# Data Collection

## Get product information

In [5]:
!pip install requests beautifulsoup4

import requests
from bs4 import BeautifulSoup
import json

# Storefront root; relative product hrefs and the collection path are appended to it.
BASE_URL = "https://toprankleather.com"


def get_nearby_text(node):
    """Return the text of the first multi-line metafield <span> found after ``node``.

    Args:
        node: Element to search forward from (typically a section title), or a
            falsy value when the title was not found.

    Returns:
        The span's text with its fragments stripped and joined by newlines, or
        ``None`` when ``node`` is falsy or no such span follows it.
    """
    span = node.find_next("span", class_="metafield-multi_line_text_field") if node else None
    return span.get_text("\n", strip=True) if span else None


def extract_dropdown_options(soup, field_name):
    """Collect the option labels of the <select> named ``properties[field_name]``.

    Empty labels and labels containing the placeholder text "Select option"
    are dropped.

    Args:
        soup: Parsed page to search.
        field_name: Property name that appears inside ``properties[...]``.

    Returns:
        A list of option labels, or ``None`` when the <select> is missing or
        has no usable option.
    """
    dropdown = soup.find("select", {"name": f"properties[{field_name}]"})
    if not dropdown:
        return None

    labels = (option.get_text(strip=True) for option in dropdown.find_all("option"))
    kept = [label for label in labels if label and "Select option" not in label]
    return kept or None


def _section_text(soup, *keywords):
    """Return the metafield text following the first <p> whose text contains any keyword."""

    def is_section_title(tag):
        if tag.name != "p":
            return False
        text = tag.get_text()
        return any(keyword in text for keyword in keywords)

    return get_nearby_text(soup.find(is_section_title))


def scrape_product(url, timeout=30):
    """Fetch a single product page and extract all relevant fields from it.

    Args:
        url: Absolute URL of the product page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``title``, ``leather_options``,
        ``leather_description``, ``snap_button_color_options``,
        ``thread_color_options``, ``estimated_delivery``, ``description``,
        ``description_alt``, ``personalization``, ``brand_logo``,
        ``lifetime_guarantee`` and ``url``. A field that is not present on the
        page is ``None``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled server blocks the notebook forever; without
    # the status check an error page would be parsed as if it were a product.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    data = {}

    # 1) TITLE
    title_tag = soup.find("h1", class_="product-title")
    data["title"] = title_tag.get_text(strip=True) if title_tag is not None else None

    # 2) LEATHER TYPE & COLOR (dropdown)
    data["leather_options"] = extract_dropdown_options(soup, "Leather Type & Color")

    # 2B) LEATHER TYPE DESCRIPTION (metafield)
    data["leather_description"] = _section_text(soup, "Leather Type")

    # 3) SNAP BUTTON COLOR (dropdown)
    data["snap_button_color_options"] = extract_dropdown_options(soup, "Snap Button Color")

    # 4) THREAD COLOR (dropdown)
    data["thread_color_options"] = extract_dropdown_options(soup, "Thread Color")

    # 5) ESTIMATED DELIVERY
    est = soup.select_one("div.estimate-content p")
    data["estimated_delivery"] = est.get_text(" ", strip=True) if est is not None else None

    # 6) DESCRIPTION & ALT DESCRIPTION (all "rte" blocks): the first block is
    # the main description, any further blocks are joined as the alternative.
    rte_texts = [
        block.get_text("\n", strip=True)
        for block in soup.find_all("div", class_="rte")
    ]
    data["description"] = rte_texts[0] if rte_texts else None
    data["description_alt"] = "\n\n".join(rte_texts[1:]) if len(rte_texts) >= 2 else None

    # 7) PERSONALIZATION
    data["personalization"] = _section_text(soup, "Personalization")

    # 8) BRAND LOGO
    data["brand_logo"] = _section_text(soup, "Logo", "logo")

    # 9) LIFETIME GUARANTEE
    data["lifetime_guarantee"] = _section_text(soup, "Lifetime")

    # URL
    data["url"] = url

    return data


def get_unique_products_from_collection(timeout=30):
    """Return the product links of the collection page, de-duplicated by title.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict mapping each product title to its absolute URL, in page order.
        When a title occurs more than once, the first link is kept.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    collection_url = f"{BASE_URL}/collections/personalized-leather-accessories"
    # Fail loudly on a timeout or HTTP error instead of parsing an error page
    # and returning an empty map.
    r = requests.get(collection_url, timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    product_map = {}  # title -> url

    for a in soup.select("a.x-card-title"):
        title = a.get_text(strip=True)
        href = a.get("href")

        if not title or not href:
            continue

        # Skip a title that was already added (22 -> 11 fix).
        if title in product_map:
            continue

        full_url = href if href.startswith("http") else BASE_URL + href
        product_map[title] = full_url

    return product_map


# --- RUN ---

product_map = get_unique_products_from_collection()

print("Bulunan uniq ürün sayısı:", len(product_map))
for t, u in product_map.items():
    print("-", t, "→", u)

# Fetch the details of every product. A failed request is reported and skipped
# so that one bad page does not discard the products already scraped.
all_products = []
for title, url in product_map.items():
    print("\nScraping:", title)
    try:
        data = scrape_product(url)
    except requests.RequestException as exc:
        print("  Skipped, request failed:", exc)
        continue
    all_products.append(data)

# Save as JSON
with open("products.json", "w", encoding="utf-8") as f:
    json.dump(all_products, f, ensure_ascii=False, indent=2)

print("\nKaydedilen ürün sayısı:", len(all_products))
print("products.json hazır.")


Bulunan uniq ürün sayısı: 11
- Strapold Elite → https://toprankleather.com/collections/personalized-leather-accessories/products/strapold-elite-slim-handmade-wallet
- Strapold Mini → https://toprankleather.com/collections/personalized-leather-accessories/products/strapold-mini-slim-bifold-wallet
- Origa Elite → https://toprankleather.com/collections/personalized-leather-accessories/products/origa-elite-stitchless-leather-cardholder
- Origa Mini → https://toprankleather.com/collections/personalized-leather-accessories/products/origa-mini-stitchless-leather-card-holder
- Trifold Slim → https://toprankleather.com/collections/personalized-leather-accessories/products/trifold-slim-handmade-leather-wallet
- Strapold Elite - Stitchless → https://toprankleather.com/collections/personalized-leather-accessories/products/leather-card-wallet-holder-holders-credit-handmade-wallets-full-grain
- Snapold Mini → https://toprankleather.com/collections/personalized-leather-accessories/products/snapold-mi

## Get FAQ (Frequently Asked Questions)

In [7]:
import requests
from bs4 import BeautifulSoup
import json

faq_url = "https://toprankleather.com/pages/faq"

# Bound the wait and stop on an HTTP error; otherwise an error page would be
# parsed and an empty faq.json written.
response = requests.get(faq_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

faq_items = []

# Each FAQ entry sits in an outer container: <div class="p-3 ...">
faq_blocks = soup.select("div.p-3")

for block in faq_blocks:

    # --- QUESTION TITLE --- #
    title_tag = block.select_one("button.faq-collapsible p.font-medium")

    # A "p-3" container without a question button is not an FAQ entry.
    if not title_tag:
        continue

    question = title_tag.get_text(strip=True)

    # --- ANSWER --- #
    # The answer is taken as the full text of .faq-content > .rte
    answer_container = block.select_one(".faq-content .rte")

    if answer_container:
        answer_text = answer_container.get_text("\n", strip=True)
    else:
        answer_text = None

    faq_items.append({
        "question": question,
        "answer": answer_text
    })


# Save as JSON
with open("faq.json", "w", encoding="utf-8") as f:
    json.dump(faq_items, f, ensure_ascii=False, indent=2)

print("Toplam FAQ:", len(faq_items))
print("faq.json oluşturuldu!")

Toplam FAQ: 26
faq.json oluşturuldu!


## Get JSONL file for RAG

In [8]:
import json
import re

# --- Chunking function ---
def _split_long_paragraph(paragraph, max_chars):
    """Break one paragraph into pieces of at most ``max_chars`` characters.

    Pieces are cut at whitespace where possible; a single word longer than the
    limit is sliced. A paragraph that already fits is returned unchanged.
    """
    if max_chars < 1 or len(paragraph) <= max_chars:
        return [paragraph]

    pieces = []
    current = ""
    for word in paragraph.split():
        # A word longer than the limit has no natural break point: slice it.
        while len(word) > max_chars:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:max_chars])
            word = word[max_chars:]
        if not word:
            continue
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= max_chars:
            current = candidate
        else:
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces


def chunk_text(text, max_chars=1500):
    """Split ``text`` into chunks by packing newline-separated paragraphs.

    Paragraphs are appended to the current chunk while it stays below
    ``max_chars``; otherwise a new chunk is started. A paragraph longer than
    ``max_chars`` is first broken at whitespace (runs of whitespace inside such
    a paragraph collapse to single spaces).

    Args:
        text: Text to split; ``None`` or ``""`` yields an empty list.
        max_chars: Upper bound on the length of each chunk.

    Returns:
        A list of non-empty, stripped chunks in their original order.
    """
    if not text:
        return []

    chunks = []
    current = ""

    for paragraph in text.split("\n"):
        for piece in _split_long_paragraph(paragraph, max_chars):
            if len(current) + len(piece) + 1 < max_chars:
                current += piece + "\n"
                continue
            # Flush only real content: when the buffer is still empty there is
            # nothing to emit, and appending it would create an empty chunk.
            if current.strip():
                chunks.append(current.strip())
            current = piece + "\n"

    if current.strip():
        chunks.append(current.strip())

    return chunks


# --- Load products ---
with open("products.json", "r", encoding="utf-8") as f:
    products = json.load(f)

# --- Load faq ---
with open("faq.json", "r", encoding="utf-8") as f:
    faqs = json.load(f)


output_path = "rag_data.jsonl"

with open(output_path, "w", encoding="utf-8") as out:

    # ---------- PRODUCTS ----------
    for product_index, p in enumerate(products):

        full_text = ""

        # Build a combined description text
        if p.get("description"):
            full_text += p["description"] + "\n\n"

        if p.get("description_alt"):
            full_text += p["description_alt"] + "\n\n"

        if p.get("leather_description"):
            full_text += "LEATHER INFO:\n" + p["leather_description"] + "\n\n"

        if p.get("personalization"):
            full_text += "PERSONALIZATION:\n" + p["personalization"] + "\n\n"

        if p.get("brand_logo"):
            full_text += "BRAND LOGO INFORMATION:\n" + p["brand_logo"] + "\n\n"

        if p.get("lifetime_guarantee"):
            full_text += "LIFETIME GUARANTEE:\n" + p["lifetime_guarantee"] + "\n\n"

        # Metadata-based description
        meta_info = []

        if p.get("leather_options"):
            meta_info.append("Leather Type & Color options: " + ", ".join(p["leather_options"]))

        if p.get("snap_button_color_options"):
            meta_info.append("Snap Button Color options: " + ", ".join(p["snap_button_color_options"]))

        if p.get("thread_color_options"):
            meta_info.append("Thread Color options: " + ", ".join(p["thread_color_options"]))

        if p.get("estimated_delivery"):
            meta_info.append("Estimated delivery: " + p["estimated_delivery"])

        if meta_info:
            full_text += "PRODUCT OPTIONS:\n" + "\n".join(meta_info)

        # Clean text: collapse whitespace inside each line and squeeze runs of
        # blank lines, but KEEP the newlines. chunk_text() splits on "\n", so
        # flattening everything to one line would leave it nothing to split on.
        cleaned_lines = [re.sub(r"\s+", " ", line).strip() for line in full_text.split("\n")]
        full_text = re.sub(r"\n{3,}", "\n\n", "\n".join(cleaned_lines)).strip()

        # Chunk; drop empty chunks so no record is written with empty text.
        chunks = [chunk for chunk in chunk_text(full_text, max_chars=1500) if chunk]

        # The scraper stores None when a page has no title; fall back to a
        # positional name so the id can still be built and stays unique.
        title_for_id = p.get("title") or f"untitled_{product_index}"

        # Write JSONL entries
        for i, chunk in enumerate(chunks):
            record = {
                "id": f"product_{title_for_id.replace(' ', '_').lower()}_{i}",
                "text": chunk,
                "metadata": {
                    "type": "product",
                    "title": p.get("title"),
                    "url": p.get("url", "")
                }
            }
            out.write(json.dumps(record, ensure_ascii=False) + "\n")

    # ---------- FAQ ----------
    for faq in faqs:
        question = faq["question"]
        # The FAQ scraper stores None when an entry has no answer block.
        answer = faq.get("answer") or ""
        record = {
            "id": "faq_" + question.replace(" ", "_").lower(),
            "text": question + "\n\n" + answer if answer else question,
            "metadata": {
                "type": "faq",
                "question": question
            }
        }
        out.write(json.dumps(record, ensure_ascii=False) + "\n")


print("RAG dosyası oluşturuldu:", output_path)


RAG dosyası oluşturuldu: rag_data.jsonl


## Intent Detection Base Dataset

In [9]:
import csv
import io

# Two-column CSV (text,intent). A text that itself contains a comma must be
# wrapped in double quotes, otherwise the row parses as three fields.
dataset = """text,intent
Do you have any wallets in green color?,product_query
What is the size of Strapold Mini?,product_query
Is the Strapold Elite available in natural leather?,product_query
How many cards can the Slim Handmade Wallet hold?,product_query
Do you have AirTag holders in black?,product_query
What leather options are available for this product?,product_query
Is there a thread color option for this model?,product_query
Do you have a wallet with snap button color 'Nickel'?,product_query
Does this wallet support personalization?,product_query
Is this product handmade?,product_query
What colors does the Mini Bifold come in?,product_query
Does the wallet come with lifetime guarantee?,product_query
What is the thickness of the leather?,product_query
Is the Burgundy option available right now?,product_query
What is the difference between Natural and Cognac colors?,product_query
Do you have any minimalist wallets?,product_query
What’s the estimated delivery for this product?,product_query
Is there logo-free option for this item?,product_query
How durable is the stitching?,product_query
Does this holder support keychain attachment?,product_query
What is your return policy?,policy_query
Do you offer international shipping?,policy_query
How long does shipping take?,policy_query
Do you accept returns for personalized items?,policy_query
What is your warranty policy?,policy_query
Where are your products made?,policy_query
Do you ship to Germany?,policy_query
Do I have to pay customs fees?,policy_query
How can I contact customer support?,policy_query
What payment methods do you accept?,policy_query
Is express shipping available?,policy_query
What happens if my order arrives damaged?,policy_query
Can I cancel my order?,policy_query
Do you offer refunds?,policy_query
What is your production time for custom orders?,policy_query
Are your materials ethically sourced?,policy_query
Do you provide order tracking?,policy_query
What are your packaging materials?,policy_query
Is there a guarantee for defective products?,policy_query
How do I exchange an item?,policy_query
"Hi, how are you?",other
Hello there!,other
Thanks!,other
Can you help me?,other
I need some advice.,other
What do you recommend?,other
Tell me more about your company.,other
I'm just browsing.,other
You seem helpful!,other
Is this a chatbot?,other
I'm not sure what I need yet.,other
Can you guide me?,other
Do you understand Turkish?,other
That’s interesting.,other
How does this work?,other
Nice website!,other
What can you do?,other
I'm looking around.,other
Tell me something.,other
I have a question.,other
"""

# Guard against malformed rows before the file is written.
parsed_rows = list(csv.reader(io.StringIO(dataset)))
assert all(len(row) == 2 for row in parsed_rows), "every row must be exactly text,intent"

with open("intent_dataset.csv", "w", encoding="utf-8") as f:
    f.write(dataset)

print("intent_dataset.csv oluşturuldu!")


intent_dataset.csv oluşturuldu!


In [10]:
# Create a git repository in the current working directory and set the commit identity.
# NOTE(review): --global writes the identity to the user-level git config, not only this repository.
!git init
!git config --global user.email "fikirkocakagan@gmail.com"
!git config --global user.name "Grhanas"


[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/


In [12]:
!git remote add origin https://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@github.com/Grhanas/Intent-Detection-Enhanced-RAG-Based-Chatbot-for-Store-Product-and-Policy-Information.git


In [18]:
# Create the data_process branch and switch to it.
!git checkout -b data_process


Switched to a new branch 'data_process'


In [19]:
# Stage everything under the working directory and commit it.
# NOTE(review): "git add ." stages every untracked file — confirm no secrets or scratch files are present.
!git add .
!git commit -m "Add data collection scripts and datasets"


On branch data_process
nothing to commit, working tree clean


In [20]:
# Push the branch to origin and record it as the upstream (-u) of data_process.
!git push -u origin data_process


Enumerating objects: 32, done.
Counting objects:   3% (1/32)Counting objects:   6% (2/32)Counting objects:   9% (3/32)Counting objects:  12% (4/32)Counting objects:  15% (5/32)Counting objects:  18% (6/32)Counting objects:  21% (7/32)Counting objects:  25% (8/32)Counting objects:  28% (9/32)Counting objects:  31% (10/32)Counting objects:  34% (11/32)Counting objects:  37% (12/32)Counting objects:  40% (13/32)Counting objects:  43% (14/32)Counting objects:  46% (15/32)Counting objects:  50% (16/32)Counting objects:  53% (17/32)Counting objects:  56% (18/32)Counting objects:  59% (19/32)Counting objects:  62% (20/32)Counting objects:  65% (21/32)Counting objects:  68% (22/32)Counting objects:  71% (23/32)Counting objects:  75% (24/32)Counting objects:  78% (25/32)Counting objects:  81% (26/32)Counting objects:  84% (27/32)Counting objects:  87% (28/32)Counting objects:  90% (29/32)Counting objects:  93% (30/32)Counting objects:  96% (31/32)Counting objects: