## Sitemap Scraping

In [4]:
import requests
import xml.etree.ElementTree as ET

def get_sitemap_urls(base_url, timeout=10):
    """Fetch the ``<loc>`` URLs listed in ``<base_url>/sitemap.xml``.

    Args:
        base_url: Root URL of the site; a trailing slash is optional.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A list of URL strings in document order. The list is empty when the
        server does not answer with HTTP 200 or when fetching/parsing fails
        (a message is printed in both cases).
    """
    sitemap_url = base_url.rstrip("/") + "/sitemap.xml"

    try:
        response = requests.get(sitemap_url, timeout=timeout)
        if response.status_code != 200:
            print("No sitemap found.")
            return []

        root = ET.fromstring(response.content)
        urls = []
        for elem in root.iter():
            # Tags look like "{namespace}loc"; compare the local name exactly
            # so that unrelated tags merely ending in "loc" are not picked up.
            if elem.tag.rsplit("}", 1)[-1] != "loc":
                continue
            # Empty <loc/> elements have text None; pretty-printed sitemaps
            # pad the URL with whitespace.
            text = (elem.text or "").strip()
            if text:
                urls.append(text)
        return urls
    except Exception as e:
        print(f"Error fetching sitemap: {e}")
        return []

# Example usage: collect the <loc> entries of the site's /sitemap.xml and
# print the resulting list (empty if the sitemap could not be fetched).
sitemap_links = get_sitemap_urls("https://www.tacchini.it/en/")
print(sitemap_links)

['https://www.tacchini.it/en/post-sitemap.xml', 'https://www.tacchini.it/en/page-sitemap.xml', 'https://www.tacchini.it/en/realizzazioni-sitemap.xml', 'https://www.tacchini.it/en/video-sitemap.xml', 'https://www.tacchini.it/en/designers-sitemap.xml', 'https://www.tacchini.it/en/journal-sitemap.xml', 'https://www.tacchini.it/en/tipologia-sitemap.xml', 'https://www.tacchini.it/en/tacchiniedizioni-sitemap.xml', 'https://www.tacchini.it/en/savoir-faire-sitemap.xml', 'https://www.tacchini.it/en/category-sitemap.xml', 'https://www.tacchini.it/en/realiztax-sitemap.xml', 'https://www.tacchini.it/en/videotax-sitemap.xml', 'https://www.tacchini.it/en/journaltax-sitemap.xml', 'https://www.tacchini.it/en/distributoritax-sitemap.xml', 'https://www.tacchini.it/en/tacchiniedizionitax-sitemap.xml', 'https://www.tacchini.it/en/savoir-faire-sitemap.xml']


In [None]:
import requests
import xml.etree.ElementTree as ET
import json

# List of important sitemaps.
# Each entry is a full sitemap XML URL; add further sitemap URLs here to
# widen the crawl.
sitemaps = [
    "https://www.tacchini.it/en/post-sitemap.xml"
]

def extract_urls_from_sitemap(sitemap_url, timeout=10):
    """Fetch one sitemap XML document and return the URLs in its ``<loc>`` tags.

    Args:
        sitemap_url: Full URL of the sitemap XML to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        A list of URL strings in document order; an empty list when the
        response status is not 200 or when fetching/parsing raises (a message
        is printed in both cases).
    """
    try:
        response = requests.get(sitemap_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch: {sitemap_url}")
            return []

        root = ET.fromstring(response.content)
        # Compare the namespace-stripped tag name exactly, and skip <loc>
        # elements without text (elem.text is None for an empty element).
        return [
            elem.text.strip()
            for elem in root.iter()
            if elem.tag.rsplit("}", 1)[-1] == "loc" and elem.text and elem.text.strip()
        ]
    except Exception as e:
        print(f"Error processing {sitemap_url}: {e}")
        return []

import os

# Collect the URLs found in every configured sitemap, in sitemap order
all_product_urls = []

for sitemap in sitemaps:
    urls = extract_urls_from_sitemap(sitemap)
    all_product_urls.extend(urls)

# Save to a JSON file. The output directory is created on demand so a fresh
# checkout does not fail with FileNotFoundError, and the file is written as
# UTF-8 to match the encoding used when it is read back.
os.makedirs("jsons", exist_ok=True)
with open("jsons/product_urls.json", "w", encoding="utf-8") as f:
    json.dump(all_product_urls, f, indent=4)

print(f"✅ Extracted {len(all_product_urls)} URLs and saved to product_urls.json")

✅ Extracted 207 URLs and saved to product_urls.json


In [None]:
import json

# Load JSON file with links (the list of URLs stored in jsons/product_urls.json)
with open("jsons/product_urls.json", "r", encoding="utf-8") as file:
    weblinks = json.load(file)

# Define dynamic heuristics used by score_link():
# - a link ending in one of IMAGE_EXTENSIONS is rejected
# - a link containing any of IGNORED_TERMS as a substring is rejected
IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".webp", ".gif")
IGNORED_TERMS = ["about", "contact", "privacy", "terms", "blog", "journal", "distributor", "video"]

def score_link(link):
    """Assign a relevance score to a link; ``-1`` means "discard".

    A link is rejected (``-1``) when it ends with one of ``IMAGE_EXTENSIONS``,
    contains four or fewer ``/`` characters, or contains any of
    ``IGNORED_TERMS``. The extension and term checks are case-insensitive so
    that e.g. ``Photo.JPG`` or ``/Journal/`` are rejected as well.

    Otherwise the score is ``2`` per ``/``-separated segment of the stripped
    link plus ``2`` for every segment longer than three characters. The split
    is done on the whole link, so the scheme and host segments count too.

    Args:
        link: URL string to evaluate.

    Returns:
        ``-1`` for rejected links, otherwise a positive integer score.
    """
    lowered = link.lower()

    # Check for images (Exclude)
    if lowered.endswith(IMAGE_EXTENSIONS):
        return -1

    # Remove homepage & short URLs
    if link.count("/") <= 4:  # E.g., 'https://www.tacchini.it/en/' has only 4 slashes
        return -1

    # Reject links mentioning an ignored term anywhere in the URL
    if any(term.lower() in lowered for term in IGNORED_TERMS):
        return -1

    # Boost score for deep URLs (longer paths = likely useful)
    path_parts = link.strip("/").split("/")
    score = len(path_parts) * 2

    # Extra boost for meaningful (longer than three characters) segments
    score += 2 * sum(1 for part in path_parts if len(part) > 3)

    return score


# Score every loaded link once; keys keep the order in which links were loaded
scored_links = dict(zip(weblinks, map(score_link, weblinks)))

# Drop rejected links (they carry -1) and rank the rest, highest score first
filtered_links = sorted(
    (url for url in scored_links if scored_links[url] > 0),
    key=scored_links.get,
    reverse=True,
)

# Persist the ranked list for the scraping step
with open("jsons/filtered_urls.json", "w", encoding="utf-8") as outfile:
    outfile.write(json.dumps(filtered_links, indent=4))

print("✅ Dynamically filtered links saved to filtered_urls.json")


✅ Dynamically filtered links saved to filtered_urls.json


In [None]:
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import os

# Path to your local ChromeDriver.
# Set the CHROMEDRIVER_PATH environment variable to point at a different
# binary; the Windows location below remains the default when it is unset.
CHROMEDRIVER_PATH = os.environ.get(
    "CHROMEDRIVER_PATH", r"C:\Program Files (x86)\chromedriver.exe"
)

# Build a headless Chrome WebDriver backed by the local ChromeDriver binary
def get_driver():
    """Return a new Chrome WebDriver configured with the notebook's flags.

    The driver is started through a ``Service`` pointing at
    ``CHROMEDRIVER_PATH`` and receives the command-line switches listed
    below, in that order. The caller is responsible for quitting it.
    """
    chrome_flags = (
        "--headless",  # Run in background
        "--no-sandbox",
        "--disable-gpu",
        "--disable-dev-shm-usage",
        "--ignore-certificate-errors",
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    # Use local ChromeDriver
    return webdriver.Chrome(service=Service(CHROMEDRIVER_PATH), options=chrome_options)

# Function to extract dynamic content
def scrape_page(url, timeout=10, click_pause=1):
    """Load ``url`` in a fresh driver, click expandable controls, return the HTML.

    Every ``<button>``, ``<summary>`` and ``<div role="button">`` found on the
    page is clicked through JavaScript so that collapsed content is present in
    the returned markup.

    Args:
        url: Page to scrape.
        timeout: Seconds to wait for the ``<body>`` element to appear.
        click_pause: Seconds to sleep after each successful click so that
            content triggered by the click can load.

    Returns:
        The page source after the clicks, or ``None`` when loading or
        scraping raised an exception (the error is printed).
    """
    driver = get_driver()
    try:
        print(f"🔍 Scraping: {url}")
        driver.get(url)

        # Wait for the page to load
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Expand all collapsible sections dynamically
        expandable_elements = driver.find_elements(
            By.XPATH, "//button | //summary | //div[@role='button']"
        )
        for element in expandable_elements:
            try:
                driver.execute_script("arguments[0].click();", element)
            except Exception:
                # Best effort: skip elements that cannot be clicked. Only
                # Exception is caught so Ctrl-C / kernel interrupt still
                # stops the loop instead of being silently swallowed.
                continue
            time.sleep(click_pause)  # Wait for content to load

        # Extract full HTML after interactions
        full_html = driver.page_source
        print(f"✅ Scraped successfully: {url}")

        return full_html

    except Exception as e:
        print(f"❌ Error scraping {url}: {e}")
        return None
    finally:
        # Always release the browser, including on interrupt
        driver.quit()

import os

# Load links from JSON
with open("jsons/filtered_urls.json", "r", encoding="utf-8") as file:
    filtered_links = json.load(file)

# Fail with a clear message instead of an opaque IndexError on an empty list
if not filtered_links:
    raise ValueError("jsons/filtered_urls.json contains no links to scrape")

# Test one link (the first entry of the filtered list)
test_link = filtered_links[0]
html_content = scrape_page(test_link)

# Save the extracted HTML. The success message is only printed when a file
# was actually written; scrape_page returns None on failure.
if html_content:
    os.makedirs("html", exist_ok=True)
    with open("html/scraped_page.html", "w", encoding="utf-8") as file:
        file.write(html_content)
    print("✅ Scraped HTML saved to scraped_page.html")
else:
    print(f"❌ No HTML saved: scraping failed for {test_link}")


🔍 Scraping: https://www.tacchini.it/en/low-tables/daze/
✅ Scraped successfully: https://www.tacchini.it/en/low-tables/daze/
✅ Scraped HTML saved to scraped_page.html


In [None]:
from bs4 import BeautifulSoup

# Load the scraped HTML
with open("html/scraped_page.html", "r", encoding="utf-8") as file:
    html_content = file.read()

# Parse HTML with BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")

# Remove common unnecessary elements
for tag in soup(["script", "style", "footer", "nav", "aside", "meta", "link", "svg"]):
    tag.extract()

# Remove hidden elements. The inline style is normalised (whitespace removed,
# lower-cased) first so "display: none" and "DISPLAY:NONE" are matched as
# well as the literal "display:none".
for tag in soup.find_all(
    style=lambda value: bool(value) and "display:none" in "".join(value.split()).lower()
):
    tag.extract()

# Remove empty tags (helps clean up clutter). <img>/<br> carry no text by
# design, and a text-less wrapper that contains an <img> must be kept too,
# otherwise extracting the wrapper would drop the image along with it.
for tag in soup.find_all(
    lambda node: node.name not in ("img", "br")
    and not node.text.strip()
    and node.find("img") is None
):
    tag.extract()

# Convert cleaned HTML to string
cleaned_html = soup.prettify()

# Save cleaned HTML
with open("html/cleaned_page.html", "w", encoding="utf-8") as file:
    file.write(cleaned_html)

print("✅ Cleaned HTML saved to cleaned_page.html")


✅ Cleaned HTML saved to cleaned_page.html


In [None]:
from bs4 import BeautifulSoup
import markdownify

import os

# Load the cleaned HTML produced by the previous step
with open("html/cleaned_page.html", "r", encoding="utf-8") as file:
    html_content = file.read()

# Parse HTML
soup = BeautifulSoup(html_content, "html.parser")

# Convert to Markdown
markdown_content = markdownify.markdownify(str(soup), heading_style="ATX")  # Uses # for headings

# Save as one file. The output directory is created on demand so the cell
# does not fail with FileNotFoundError on a fresh checkout.
os.makedirs("markdown", exist_ok=True)
with open("markdown/full_extracted.md", "w", encoding="utf-8") as file:
    file.write(markdown_content)

print("✅ Full Markdown file saved: full_extracted.md")

✅ Full Markdown file saved: full_extracted.md


### Prompted ChatGPT to extract only the furniture-related information: this page is for the product named "Daze", and I want information related to that product only
#### It then gave me furniture_data.json

Use Crawl4AI and create an LLM strategy to convert HTML to Markdown.

In [None]:
import json


with open("jsons/furniture_data.json", "r", encoding="utf-8") as file:
    raw_data = json.load(file)


def _as_text(value):
    """Return ``value`` as a stripped string ('' for None) so str methods are safe."""
    if value is None:
        return ""
    return (value if isinstance(value, str) else str(value)).strip()


# Initialize structured dictionary
product_data = {
    "product_name": None,
    "description": None,
    "designer": None,
    "year": None,
    "dimensions": None,
    "materials_and_finishes": None,
    "images": [],
    "designer_bio": None
}

# Single pass over the raw mapping. Each field is filled independently and,
# when several entries match, the last matching entry wins.
# NOTE(review): the markers "Daze", "Architectural shapes" and "Truly Truly"
# are specific to this product page — confirm before reusing for other pages.
for key, value in raw_data.items():
    text = _as_text(value)

    # Product name: taken from the key, with leading/trailing '#' and spaces removed
    if "Daze" in key:
        product_data["product_name"] = key.strip("# ")

    # Description: the key itself holds the description text
    if "Architectural shapes" in key:
        product_data["description"] = key

    # Designer is read from the key; year from the value
    if "Designer" in key:
        product_data["designer"] = key.replace("Designer: ", "").strip()
    if "Year" in key:
        product_data["year"] = text.replace("△", "").strip()

    # Dimensions
    if "Dimensions" in key:
        product_data["dimensions"] = text.replace("△", "").strip()

    # Materials and finishes
    if "Materials and finishes" in key:
        product_data["materials_and_finishes"] = text

    # Images: keys written in Markdown image form "![](url)"
    if key.startswith("![]("):
        image_url = key.replace("![](", "").replace(")", "").strip()
        product_data["images"].append(image_url)

    # Designer bio (search both keys and values); the value is coerced to
    # text so a non-string value cannot raise AttributeError on .strip()
    if "Truly Truly" in key or "Truly Truly" in text:
        product_data["designer_bio"] = text

# Convert to JSON format
final_json_path = "jsons/structured_daze_data.json"
with open(final_json_path, "w", encoding="utf-8") as json_file:
    json.dump(product_data, json_file, indent=4, ensure_ascii=False)

final_json_path


'jsons/structured_daze_data.json'