In [None]:
#9833891391

In [1]:
import json
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [2]:
# Base used to build product page URLs from the hrefs found on the collection page.
BASE_URL = "https://www.tacchini.it/en/"
# Collection page that is fetched first and scanned for product links.
COLLECTION_URL = "https://www.tacchini.it/en/collections/"

In [3]:
def fetch_soup(url, timeout=30):
    """Fetch HTML content and return a BeautifulSoup object.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without it a stalled
            connection would block the scrape indefinitely.

    Returns:
        The parsed page, or None when the request fails or the server
        answers with a status other than 200. A message is printed in
        both failure cases.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures (DNS, refused connection, timeout) are
        # reported the same way as bad status codes so one bad page does
        # not abort the whole run.
        print(f"Failed to fetch {url}: {exc}")
        return None

    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')

    print(f"Failed to fetch {url}, Status Code: {response.status_code}")
    return None

In [4]:
def extract_products(soup):
    """Extract product links and basic info from the collection page.

    Args:
        soup: Parsed collection page (BeautifulSoup object).

    Returns:
        A list of dicts with the keys "name", "designer" and "link", one
        per 'titolo-prodotto' div that contains an anchor with an href.
        "designer" is "Unknown" when the anchor has no designer span.
    """
    products = []

    for product in soup.find_all('div', class_='titolo-prodotto'):
        a_tag = product.find('a')
        href = a_tag.get('href') if a_tag else None
        if not href:
            # No anchor, or an anchor without a target: nothing to follow.
            continue

        # The name is the anchor's leading text node. Guard against an
        # empty anchor and against a tag (not text) being the first child.
        first_child = a_tag.contents[0] if a_tag.contents else ""
        if isinstance(first_child, str):
            name = first_child.strip()
        else:
            name = first_child.get_text(strip=True)

        designer_tag = a_tag.find('span', class_='designer')
        designer_name = designer_tag.text.strip() if designer_tag else "Unknown"

        products.append({
            "name": name,
            "designer": designer_name,
            # urljoin leaves absolute hrefs untouched and resolves relative
            # ones against BASE_URL; plain concatenation would produce a
            # broken URL for absolute or root-relative hrefs.
            "link": urljoin(BASE_URL, href)
        })

    return products

In [5]:
# experimental 2
def extract_product_details(product):
    """Scrape individual product details from its page.

    Args:
        product: Dict with at least a "link" key holding the product page
            URL. The dict is updated in place.

    Returns:
        The same ``product`` dict with "description", "image",
        "dimensions" and "materials" added, or None when the page could
        not be fetched.
    """
    soup = fetch_soup(product["link"])
    if not soup:
        return None

    # Description: text of the first div whose class matches the pattern.
    description = "No description available"
    description_div = soup.find('div', class_=re.compile(r'text-normal\s+visibile'))
    if description_div:
        # Read the container's text once. Gathering every p/span/div
        # descendant separately repeats text whenever those tags are
        # nested and drops text that sits directly inside the container.
        description_text = description_div.get_text(" ", strip=True)
        if description_text:
            description = description_text

    # Backup: fall back to the page's meta description.
    if description == "No description available":
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc and "content" in meta_desc.attrs:
            description = meta_desc["content"]

    # Image: first <img> whose src points into the site's uploads folder.
    image_url = "No image available"
    img_tag = soup.find('img', src=re.compile(r'https://www\.tacchini\.it/wp-content/uploads/'))
    if img_tag and "src" in img_tag.attrs:
        image_url = img_tag["src"]

    # Dimensions: one raw text entry per 'testo-tecnico' div (find_all
    # returns an empty result when there are none).
    dimensions = [dim.get_text(strip=True) for dim in soup.find_all('div', class_='testo-tecnico')]

    # Materials: 'text-a' divs inside the first 'left-column-a' div.
    materials_div = soup.find('div', class_='left-column-a')
    if materials_div:
        materials = [mat.get_text(strip=True) for mat in materials_div.find_all('div', class_='text-a')]
    else:
        materials = []

    product.update({
        "description": description,
        "image": image_url,
        "dimensions": dimensions,
        "materials": materials
    })
    return product


In [6]:
def save_to_json(data, filename="products.json"):
    """Write ``data`` to ``filename`` as indented UTF-8 JSON and print a notice.

    Non-ASCII characters are written as-is rather than escaped, and nested
    levels are indented by four spaces.
    """
    dump_options = {"indent": 4, "ensure_ascii": False}
    with open(filename, mode='w', encoding='utf-8') as out_handle:
        json.dump(data, out_handle, **dump_options)
    print(f"Scraping complete. Data saved to {filename}")

In [7]:
def main():
    """Scrape the collection page, then every product page, and save the results.

    Stops silently when the collection page cannot be fetched. Products
    whose page cannot be scraped are left out; the rest receive an "id"
    equal to their 1-based position in the collection listing.
    """
    collection_soup = fetch_soup(COLLECTION_URL)
    if not collection_soup:
        return

    listing = extract_products(collection_soup)
    scraped = []

    for position in range(1, len(listing) + 1):
        entry = listing[position - 1]
        print(f"Scraping product {position}/{len(listing)}: {entry['name']}")  # Progress log
        details = extract_product_details(entry)
        if not details:
            continue
        details["id"] = position
        scraped.append(details)

    save_to_json(scraped)

In [8]:
# Run the full scrape; results go to save_to_json's default file, products.json.
main()

Scraping product 1/146: Solar
Scraping product 2/146: Additional System
Scraping product 3/146: Le Mura
Scraping product 4/146: Victoria
Scraping product 5/146: Orsola
Scraping product 6/146: Julep
Scraping product 7/146: Julep Chaise-longue
Scraping product 8/146: Julep Island
Scraping product 9/146: Roma Nuvola
Scraping product 10/146: Roma


KeyboardInterrupt: 

In [63]:
import json
import re

def _parse_dimension(dim):
    """Convert one dimension entry into a structured dict, or None if unparseable."""
    if isinstance(dim, dict):
        # Already normalized (e.g. the function is re-run on its own
        # output): keep it instead of failing inside re.findall.
        return dim

    # NOTE(review): \d+ treats "72.5" as two numbers and also picks up
    # digits inside the leading code word - confirm the source text only
    # contains whole numbers after a digit-free code.
    dim_values = re.findall(r'\d+', dim)  # Extract numeric values
    if len(dim_values) < 3:
        return None

    entry = {
        "code": dim.split()[0],  # Assuming first word is the code
        "width": int(dim_values[0]),
        "depth": int(dim_values[1]),
        "height": int(dim_values[2])
    }
    if len(dim_values) > 3:
        entry["seat_height"] = int(dim_values[3])
    return entry


def normalize_json(file_path, output_path):
    """Normalize the scraped product JSON and write it to a new file.

    For every product:
      * each entry of a list-valued "dimensions" becomes a dict with
        "code", "width", "depth", "height" and, when a fourth number is
        present, "seat_height"; entries with fewer than three numbers
        are dropped, and entries that are already dicts are kept as-is;
      * a string-valued "materials" is split on ';' into a list of
        stripped strings.

    Args:
        file_path: Path of the JSON file to read (a list of product dicts).
        output_path: Path the normalized JSON is written to.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for product in data:
        # Normalize dimensions
        if 'dimensions' in product and isinstance(product['dimensions'], list):
            parsed = (_parse_dimension(dim) for dim in product['dimensions'])
            product['dimensions'] = [entry for entry in parsed if entry is not None]

        # Normalize materials
        if 'materials' in product and isinstance(product['materials'], str):
            product['materials'] = [m.strip() for m in product['materials'].split(';')]

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

    print(f"Normalized JSON saved to {output_path}")

# Example usage: read products.json and write the normalized copy to normalized_product.json
normalize_json('products.json', 'normalized_product.json')


Normalized JSON saved to normalized_product.json


In [64]:
import json
import pandas as pd

def json_to_csv(json_file, csv_file):
    """Flatten the normalized product JSON into a CSV file.

    One row is written per dimension set, repeating the product-level
    columns on each row. A product with no dimension sets still gets a
    single row with blank dimension columns, so it is not lost.

    Args:
        json_file: Path of the JSON file to read (a list of product dicts
            whose "dimensions" entries are dicts).
        csv_file: Path of the CSV file to write.
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Fixed column order, so the header is written even for empty input.
    columns = [
        "ID", "Name", "Designer", "Link", "Description", "Image", "Materials",
        "Code", "Width", "Depth", "Height", "Seat Height",
    ]
    rows = []

    for product in data:
        materials = product.get("materials") or []
        if isinstance(materials, str):
            # Joining a bare string would interleave "; " between its characters.
            materials = [materials]

        product_base = {
            "ID": product.get("id"),
            "Name": product.get("name"),
            "Designer": product.get("designer"),
            "Link": product.get("link"),
            "Description": product.get("description"),
            "Image": product.get("image"),
            "Materials": "; ".join(str(m) for m in materials),  # Join materials into a single string
        }

        # An empty placeholder keeps products without dimensions in the output.
        for dimension in product.get("dimensions") or [{}]:
            row = product_base.copy()
            row.update({
                "Code": dimension.get("code"),
                "Width": dimension.get("width"),
                "Depth": dimension.get("depth"),
                "Height": dimension.get("height"),
                "Seat Height": dimension.get("seat_height"),
            })
            rows.append(row)

    # dtype=object keeps integers as integers: with inferred dtypes a column
    # that has any missing value becomes float and is written as e.g. "40.0".
    df = pd.DataFrame(rows, columns=columns, dtype=object)

    # Save to CSV
    df.to_csv(csv_file, index=False, encoding='utf-8')

# Example usage: flatten normalized_product.json into product.csv (one row per dimension set)
json_to_csv("normalized_product.json", "product.csv")
