In [79]:
from urllib.parse import urlparse
import re

In [80]:
# Parse Pinterest board URL to extract board name
def parse_board_name(url: str) -> str:
    parsed_url = urlparse(url)
    path_match = re.search(r"\.com/([^/]+)/([^/]+)/", parsed_url.geturl())
    if path_match:
        return f"{path_match.group(1)}/{path_match.group(2)}"
    else:
        raise ValueError("Invalid Pinterest board URL format")

In [81]:
parse_board_name("https://es.pinterest.com/sandramedinadom/bedroom-inspiration/?invite_code=3891d52d47ce49d49e64512fc53ed142&sender=424464471038145989")

'sandramedinadom/bedroom-inspiration'

In [1]:
board_name = 'sandramedinadom/city-clothes'
kwargs = {'arg_path': board_name, 'arg_dir': 'images', 'arg_thread_max': 0, 
 'arg_cut': -1, 'arg_board_timestamp': False, 'arg_log_timestamp': False, 
 'arg_force': False, 'arg_exclude_section': False, 'arg_rescrape': False, 
 'arg_img_only': False, 'arg_v_only': False, 'arg_update_all': False, 
 'arg_https_proxy': None, 'arg_http_proxy': None, 'arg_cookies': None}


import importlib.util
import sys

# Define the path to the pinterest-downloader.py file
file_path = './pinterest-downloader/pinterest-downloader.py'

# Load the module
spec = importlib.util.spec_from_file_location("pinterest_downloader_module", file_path)
pinterest_downloader_module = importlib.util.module_from_spec(spec)
sys.modules["pinterest_downloader_module"] = pinterest_downloader_module
spec.loader.exec_module(pinterest_downloader_module)

# Now you can access the function run_library_main
image_urls = pinterest_downloader_module.run_library_main(**kwargs)

[i] User Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36
[i] Job is download single board by username/boardname: sandramedinadom/city-clothes
[...] Getting all boards [ 99 / ? ][➕] Found 115 Boards.
[...] Getting all images in this board: city-clothes ... [ 307 / ? ] [➕] Found 316 image/videos
Download into directory:  images/sandramedinadom/city-clothes/


In [2]:
image_urls[:5]

['https://i.pinimg.com/originals/a4/dd/16/a4dd16826c273e02b5ab40c6d8391821.jpg',
 'https://i.pinimg.com/originals/5f/2e/5d/5f2e5d63d9ddddff37e0627ed6b8c5d6.jpg',
 'https://i.pinimg.com/originals/5e/18/ff/5e18ff52c3fb173b1fbab5c1ca2b97fb.jpg',
 'https://i.pinimg.com/originals/74/1d/85/741d85c13dcae6d8974d3f0af239b9fc.jpg',
 'https://i.pinimg.com/originals/9b/28/83/9b2883f7ed65ff7316c9b9423005494d.jpg']

In [3]:
mood_board_image_urls = image_urls

In [4]:
import json
from openai import OpenAI
client = OpenAI(api_key=json.load(open('secrets.json', 'r'))['OPENAI_KEY'])

In [5]:
from pydantic import BaseModel

class Step(BaseModel):
    explanation: str
    output: str

class SearchQueryGeneration(BaseModel):
    steps: list[Step]
    final_answer: list[str]

completion = client.beta.chat.completions.parse(
    model="gpt-4o-2024-08-06",
    messages=[
        {
            "role": "system",
            "content": (
                "You are a shopping assistant. Based on the mood board images provided, "
                "generate a list of search queries to help the user find similar products. "
                "Explain your reasoning step by step."
            ),
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Based on these images, what search queries should I use to find similar products?"},
            ] + [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": i,
                    },
                } for i in mood_board_image_urls[:5]
            ],
        },
    ],
    response_format=SearchQueryGeneration,
)

search_query_results = completion.choices[0].message.parsed


In [6]:
search_query_results.dict()

{'steps': [{'explanation': 'The first image features a woman wearing an oversized gray sweater, a black skirt, black over-the-knee boots, and a dark scarf. This style is casual and stylish, suitable for fall/winter. Key items include the chunky knit sweater and over-the-knee boots.',
   'output': '"oversized gray sweater women", "chunky knit sweater fall outfit", "black over-the-knee boots women"'},
  {'explanation': 'The second image shows a woman in a black outfit with gold accents, including a leopard-print scarf, studded shorts, and black over-the-knee boots. The focus is on bold accessories and sleek styling.',
   'output': '"black over-the-knee boots women", "leopard print scarf", "studded shorts women", "black gold accent outfit"'},
  {'explanation': 'The third image has a woman in a white blazer, black shorts, and black over-the-knee boots, which is a more polished look. The white blouse with a bow tie adds elegance.',
   'output': '"white blazer women", "black high-waist short

In [7]:
queries = search_query_results.final_answer

In [8]:
query = queries[0]

In [9]:
query

'oversized gray sweater women'

In [14]:
query = "red shirt retro vibe"

In [24]:
BAD_KEYWORDS = [
    'support.google',
    'accounts.google',
    'maps.google'
]

In [25]:
import re

import mechanicalsoup


# Connect to Google
browser = mechanicalsoup.StatefulBrowser()
browser.open("https://www.google.com/")

# Fill-in the form
browser.select_form('form[action="/search"]')
browser["q"] = f'{query} "price" "add to cart"' 
# Note: the button name is btnK in the content served to actual
# browsers, but btnG for bots.
browser.submit_selected(btnName="btnG")
target_urls = []
# Display links
for link in browser.links():
    target = link.attrs['href']
    # Filter-out unrelated links and extract actual URL from Google's
    # click-tracking.
    if (target.startswith('/url?') and not
            target.startswith("/url?q=http://webcache.googleusercontent.com")):
        target = re.sub(r"^/url\?q=([^&]*)&.*", r"\1", target)
        print(target)
        if target.startswith('https') and not any(i in target for i in BAD_KEYWORDS):
            target_urls.append(target)

/search%3Fq%3Dred%2Bshirt%2Bretro%2Bvibe%2B%2522price%2522%2B%2522add%2Bto%2Bcart%2522%26sca_esv%3D7c704608065062ea%26hl%3Den-CA%26gbv%3D1%26ie%3DUTF-8%26tbm%3Dshop%26source%3Dlnms%26ved%3D1t:200713%26ictx%3D111
https://maps.google.com/maps%3Fhl%3Den-CA%26q%3Dred%2Bshirt%2Bretro%2Bvibe%2B%2522price%2522%2B%2522add%2Bto%2Bcart%2522%26iflsig%3DAL9hbdgAAAAAZwCmjCqN3GL4fAnKBbWJnrHTVrbdWZ1f%26gbv%3D1%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111
https://www.etsy.com/ca/listing/1026940735/retro-shirt-good-vibes-shirt-peace-shirt
https://www.differentstreamsrecords.com/product-page/vibe-up-unisex-jersey-short-sleeve-tee-4
https://www.hoodieisland.com/products/slowdive-vintage-vibe-red-logo-t-shirt
https://www.etsy.com/dk-en/listing/1036574717/the-bad-batch-shirt-retro-sixties-vibe%3Fref%3Dap-listing
https://www.amazon.com/Rainbow-Yellow-Retro-Premium-T-Shirt/dp/B07SXDRDSH
https://aspen-company.com/collections/retro-vibes%3Fsrsltid%3DAfmBOoqDmrZKpEQNIDY3_bdZJj0HgI9UxGHnmGmaJqBtZAVlygIkZ

In [26]:
target_urls

['https://www.etsy.com/ca/listing/1026940735/retro-shirt-good-vibes-shirt-peace-shirt',
 'https://www.differentstreamsrecords.com/product-page/vibe-up-unisex-jersey-short-sleeve-tee-4',
 'https://www.hoodieisland.com/products/slowdive-vintage-vibe-red-logo-t-shirt',
 'https://www.etsy.com/dk-en/listing/1036574717/the-bad-batch-shirt-retro-sixties-vibe%3Fref%3Dap-listing',
 'https://www.amazon.com/Rainbow-Yellow-Retro-Premium-T-Shirt/dp/B07SXDRDSH',
 'https://aspen-company.com/collections/retro-vibes%3Fsrsltid%3DAfmBOoqDmrZKpEQNIDY3_bdZJj0HgI9UxGHnmGmaJqBtZAVlygIkZDOW',
 'https://horsevibes.ca/en/products/chandail-a-manche-courte-rouge-adulte',
 'https://www.amazon.com/VINTAGE-LOBSTER-UNIQUE-MARINE-T-Shirt/dp/B09ZRPHBDC',
 'https://vibecustomshirts.com/collections/sale%3Fsrsltid%3DAfmBOop7U3oQvNhD4vI9Bk5k5U7MDg_pRnIyfEpVFRI4u7eu9wM7L_gW',
 'https://www.ebay.ca/itm/123828896185%3Fhash%3Ditem1cd4c701b9:g:l7UAAOSwDm1dI2NL%26var%3D424523403196']

In [56]:
from bs4 import BeautifulSoup
import requests

def url_to_html(url):
    # Fetch the HTML content from the URL
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to fetch URL: {response.status_code}")
        return None

def clean_html(html_content):
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove unuseful parts of the HTML
    for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
        tag.decompose()  # Completely remove the tag and its contents

    # Optionally, remove specific classes or IDs
    unuseful_classes = ['ad-banner', 'popup', 'newsletter-signup']
    unuseful_ids = ['sidebar', 'comments', 'related-articles']

    for cls in unuseful_classes:
        for tag in soup.find_all(class_=cls):
            tag.decompose()

    for id_ in unuseful_ids:
        for tag in soup.find_all(id=id_):
            tag.decompose()

    # Return the cleaned HTML
    return soup.prettify()

In [57]:
url = target_urls[1]

In [58]:
html = url_to_html(url)

In [59]:
len(html)

1170398

In [62]:
simple_html = clean_html(html)

In [63]:
len(simple_html)

37172

In [66]:
# print(simple_html)

In [69]:
client

<openai.OpenAI at 0x7fb490545630>

In [70]:
from pydantic import BaseModel, Field
from openai import OpenAI
from typing import Union
from enum import Enum

# # Create an instance of the OpenAI client to interact with the API
# client = OpenAI()

# Enum to represent extraction status
class ExtractionStatus(str, Enum):
    SUCCESS = "success"
    FAIL = "fail"

# Define the simplified Pydantic model for structured data extraction
# This model defines the fields that we want to extract from the HTML input
class ProductExtraction(BaseModel):
    product_name: str  # Product name
    product_description: str  # Product description
    material: str = None  # Material details (optional, may not apply to all products)
    dimensions_size: str = None  # Product dimensions or size (e.g., size, length, height, weight)
    price: float  # Product price
    currency: str  # Currency of the price (e.g., USD)
    availability: str  # Availability status (e.g., InStock)
    image_url: str  # URL of the product image
    category: str  # Product category (e.g., fashion, home decor)
    color: str = None  # Color option of the product (optional)

# Wrapper class to handle extraction result, including status and failure reasons
class HtmlExtraction(BaseModel):
    status: ExtractionStatus  # Status of the extraction
    product: Union[ProductExtraction, None] = None  # Product data if extraction is successful
    fail_reason: str = None  # Reason for failure, if applicable

# System prompt for structured data extraction
# This prompt instructs the model on how to process the provided HTML text
system_prompt = (
    "You are an expert in extracting structured e-commerce data. "
    "You will be provided with HTML text from an e-commerce website. "
    "Your task is to parse the HTML and extract details such as product name, price, image, and other specifications "
    "into a predefined structure."
)

# Define the completion call with the structured output response format
# This makes a request to the OpenAI API to parse the HTML and extract data using the defined schema
completion = client.beta.chat.completions.parse(
    model="gpt-4o-2024-08-06",  # Specify the model to use
    messages=[
        {"role": "system", "content": system_prompt},  # System-level prompt for guidance
        {"role": "user", "content": simple_html}  # User-provided HTML content for parsing
    ],
    response_format=HtmlExtraction,  # Specify the response format to match the HtmlExtraction model
)

# Accessing the extracted product details from the completion response
extraction_result = completion.choices[0].message.parsed

# Check the extraction status and print appropriate information
if extraction_result.status == ExtractionStatus.SUCCESS:
    print(extraction_result.product)
else:
    print(f"Extraction failed: {extraction_result.fail_reason}")

product_name='Vibe Up Red Tee' product_description="This classic unisex jersey short sleeve tee fits like a well-loved favorite. Soft cotton and quality print make users fall in love with it over and over again. These t-shirts have-ribbed knit collars to bolster shaping. The shoulders are tapered for a better fit over time. Dual side seams hold the garment's shape for longer. .: 100% Airlume combed and ringspun cotton (fiber content may vary for different colors).: Light fabric (4.2 oz/yd² (142 g/m²)).: Retail fit.: Tear away label.: Runs true to size" material='100% Airlume combed and ringspun cotton' dimensions_size='Retail fit' price=20.0 currency='USD' availability='InStock' image_url='https://static.wixstatic.com/media/afcd16_5d7cc0e0f2cc41328cace19e24b204b7~mv2.jpg/v1/fit/w_500,h_500,q_90/file.jpg' category='Clothing' color='Red'


In [73]:
url

'https://www.differentstreamsrecords.com/product-page/vibe-up-unisex-jersey-short-sleeve-tee-4'

In [72]:
extraction_result.dict()

{'status': <ExtractionStatus.SUCCESS: 'success'>,
 'product': {'product_name': 'Vibe Up Red Tee',
  'product_description': "This classic unisex jersey short sleeve tee fits like a well-loved favorite. Soft cotton and quality print make users fall in love with it over and over again. These t-shirts have-ribbed knit collars to bolster shaping. The shoulders are tapered for a better fit over time. Dual side seams hold the garment's shape for longer. .: 100% Airlume combed and ringspun cotton (fiber content may vary for different colors).: Light fabric (4.2 oz/yd² (142 g/m²)).: Retail fit.: Tear away label.: Runs true to size",
  'material': '100% Airlume combed and ringspun cotton',
  'dimensions_size': 'Retail fit',
  'price': 20.0,
  'currency': 'USD',
  'availability': 'InStock',
  'image_url': 'https://static.wixstatic.com/media/afcd16_5d7cc0e0f2cc41328cace19e24b204b7~mv2.jpg/v1/fit/w_500,h_500,q_90/file.jpg',
  'category': 'Clothing',
  'color': 'Red'},
 'fail_reason': ''}