<a href="https://colab.research.google.com/github/Muhtasham/COSINE/blob/main/expand_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install instructor openai -q

In [8]:
import requests
from bs4 import BeautifulSoup

# Fetch the collection page and parse it into a BeautifulSoup tree.
url = "https://en.realonda.com/collection/dakhla/"
# requests applies no timeout by default; without one a stalled server
# would block this cell indefinitely.
response = requests.get(url, timeout=30)
# Raise on 4xx/5xx so an error page is never parsed as if it were the product page.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [12]:
# Alias rich's print so it can sit alongside the builtin print.
from rich import print as rprint
# Print the parsed page through rich to inspect the fetched HTML.
rprint(soup)

In [10]:
from pydantic import BaseModel, Field
from typing import List, Optional, Any

# Identity and contact details of a floor tile supplier.
# `name` and `url` are required; every other field defaults to None.
class Supplier(BaseModel):
    # --- required ---
    name: str = Field(description="The name of the supplier providing the floor tile product")
    url: str = Field(description="The website URL of the floor tile supplier")

    # --- optional ---
    logo: Optional[str] = Field(default=None, description="The logo URL of the floor tile supplier")
    address: Optional[str] = Field(default=None, description="The physical address of the floor tile supplier")
    email: Optional[str] = Field(default=None, description="The email address of the floor tile supplier")
    phone: Optional[str] = Field(default=None, description="The phone number of the floor tile supplier")
    description: Optional[str] = Field(default=None, description="A description of the floor tile supplier")

# A single floor tile product together with its packaging data and media links.
# `name`, `url` and `supplier_name` are required; `category` defaults to "tiles";
# every other field defaults to None.
class Product(BaseModel):
    # --- identity ---
    category: str = Field(default="tiles", description="The category of the product")
    name: str = Field(description="The name of the floor tile product")
    url: str = Field(description="The URL of the product page")
    supplier_name: str = Field(description="The name of the supplier providing the floor tile product")
    product_collection_name: Optional[str] = Field(default=None, description="The name of the collection to which the product belongs")
    description: Optional[str] = Field(default=None, description="A description of the floor tile product")
    price: Optional[float] = Field(default=None, description="The price of the floor tile product")

    # --- per-box figures and dimensions (list-valued) ---
    kg_per_box: Optional[List[float]] = Field(default=None, description="The weight in kilograms per box for different sizes")
    m2_per_box: Optional[List[float]] = Field(default=None, description="The square meters per box for different sizes")
    dimensions_cm: Optional[List[str]] = Field(default=None, description="The dimensions of the product in centimeters")
    dimensions_inch: Optional[List[str]] = Field(default=None, description="The dimensions of the product in inches")
    pieces_per_box: Optional[List[int]] = Field(default=None, description="The number of pieces per box for different sizes")

    # --- pallet / europallet figures (list-valued) ---
    kg_per_pallet: Optional[List[float]] = Field(default=None, description="The weight in kilograms per pallet")
    kg_per_europallet: Optional[List[float]] = Field(default=None, description="The weight in kilograms per europallet")
    m2_per_pallet: Optional[List[float]] = Field(default=None, description="The square meters per pallet")
    m2_per_europallet: Optional[List[float]] = Field(default=None, description="The square meters per europallet")
    boxes_per_pallet: Optional[List[int]] = Field(default=None, description="The number of boxes per pallet")
    boxes_per_europallet: Optional[List[int]] = Field(default=None, description="The number of boxes per europallet")

    # --- media and free-form extras ---
    main_image: Optional[str] = Field(default=None, description="The primary image URL showcasing the floor tile product")
    images: Optional[List[str]] = Field(default=None, description="An array of additional image URLs showcasing the floor tile product")
    pdfs: Optional[List[str]] = Field(default=None, description="An array of URLs to PDF files with more details about the floor tile product")
    additional_info: Optional[Any] = Field(default=None, description="Additional information about the product, including specifications")

# Top-level container passed as `response_model`: one list of suppliers and
# one list of products.
# NOTE(review): the field names are identical to the model class names, and
# they are the attribute names shown in the printed result
# (`Supplier=[...] Product=[...]`). Renaming them would change that output
# shape, so they are left as-is.
class Schema(BaseModel):
    Supplier: List[Supplier]  # required list of Supplier models
    Product: List[Product]  # required list of Product models

# Use instructor to extract the data from the fetched page into `Schema`.
import os

import instructor
from openai import OpenAI

# Read the OpenAI API key from the environment instead of hard-coding it:
# a literal key in a notebook gets committed and shared along with the file.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment "
        "(e.g. os.environ['OPENAI_API_KEY'] = '...') before running this cell."
    )

# Patch the OpenAI client so that chat completions accept `response_model`
client = instructor.from_openai(OpenAI(api_key=openai_api_key))

# Define messages to extract supplier and product data.
# The user message embeds the string form of the parsed page (`soup`).
messages = [
    {
        "role": "system",
        "content": "You are a world-class AI that excels at extracting structured data from a website according to a specified schema."
    },
    {
        "role": "user",
        "content": f"Please extract supplier and product data from the following HTML content: {soup}"
    }
]

# Extract data using the LLM.
# NOTE: this rebinds `response`, which held the HTTP response from the fetch
# step; from here on `response` is the extracted `Schema` instance.
response = client.chat.completions.create(
    model="gpt-4o",
    response_model=Schema,
    messages=messages
)

# Output the extracted data
print(response)


Supplier=[Supplier(name='Realonda', url='https://en.realonda.com', logo='https://en.realonda.com/wp-content/themes/realonda/images/logo-alt.png', address=None, email=None, phone=None, description='Realonda is a Spanish manufacturer of ceramic and porcelain tiles, offering a variety of collections including the Dakhla collection.')] Product=[Product(category='tiles', name='DAKHLA TERRACOTTA', url='https://en.realonda.com/collection/dakhla/', supplier_name='Realonda', product_collection_name='Dakhla', description='DAKHLA TERRACOTTA is part of the Dakhla collection, featuring a matte finish and available in the size 31 x 56 cm · 12” x 22”.', price=None, kg_per_box=[25.0], m2_per_box=[1.21], dimensions_cm=['31 x 56 cm'], dimensions_inch=['12” x 22”'], pieces_per_box=[7], kg_per_pallet=[1210.0], kg_per_europallet=[915.0], m2_per_pallet=[58.08], m2_per_europallet=[43.56], boxes_per_pallet=[48], boxes_per_europallet=[36], main_image='https://en.realonda.com/wp-content/uploads/sites/2/fly-imag

In [11]:
# Print the extracted result again through rich's print (imported as `rprint`).
rprint(response)