# HTML extraction

In [None]:
from unstructured.partition.auto import partition

# Keep the raw source lines so extracted text can be traced back to a line number
with open("ht-eb-fa-1.html", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Split the same file into elements with unstructured's auto partitioner
elements = partition(filename="ht-eb-fa-1.html", include_metadata=True)

# (line number, text) pairs, one per element whose text is found on a single line
line_text_map = []

for element in elements:
    text = element.text.strip()
    if not text:
        # Nothing to look for in an empty element
        continue
    # 1-based number of the first source line containing the element text;
    # elements whose text is not found within one line are left out
    first_hit = next(
        (number for number, raw in enumerate(lines, start=1) if text in raw),
        None,
    )
    if first_hit is not None:
        line_text_map.append((first_hit, text))

# Report in ascending line order
line_text_map.sort()
for line_num, text in line_text_map:
    print(f"Line {line_num}: {text}")


# Zero-shot Prompt

### 1. Llama

In [None]:
from openai import OpenAI
import pandas as pd
import time

# API client; base_url and api_key are blank here and have to be filled in before running
client = OpenAI(
    base_url="",
    api_key="",
)

# Input table; the main loop below opens each 'html' value as a file path
html_data = pd.read_excel("html_data.xlsx")
html_inputs = html_data["html"].to_list()

# Attribute names the model is asked to extract, keyed by product category
category_attributes = {
    "fashion": [
        "Object", "Brand", "Color", "Size",
        "Material", "Department", "Style", "Price",
    ],
    "electronics": [
        "Object", "Brand", "Color", "Size",
        "Material", "Model", "Power Mode", "Price",
    ],
    "beauty": [
        "Object", "Brand", "Volume", "Material",
        "Skin/Hair Type", "Benefits", "Price",
    ],
}


# Helper function to process each category
def process_category(category, html_content):
    """Ask the Llama model to extract one category's attributes from an HTML page.

    The attribute names come from ``category_attributes[category]``. A system
    message and a single user message containing ``html_content`` are sent
    through ``client``.

    Returns the text of the first choice, or a fixed message string when the
    category has no attributes, when the response exposes an ``error``
    attribute, or when no content came back.
    """
    attribute_names = category_attributes.get(category, [])
    if not attribute_names:
        return "No valid attributes for this category."

    system_message = {
        "role": "system",
        "content": "You are a world-class algorithm for extracting provided product attributes from html in structured formats, strictly exclude any unrelated information.",
    }
    user_message = {
        "role": "user",
        "content": f"Extract the product attribute values from the html in a JSON format. Valid attributes are {', '.join(attribute_names)}. If an attribute is not present in the html, the attribute value is supposed to be ‘n/a’:\n{html_content}\n\nResponse:",
    }

    response = client.chat.completions.create(
        model="meta-llama/llama-3.2-11b-vision-instruct:free",
        messages=[system_message, user_message],
    )

    # Only the first choice is ever used
    choices = response.choices
    if choices and choices[0].message and choices[0].message.content:
        return choices[0].message.content
    if hasattr(response, "error"):
        return f"API Error: {response.error}"
    return "No content returned from the API."

# Main loop for processing the HTML inputs
for i, row in html_data.iterrows():
    # Rows with an unknown category never reach the API: record that and move
    # on without reading the file or pausing.
    if row["category"] not in category_attributes:
        html_data.at[i, 'llama_zeroshot'] = "No valid category."
        continue

    try:
        with open(row["html"], "r", encoding="utf-8") as file:
            html_content = file.read()
    except FileNotFoundError:
        # Record the missing page instead of aborting the whole run, matching
        # how the few-shot loops in this notebook treat missing files.
        message = f"HTML file not found: {row['html']}"
        html_data.at[i, 'llama_zeroshot'] = message
        print(message)
        continue

    result = process_category(row["category"], html_content)
    html_data.at[i, 'llama_zeroshot'] = result
    print(result)

    # Pause after each API request (presumably to stay under a rate limit)
    time.sleep(20)

print(html_data["llama_zeroshot"])

### 2. Mistral

In [None]:
from openai import OpenAI
import pandas as pd
import time

# API client; base_url and api_key are blank here and have to be filled in before running
client = OpenAI(
    base_url="",
    api_key="",
)

# Input table; the main loop below opens each 'html' value as a file path
html_data = pd.read_excel("html_data.xlsx")
html_inputs = html_data["html"].to_list()

# Attribute names the model is asked to extract, keyed by product category
category_attributes = {
    "fashion": [
        "Object", "Brand", "Color", "Size",
        "Material", "Department", "Style", "Price",
    ],
    "electronics": [
        "Object", "Brand", "Color", "Size",
        "Material", "Model", "Power Mode", "Price",
    ],
    "beauty": [
        "Object", "Brand", "Volume", "Material",
        "Skin/Hair Type", "Benefits", "Price",
    ],
}

# Helper function to process each category
def process_category(category, html_content):
    """Ask the Mistral model to extract one category's attributes from an HTML page.

    The attribute names come from ``category_attributes[category]``. A system
    message and a single user message containing ``html_content`` are sent
    through ``client``.

    Returns the text of the first choice, or a fixed message string when the
    category has no attributes, when the response exposes an ``error``
    attribute, or when no content came back.
    """
    attribute_names = category_attributes.get(category, [])
    if not attribute_names:
        return "No valid attributes for this category."

    system_message = {
        "role": "system",
        "content": "You are a world-class algorithm for extracting provided product attributes from html in structured formats, strictly exclude any unrelated information.",
    }
    user_message = {
        "role": "user",
        "content": f"Extract the product attribute values from the html in a JSON format. Valid attributes are {', '.join(attribute_names)}. If an attribute is not present in the html, the attribute value is supposed to be ‘n/a’:\n{html_content}\n\nResponse:",
    }

    response = client.chat.completions.create(
        model="mistralai/mistral-small-3.1-24b-instruct:free",
        messages=[system_message, user_message],
    )

    # Only the first choice is ever used
    choices = response.choices
    if choices and choices[0].message and choices[0].message.content:
        return choices[0].message.content
    if hasattr(response, "error"):
        return f"API Error: {response.error}"
    return "No content returned from the API."

# Main loop for processing the HTML inputs
for i, row in html_data.iterrows():
    # Rows with an unknown category never reach the API: record that and move
    # on without reading the file or pausing.
    if row["category"] not in category_attributes:
        html_data.at[i, 'mistral_zeroshot'] = "No valid category."
        continue

    try:
        with open(row["html"], "r", encoding="utf-8") as file:
            html_content = file.read()
    except FileNotFoundError:
        # Record the missing page instead of aborting the whole run, matching
        # how the few-shot loops in this notebook treat missing files.
        message = f"HTML file not found: {row['html']}"
        html_data.at[i, 'mistral_zeroshot'] = message
        print(message)
        continue

    result = process_category(row["category"], html_content)
    html_data.at[i, 'mistral_zeroshot'] = result
    print(result)

    # Pause after each API request (presumably to stay under a rate limit)
    time.sleep(20)

print(html_data["mistral_zeroshot"])

### 3. Qwen

In [None]:
from openai import OpenAI
import pandas as pd
import time

# API client; base_url and api_key are blank here and have to be filled in before running
client = OpenAI(
    base_url="",
    api_key="",
)

# Input table; the main loop below opens each 'html' value as a file path
html_data = pd.read_excel("html_data.xlsx")
html_inputs = html_data["html"].to_list()

# Attribute names the model is asked to extract, keyed by product category
category_attributes = {
    "fashion": [
        "Object", "Brand", "Color", "Size",
        "Material", "Department", "Style", "Price",
    ],
    "electronics": [
        "Object", "Brand", "Color", "Size",
        "Material", "Model", "Power Mode", "Price",
    ],
    "beauty": [
        "Object", "Brand", "Volume", "Material",
        "Skin/Hair Type", "Benefits", "Price",
    ],
}

# Helper function to process each category
def process_category(category, html_content):
    """Ask the Qwen model to extract one category's attributes from an HTML page.

    The attribute names come from ``category_attributes[category]``. A system
    message and a single user message containing ``html_content`` are sent
    through ``client``.

    Returns the text of the first choice, or a fixed message string when the
    category has no attributes, when the response exposes an ``error``
    attribute, or when no content came back.
    """
    attribute_names = category_attributes.get(category, [])
    if not attribute_names:
        return "No valid attributes for this category."

    system_message = {
        "role": "system",
        "content": "You are a world-class algorithm for extracting provided product attributes from html in structured formats, strictly exclude any unrelated information.",
    }
    user_message = {
        "role": "user",
        "content": f"Extract the product attribute values from the html in a JSON format. Valid attributes are {', '.join(attribute_names)}. If an attribute is not present in the html, the attribute value is supposed to be ‘n/a’:\n{html_content}\n\nResponse:",
    }

    response = client.chat.completions.create(
        model="qwen/qwen2.5-vl-72b-instruct:free",
        messages=[system_message, user_message],
    )

    # Only the first choice is ever used
    choices = response.choices
    if choices and choices[0].message and choices[0].message.content:
        return choices[0].message.content
    if hasattr(response, "error"):
        return f"API Error: {response.error}"
    return "No content returned from the API."

# Main loop for processing the HTML inputs
for i, row in html_data.iterrows():
    # Rows with an unknown category never reach the API: record that and move
    # on without reading the file or pausing.
    if row["category"] not in category_attributes:
        html_data.at[i, 'qwen_zeroshot'] = "No valid category."
        continue

    try:
        with open(row["html"], "r", encoding="utf-8") as file:
            html_content = file.read()
    except FileNotFoundError:
        # Record the missing page instead of aborting the whole run, matching
        # how the few-shot loops in this notebook treat missing files.
        message = f"HTML file not found: {row['html']}"
        html_data.at[i, 'qwen_zeroshot'] = message
        print(message)
        continue

    result = process_category(row["category"], html_content)
    html_data.at[i, 'qwen_zeroshot'] = result
    print(result)

    # Pause after each API request (presumably to stay under a rate limit)
    time.sleep(20)

print(html_data["qwen_zeroshot"])

In [None]:
# Write the dataframe, including any result column added above, back to the
# workbook it was loaded from (this overwrites html_data.xlsx).
# NOTE(review): every zero-shot cell reloads html_data from this file, so a
# model's result column only survives if this cell is run before the next
# model's cell - confirm that is the intended workflow.
html_data.to_excel("html_data.xlsx", index = False)

# Few-shot Prompt

## Few-shot by platform & category

### 1. Llama

In [None]:
import pandas as pd
from openai import OpenAI
import json

# Initialize OpenAI client (base_url and api_key are blank here and must be filled in)
client = OpenAI(base_url="", api_key="")

# Load the HTML data from the Excel file
html_data = pd.read_excel("html_data.xlsx")

# Filter the test data from the dataset.
# Take an explicit copy: the main loop writes a result column into test_data,
# and writing into a filtered selection of html_data raises pandas'
# SettingWithCopyWarning and leaves it unclear which object is modified.
test_data = html_data[html_data['set'] == 'test'].copy()

# Define category-specific attribute sets
category_attributes = {
    "fashion": ["Object", "Brand", "Color", "Size", "Material", "Department", "Style", "Price"],
    "electronics": ["Object", "Brand", "Color", "Size", "Material", "Model", "Power Mode", "Price"],
    "beauty": ["Object", "Brand", "Volume", "Material", "Skin/Hair Type", "Benefits", "Price"]
}

# Platforms for which few-shot prompting is attempted
platforms = ["amazon", "ebay", "temu"]

# Helper function to generate few-shot examples based on the "train" set data
def generate_few_shot_examples(category, platform, data):
    """Build few-shot chat messages from the training rows of one category/platform.

    Args:
        category: Value matched against ``data['category']``; also the key
            used to look up attribute names in ``category_attributes``.
        platform: Value matched against ``data['platform']``.
        data: DataFrame with ``category``, ``platform``, ``set``, ``html``
            (path to an HTML file) and ``reference_output`` columns.

    Returns:
        A flat list of message dicts alternating ``user`` / ``assistant``, one
        pair per matching training row whose HTML file could be read. The list
        is empty when nothing matches.
    """
    # The attribute list is identical for every example, so format it once
    attribute_list = ', '.join(category_attributes.get(category, []))

    # Filter data based on category and platform
    is_match = (
        (data['category'] == category)
        & (data['platform'] == platform)
        & (data['set'] == 'train')
    )

    examples = []
    for _, row in data[is_match].iterrows():
        html_file_path = row['html']  # Path to the HTML file

        try:
            with open(html_file_path, "r", encoding="utf-8") as file:
                html_content = file.read()
        except FileNotFoundError:
            # An example with empty HTML but a complete answer would show the
            # model attribute values with no supporting page, so skip the row.
            print(f"File not found: {html_file_path}")
            continue

        reference_output = row['reference_output']  # Correct answer (reference output)

        # NOTE(review): the reference output is embedded in the user turn as
        # well as returned as the assistant turn, while the real test prompt
        # has no such section - confirm this is intended.
        # NOTE(review): no system message is sent with these examples; the
        # unused system-prompt dict that used to be built here was dropped.
        user_content = f"Extract product attributes from the HTML. Valid attributes are {attribute_list}. If an attribute is not present in the HTML, the value should be 'n/a'.\n\nHTML content:\n{html_content}\n\nReference Output:\n{reference_output}"

        examples.append({"role": "user", "content": user_content})
        examples.append({"role": "assistant", "content": reference_output})

    return examples

# Helper function to process each category/platform with few-shot examples
def process_category_platform(category, platform, html_content):
    """Query the Llama model for one test page, preceded by few-shot examples.

    The examples are produced by ``generate_few_shot_examples`` from
    ``html_data`` for the given category and platform; the final user message
    carries ``html_content``.

    Returns the text of the first choice, or a fixed message string when there
    are no examples, when the response exposes an ``error`` attribute, or when
    no content came back.
    """
    few_shot_messages = generate_few_shot_examples(category, platform, html_data)
    if not few_shot_messages:
        return "No valid few-shot examples for this category/platform."

    attribute_list = ', '.join(category_attributes[category])
    query_message = {
        "role": "user",
        "content": f"Extract the product attribute values from the HTML in a JSON format. Valid attributes are {attribute_list}:\n{html_content}\n\nResponse:",
    }

    response = client.chat.completions.create(
        model="meta-llama/llama-3.2-11b-vision-instruct:free",  # Specify the model you want to use
        messages=[*few_shot_messages, query_message],
    )

    # Only the first choice is ever used
    choices = response.choices
    if choices and choices[0].message and choices[0].message.content:
        return choices[0].message.content
    if hasattr(response, "error"):
        return f"API Error: {response.error}"
    return "No content returned from the API."

# Main loop for processing the test set HTML inputs
for i, row in test_data.iterrows():
    html_file_path = row["html"]  # Get the path to the HTML file

    # Keep the try body to the file read so that only a missing test page is
    # reported as "HTML file not found".
    try:
        with open(html_file_path, "r", encoding="utf-8") as file:
            html_content = file.read()
    except FileNotFoundError:
        test_data.at[i, 'llama_fewshot'] = f"HTML file not found: {html_file_path}"
        print(f"HTML file not found: {html_file_path}")
        continue

    # Process based on category and platform
    if row["category"] in category_attributes and row["platform"] in platforms:
        result = process_category_platform(row["category"], row["platform"], html_content)
        test_data.at[i, 'llama_fewshot'] = result  # Write result to 'llama_fewshot' column
        print(result)
    else:
        test_data.at[i, 'llama_fewshot'] = "No valid category."

# Write the results back to the original dataframe.
# DataFrame.update() only fills columns that already exist in html_data, so a
# result column created by this loop would be silently dropped; assign by
# index label instead, which also creates the column when it is missing.
# The guard covers the case where there were no test rows at all.
if 'llama_fewshot' in test_data.columns:
    html_data.loc[test_data.index, 'llama_fewshot'] = test_data['llama_fewshot']


### 2. Mistral

In [None]:
import pandas as pd
from openai import OpenAI
import json

# Initialize OpenAI client (base_url and api_key are blank here and must be filled in)
client = OpenAI(base_url="", api_key="")

# Load the HTML data from the Excel file
html_data = pd.read_excel("html_data.xlsx")

# Filter the test data from the dataset.
# Take an explicit copy: the main loop writes a result column into test_data,
# and writing into a filtered selection of html_data raises pandas'
# SettingWithCopyWarning and leaves it unclear which object is modified.
test_data = html_data[html_data['set'] == 'test'].copy()

# Define category-specific attribute sets
category_attributes = {
    "fashion": ["Object", "Brand", "Color", "Size", "Material", "Department", "Style", "Price"],
    "electronics": ["Object", "Brand", "Color", "Size", "Material", "Model", "Power Mode", "Price"],
    "beauty": ["Object", "Brand", "Volume", "Material", "Skin/Hair Type", "Benefits", "Price"]
}

# Platforms for which few-shot prompting is attempted
platforms = ["amazon", "ebay", "temu"]

# Helper function to generate few-shot examples based on the "train" set data
def generate_few_shot_examples(category, platform, data):
    """Build few-shot chat messages from the training rows of one category/platform.

    Args:
        category: Value matched against ``data['category']``; also the key
            used to look up attribute names in ``category_attributes``.
        platform: Value matched against ``data['platform']``.
        data: DataFrame with ``category``, ``platform``, ``set``, ``html``
            (path to an HTML file) and ``reference_output`` columns.

    Returns:
        A flat list of message dicts alternating ``user`` / ``assistant``, one
        pair per matching training row whose HTML file could be read. The list
        is empty when nothing matches.
    """
    # The attribute list is identical for every example, so format it once
    attribute_list = ', '.join(category_attributes.get(category, []))

    # Filter data based on category and platform
    is_match = (
        (data['category'] == category)
        & (data['platform'] == platform)
        & (data['set'] == 'train')
    )

    examples = []
    for _, row in data[is_match].iterrows():
        html_file_path = row['html']  # Path to the HTML file

        try:
            with open(html_file_path, "r", encoding="utf-8") as file:
                html_content = file.read()
        except FileNotFoundError:
            # An example with empty HTML but a complete answer would show the
            # model attribute values with no supporting page, so skip the row.
            print(f"File not found: {html_file_path}")
            continue

        reference_output = row['reference_output']  # Correct answer (reference output)

        # NOTE(review): the reference output is embedded in the user turn as
        # well as returned as the assistant turn, while the real test prompt
        # has no such section - confirm this is intended.
        # NOTE(review): no system message is sent with these examples; the
        # unused system-prompt dict that used to be built here was dropped.
        user_content = f"Extract product attributes from the HTML. Valid attributes are {attribute_list}. If an attribute is not present in the HTML, the value should be 'n/a'.\n\nHTML content:\n{html_content}\n\nReference Output:\n{reference_output}"

        examples.append({"role": "user", "content": user_content})
        examples.append({"role": "assistant", "content": reference_output})

    return examples

# Helper function to process each category/platform with few-shot examples
def process_category_platform(category, platform, html_content):
    """Query the Mistral model for one test page, preceded by few-shot examples.

    The examples are produced by ``generate_few_shot_examples`` from
    ``html_data`` for the given category and platform; the final user message
    carries ``html_content``.

    Returns the text of the first choice, or a fixed message string when there
    are no examples, when the response exposes an ``error`` attribute, or when
    no content came back.
    """
    few_shot_messages = generate_few_shot_examples(category, platform, html_data)
    if not few_shot_messages:
        return "No valid few-shot examples for this category/platform."

    attribute_list = ', '.join(category_attributes[category])
    query_message = {
        "role": "user",
        "content": f"Extract the product attribute values from the HTML in a JSON format. Valid attributes are {attribute_list}:\n{html_content}\n\nResponse:",
    }

    response = client.chat.completions.create(
        model="mistralai/mistral-small-3.1-24b-instruct:free",  # Specify the model you want to use
        messages=[*few_shot_messages, query_message],
    )

    # Only the first choice is ever used
    choices = response.choices
    if choices and choices[0].message and choices[0].message.content:
        return choices[0].message.content
    if hasattr(response, "error"):
        return f"API Error: {response.error}"
    return "No content returned from the API."

# Main loop for processing the test set HTML inputs
for i, row in test_data.iterrows():
    html_file_path = row["html"]  # Get the path to the HTML file

    # Keep the try body to the file read so that only a missing test page is
    # reported as "HTML file not found".
    try:
        with open(html_file_path, "r", encoding="utf-8") as file:
            html_content = file.read()
    except FileNotFoundError:
        test_data.at[i, 'mistral_fewshot'] = f"HTML file not found: {html_file_path}"
        print(f"HTML file not found: {html_file_path}")
        continue

    # Process based on category and platform
    if row["category"] in category_attributes and row["platform"] in platforms:
        result = process_category_platform(row["category"], row["platform"], html_content)
        test_data.at[i, 'mistral_fewshot'] = result  # Write result to 'mistral_fewshot' column
        print(result)
    else:
        test_data.at[i, 'mistral_fewshot'] = "No valid category."

# Write the results back to the original dataframe.
# DataFrame.update() only fills columns that already exist in html_data, so a
# result column created by this loop would be silently dropped; assign by
# index label instead, which also creates the column when it is missing.
# The guard covers the case where there were no test rows at all.
if 'mistral_fewshot' in test_data.columns:
    html_data.loc[test_data.index, 'mistral_fewshot'] = test_data['mistral_fewshot']


### 3. Qwen

In [None]:
import pandas as pd
from openai import OpenAI
import json

# Initialize OpenAI client (base_url and api_key are blank here and must be filled in)
client = OpenAI(base_url="", api_key="")

# Load the HTML data from the Excel file
html_data = pd.read_excel("html_data.xlsx")

# Filter the test data from the dataset.
# Take an explicit copy: the main loop writes a result column into test_data,
# and writing into a filtered selection of html_data raises pandas'
# SettingWithCopyWarning and leaves it unclear which object is modified.
test_data = html_data[html_data['set'] == 'test'].copy()

# Define category-specific attribute sets
category_attributes = {
    "fashion": ["Object", "Brand", "Color", "Size", "Material", "Department", "Style", "Price"],
    "electronics": ["Object", "Brand", "Color", "Size", "Material", "Model", "Power Mode", "Price"],
    "beauty": ["Object", "Brand", "Volume", "Material", "Skin/Hair Type", "Benefits", "Price"]
}

# Platforms for which few-shot prompting is attempted
platforms = ["amazon", "ebay", "temu"]

# Helper function to generate few-shot examples based on the "train" set data
def generate_few_shot_examples(category, platform, data):
    """Build few-shot chat messages from the training rows of one category/platform.

    Args:
        category: Value matched against ``data['category']``; also the key
            used to look up attribute names in ``category_attributes``.
        platform: Value matched against ``data['platform']``.
        data: DataFrame with ``category``, ``platform``, ``set``, ``html``
            (path to an HTML file) and ``reference_output`` columns.

    Returns:
        A flat list of message dicts alternating ``user`` / ``assistant``, one
        pair per matching training row whose HTML file could be read. The list
        is empty when nothing matches.
    """
    # The attribute list is identical for every example, so format it once
    attribute_list = ', '.join(category_attributes.get(category, []))

    # Filter data based on category and platform
    is_match = (
        (data['category'] == category)
        & (data['platform'] == platform)
        & (data['set'] == 'train')
    )

    examples = []
    for _, row in data[is_match].iterrows():
        html_file_path = row['html']  # Path to the HTML file

        try:
            with open(html_file_path, "r", encoding="utf-8") as file:
                html_content = file.read()
        except FileNotFoundError:
            # An example with empty HTML but a complete answer would show the
            # model attribute values with no supporting page, so skip the row.
            print(f"File not found: {html_file_path}")
            continue

        reference_output = row['reference_output']  # Correct answer (reference output)

        # NOTE(review): the reference output is embedded in the user turn as
        # well as returned as the assistant turn, while the real test prompt
        # has no such section - confirm this is intended.
        # NOTE(review): no system message is sent with these examples; the
        # unused system-prompt dict that used to be built here was dropped.
        user_content = f"Extract product attributes from the HTML. Valid attributes are {attribute_list}. If an attribute is not present in the HTML, the value should be 'n/a'.\n\nHTML content:\n{html_content}\n\nReference Output:\n{reference_output}"

        examples.append({"role": "user", "content": user_content})
        examples.append({"role": "assistant", "content": reference_output})

    return examples

# Helper function to process each category/platform with few-shot examples
def process_category_platform(category, platform, html_content):
    """Query the Qwen model for one test page, preceded by few-shot examples.

    The examples are produced by ``generate_few_shot_examples`` from
    ``html_data`` for the given category and platform; the final user message
    carries ``html_content``.

    Returns the text of the first choice, or a fixed message string when there
    are no examples, when the response exposes an ``error`` attribute, or when
    no content came back.
    """
    few_shot_messages = generate_few_shot_examples(category, platform, html_data)
    if not few_shot_messages:
        return "No valid few-shot examples for this category/platform."

    attribute_list = ', '.join(category_attributes[category])
    query_message = {
        "role": "user",
        "content": f"Extract the product attribute values from the HTML in a JSON format. Valid attributes are {attribute_list}:\n{html_content}\n\nResponse:",
    }

    response = client.chat.completions.create(
        model="qwen/qwen2.5-vl-72b-instruct:free",  # Specify the model you want to use
        messages=[*few_shot_messages, query_message],
    )

    # Only the first choice is ever used
    choices = response.choices
    if choices and choices[0].message and choices[0].message.content:
        return choices[0].message.content
    if hasattr(response, "error"):
        return f"API Error: {response.error}"
    return "No content returned from the API."

# Main loop for processing the test set HTML inputs
for i, row in test_data.iterrows():
    html_file_path = row["html"]  # Get the path to the HTML file

    # Keep the try body to the file read so that only a missing test page is
    # reported as "HTML file not found".
    try:
        with open(html_file_path, "r", encoding="utf-8") as file:
            html_content = file.read()
    except FileNotFoundError:
        test_data.at[i, 'qwen_fewshot'] = f"HTML file not found: {html_file_path}"
        print(f"HTML file not found: {html_file_path}")
        continue

    # Process based on category and platform
    if row["category"] in category_attributes and row["platform"] in platforms:
        result = process_category_platform(row["category"], row["platform"], html_content)
        test_data.at[i, 'qwen_fewshot'] = result  # Write result to 'qwen_fewshot' column
        print(result)
    else:
        test_data.at[i, 'qwen_fewshot'] = "No valid category."

# Write the results back to the original dataframe.
# DataFrame.update() only fills columns that already exist in html_data, so a
# result column created by this loop would be silently dropped; assign by
# index label instead, which also creates the column when it is missing.
# The guard covers the case where there were no test rows at all.
if 'qwen_fewshot' in test_data.columns:
    html_data.loc[test_data.index, 'qwen_fewshot'] = test_data['qwen_fewshot']


In [None]:
# Dump the dataframe as currently held in memory to html_data.json.
# NOTE(review): the few-shot cells load their input from html_data.xlsx, not
# from this JSON file, and each of them reloads html_data - confirm that the
# results of earlier cells are saved before a later cell is run.
html_data.to_json("html_data.json")

## Few-shot with all training data

In [None]:
import pandas as pd
from openai import OpenAI
import json

# Initialize OpenAI client (base_url and api_key are blank here and must be filled in)
client = OpenAI(base_url="", api_key="")

# Load the HTML data from the Excel file
html_data = pd.read_excel("html_data.xlsx")

# Filter the test data from the dataset.
# Take an explicit copy: the main loop writes a result column into test_data,
# and writing into a filtered selection of html_data raises pandas'
# SettingWithCopyWarning and leaves it unclear which object is modified.
test_data = html_data[html_data['set'] == 'test'].copy()

# Define category-specific attribute sets.
# In this cell each value is one comma-separated string, not a list of names.
category_attributes = {
    "fashion": "Object, Brand, Color, Size, Material, Department, Style, Price",
    "electronics": "Object, Brand, Color, Size, Material, Model, Power Mode, Price",
    "beauty": "Object, Brand, Volume, Material, Skin/Hair Type, Benefits, Price"
}

# Platforms whose training rows are used as few-shot examples
platforms = ["amazon", "ebay", "temu"]

# Helper function to generate few-shot examples based on the "train" set data
def generate_few_shot_examples(data):
    """Build few-shot chat messages for every category from all training rows.

    Args:
        data: DataFrame with ``category``, ``platform``, ``set``, ``html``
            (path to an HTML file) and ``reference_output`` columns.

    Returns:
        A dict with one entry per key of ``category_attributes``. Each value
        is a flat list of message dicts alternating ``user`` / ``assistant``,
        one pair per training row of that category on one of ``platforms``
        whose HTML file could be read.
    """
    examples_by_category = {category: [] for category in category_attributes}

    for category, attributes in category_attributes.items():
        # The attribute sets in this cell are already comma-separated strings;
        # str.join on a string would put ", " between every character, so only
        # join when a list of names was supplied.
        attribute_list = attributes if isinstance(attributes, str) else ', '.join(attributes)

        for platform in platforms:
            # Filter data based on category and platform
            is_match = (
                (data['category'] == category)
                & (data['platform'] == platform)
                & (data['set'] == 'train')
            )

            for _, row in data[is_match].iterrows():
                html_file_path = row['html']  # Path to the HTML file

                try:
                    with open(html_file_path, "r", encoding="utf-8") as file:
                        html_content = file.read()
                except FileNotFoundError:
                    # An example with empty HTML but a complete answer would
                    # show the model attribute values with no supporting
                    # page, so skip the row.
                    print(f"File not found: {html_file_path}")
                    continue

                reference_output = row['reference_output']  # Correct answer (reference output)

                # NOTE(review): the reference output is embedded in the user
                # turn as well as returned as the assistant turn, while the
                # real test prompt has no such section - confirm intended.
                # NOTE(review): no system message is sent with these examples;
                # the unused system-prompt dict built here was dropped.
                user_content = f"Extract product attributes from the HTML. Valid attributes are {attribute_list}. If an attribute is not present in the HTML, the value should be 'n/a'.\n\nHTML content:\n{html_content}\n\nReference Output:\n{reference_output}"

                examples_by_category[category].append({"role": "user", "content": user_content})
                examples_by_category[category].append({"role": "assistant", "content": reference_output})

    return examples_by_category

# Generate few-shot examples for each category once, up front; the result is a
# dict keyed by category that process_with_category_examples reads for every
# test row.
few_shot_examples_by_category = generate_few_shot_examples(html_data)

# Helper function to process each test row with the appropriate few-shot examples for its category
def process_with_category_examples(category, html_content):
    """Query the model for one test page using its category's few-shot examples.

    Args:
        category: Key into ``few_shot_examples_by_category`` and
            ``category_attributes``.
        html_content: Raw HTML of the page to extract attributes from.

    Returns:
        The text of the first choice, or a fixed message string when the
        category has no few-shot examples, when the response exposes an
        ``error`` attribute, or when no content came back.
    """
    # Every known category has an entry, possibly an empty list, so test for
    # "no examples" rather than "no key"; this matches the per-platform helper.
    examples = few_shot_examples_by_category.get(category)
    if not examples:
        return "No valid few-shot examples for this category."

    # The attribute sets in this cell are comma-separated strings; str.join on
    # a string would put ", " between every character, so only join lists.
    attributes = category_attributes[category]
    attribute_list = attributes if isinstance(attributes, str) else ', '.join(attributes)

    response = client.chat.completions.create(
        model="mistralai/mistral-small-3.1-24b-instruct:free",  # Specify the model you want to use
        messages=examples + [
            {"role": "user", "content": f"Extract the product attribute values from the HTML in a JSON format. Valid attributes are {attribute_list}:\n{html_content}\n\nResponse:"}
        ]
    )

    if response.choices and response.choices[0].message and response.choices[0].message.content:
        return response.choices[0].message.content
    elif hasattr(response, "error"):
        return f"API Error: {response.error}"
    else:
        return "No content returned from the API."

# Main loop for processing the test set HTML inputs
# NOTE(review): process_with_category_examples calls a Mistral model, yet the
# results are stored under 'llama_fewshot' - confirm which of the two is intended.
for i, row in test_data.iterrows():
    html_file_path = row["html"]  # Get the path to the HTML file

    # Keep the try body to the file read so that only a missing test page is
    # reported as "HTML file not found".
    try:
        with open(html_file_path, "r", encoding="utf-8") as file:
            html_content = file.read()
    except FileNotFoundError:
        test_data.at[i, 'llama_fewshot'] = f"HTML file not found: {html_file_path}"
        print(f"HTML file not found: {html_file_path}")
        continue

    # Process using the few-shot examples for the correct category
    category = row["category"]
    result = process_with_category_examples(category, html_content)
    test_data.at[i, 'llama_fewshot'] = result  # Write result to 'llama_fewshot' column
    print(result)

# Write the results back to the original dataframe.
# DataFrame.update() only fills columns that already exist in html_data, so a
# result column created by this loop would be silently dropped; assign by
# index label instead, which also creates the column when it is missing.
# The guard covers the case where there were no test rows at all.
if 'llama_fewshot' in test_data.columns:
    html_data.loc[test_data.index, 'llama_fewshot'] = test_data['llama_fewshot']