In [4]:
import requests
from bs4 import BeautifulSoup
import os
import urllib.parse

# Directory the crawl writes downloaded pages into; also used by answer_question
# to rebuild the keys of filename_to_link.
# NOTE(review): "reources" looks like a typo for "resources" (the pickle cell's LIB
# path spells it "resources") — confirm before renaming, since the crawl output
# shown below was written under this exact path.
CONTENT_PATH = "demos/demo_v0/reources/sellercenter_crawl/htmls"
# Seed page whose links are scanned by download_links_with_keyword.
SRC_URL_FOR_SELLER_HOWTO = "https://www.ebay.com/sellercenter/selling"

def download_links_with_keyword(url, target_dir, keyword="sellercenter"):
    """Download every page linked from ``url`` whose href contains ``keyword``.

    Each matching href is resolved against the fetched page's URL, stripped of
    any ``#fragment``, fetched at most once, and saved inside ``target_dir`` as
    ``<last-path-segment>.html`` (``index.html`` when the path ends in a slash).
    When two different URLs share the same last path segment, the later ones are
    saved with a numeric suffix (``name-2.html``, ``name-3.html``, ...) instead
    of overwriting the earlier file and its mapping entry.

    Parameters:
        url (str): Page to scan for links.
        target_dir (str): Directory to write the pages into; created if missing.
        keyword (str): Substring an href must contain to be downloaded.

    Returns:
        dict: Maps each written file path to the URL it was downloaded from.
            Empty if the main page could not be retrieved.

    Request failures (main page or individual links) are printed, not raised.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    crawled_links = set()  # URLs that have been downloaded successfully
    filename_to_link = {}

    try:
        # Send a request to get the HTML content of the page
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Ensure we get a valid response
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve the main page: {e}")
        return filename_to_link

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links containing the keyword
    hrefs = [a['href'] for a in soup.find_all('a', href=True) if keyword in a['href']]

    # Create the target directory if it doesn't exist
    os.makedirs(target_dir, exist_ok=True)

    for href in hrefs:
        # Resolve relative, root-relative and protocol-relative hrefs against the
        # page that was actually fetched (after redirects), and drop the fragment
        # so "page" and "page#section" count as the same document.
        link, _fragment = urllib.parse.urldefrag(urllib.parse.urljoin(response.url, href))

        # Check if link has been crawled already
        if link in crawled_links:
            print(f"Skipped (already crawled): {link}")
            continue

        # Extract the last part of the URL path as the file name
        parsed_url = urllib.parse.urlparse(link)
        base_name = urllib.parse.quote(parsed_url.path.split('/')[-1]) or 'index'  # Handle trailing slashes

        # Construct the file path; a path already present in the mapping belongs
        # to a different URL (same URLs were skipped above), so pick a new name.
        file_path = os.path.join(target_dir, f"{base_name}.html")
        suffix = 2
        while file_path in filename_to_link:
            file_path = os.path.join(target_dir, f"{base_name}-{suffix}.html")
            suffix += 1

        # Download the content of each link
        try:
            file_response = requests.get(link, headers=headers, timeout=10)
            file_response.raise_for_status()
            filename_to_link[file_path] = link

            with open(file_path, 'wb') as out_file:
                out_file.write(file_response.content)

            print(f"Downloaded: {link} -> {file_path}")
            crawled_links.add(link)  # Mark this link as crawled

        except requests.exceptions.RequestException as e:
            print(f"Failed to download {link}: {e}")

    return filename_to_link

In [5]:
# Usage: scan the seller-center page for links containing the default keyword
# and save each linked page under CONTENT_PATH.
filename_to_link = download_links_with_keyword(SRC_URL_FOR_SELLER_HOWTO, CONTENT_PATH)

Downloaded: https://www.ebay.com/sellercenter -> demos/demo_v0/reources/sellercenter_crawl/htmls/sellercenter.html
Downloaded: https://www.ebay.com/sellercenter/selling -> demos/demo_v0/reources/sellercenter_crawl/htmls/selling.html
Downloaded: https://www.ebay.com/sellercenter/selling/start-selling-on-ebay -> demos/demo_v0/reources/sellercenter_crawl/htmls/start-selling-on-ebay.html
Downloaded: https://www.ebay.com/sellercenter/selling/what-to-sell -> demos/demo_v0/reources/sellercenter_crawl/htmls/what-to-sell.html
Downloaded: https://www.ebay.com/sellercenter/selling/how-to-sell -> demos/demo_v0/reources/sellercenter_crawl/htmls/how-to-sell.html
Downloaded: https://www.ebay.com/sellercenter/selling/seller-fees -> demos/demo_v0/reources/sellercenter_crawl/htmls/seller-fees.html
Downloaded: https://www.ebay.com/sellercenter/payments-and-fees/payments-and-earnings -> demos/demo_v0/reources/sellercenter_crawl/htmls/payments-and-earnings.html
Downloaded: https://www.ebay.com/sellercenter

In [7]:
import pickle
# Write the file-path -> URL mapping returned by the crawl to disk with pickle.
# NOTE(review): LIB is "../demos/demo_v0/resources/..." while CONTENT_PATH is
# "demos/demo_v0/reources/..." (different spelling and no leading "../") —
# confirm which location is intended.
LIB = "../demos/demo_v0/resources/sellercenter_crawl/"
with open(LIB + "filename_to_link", 'wb') as f:
    pickle.dump(filename_to_link, f)


# To complete the RAG pipeline, here is the code to index the web pages:

In [None]:
import os
import faiss
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

# Load the sentence-transformers embedding model (all-MiniLM-L6-v2).
# `embedder` is the module-level model used by generate_embeddings below.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

def extract_text_from_html(file_path):
    """Extracts the readable text of an HTML file as newline-separated blocks.

    Script, style, navigation, footer, header, aside, form and noscript
    elements are removed first. Text is then collected from the outermost
    h1/h2/h3/p/li elements; a content element nested inside another content
    element (for example a <p> inside an <li>) is not emitted separately,
    because its text is already part of the enclosing element's text.

    Parameters:
        file_path (str): Path of the HTML file to read.

    Returns:
        str: The non-empty text blocks joined with "\\n" ("" if there are none).
    """
    content_tags = ['h1', 'h2', 'h3', 'p', 'li']

    # Read bytes and let BeautifulSoup work out the encoding: the crawler saves
    # the raw response bytes, so forcing UTF-8 here could raise on other charsets.
    with open(file_path, 'rb') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # Remove unnecessary elements (e.g., scripts, styles, navigation links)
    for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'form', 'noscript']):
        element.decompose()

    # Extract text from main content tags
    content = []
    for tag in soup.find_all(content_tags):
        # Skip nested content tags so their text is not added twice.
        if tag.find_parent(content_tags) is not None:
            continue
        text = tag.get_text(separator=" ", strip=True)
        if text:  # Only add non-empty text
            content.append(text)

    # Join the extracted text with line breaks for readability
    return "\n".join(content)

def generate_embeddings(texts):
    """Encode a list of texts with the module-level ``embedder`` and return the tensor result."""
    encoded = embedder.encode(texts, convert_to_tensor=True)
    return encoded

def build_index(embeddings):
    """Create a flat L2 FAISS index sized to the embeddings' second dimension and add them to it."""
    # The index width comes from the embeddings' second axis.
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index

def prepare_documents_and_index(directory):
    """Reads every ``.html`` file in ``directory``, embeds its text and builds an index.

    Files are processed in sorted name order so the positions in ``documents``,
    ``filenames`` and the index are reproducible between runs. All documents
    are embedded in one batched ``generate_embeddings`` call.

    Parameters:
        directory (str): Directory whose ``.html`` files are indexed
            (not searched recursively).

    Returns:
        tuple: ``(documents, filenames, index)`` where ``documents[i]`` is the
        extracted text of ``filenames[i]`` and row ``i`` of the index is its
        embedding.

    Raises:
        ValueError: If the directory contains no ``.html`` files.
    """
    documents = []
    filenames = []  # Store filenames associated with each document

    # Process each HTML file in the directory
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.html'):
            continue

        # Extract text content from the HTML file
        text = extract_text_from_html(os.path.join(directory, filename))
        print(f"Extracted content from {filename}:\n{text}\n")  # Print extracted content

        documents.append(text)
        filenames.append(filename)

    if not documents:
        raise ValueError(f"No .html files found in {directory!r}; nothing to index.")

    # Embed all documents in one batch and convert to a numpy array for FAISS
    embeddings = generate_embeddings(documents).cpu().detach().numpy()

    # Build FAISS index
    # NOTE(review): a later cell defines another `build_index` that takes a list of
    # directories; this call expects the embeddings-based version — confirm cell order.
    index = build_index(embeddings)
    return documents, filenames, index

def search(query, documents, filenames, index, top_k=5):
    """Searches for the top-k most similar documents to the query.

    Parameters:
        query (str): Text to search for.
        documents (list): Indexed texts, in the same order as the index rows.
        filenames (list): Per-document metadata, parallel to ``documents``.
        index: Index object exposing ``search(query_embedding, top_k)``.
        top_k (int): Maximum number of results to return.

    Returns:
        list of dict: ``{"text": ..., "filename": ...}`` for each hit, best
        match first. Fewer than ``top_k`` entries are returned when the index
        holds fewer vectors.
    """
    # Generate embedding for the query as a (1, dim) numpy array
    query_embedding = generate_embeddings([query])[0].cpu().detach().numpy().reshape(1, -1)

    # Perform search in the FAISS index
    _, indices = index.search(query_embedding, top_k)

    results = []
    for raw_position in indices[0]:
        position = int(raw_position)
        # FAISS fills missing neighbours with -1; without this check a negative
        # position would silently return the *last* document as a bogus hit.
        if position < 0:
            continue
        results.append({"text": documents[position], "filename": filenames[position]})
    return results


In [None]:
# Usage: index every .html file found in `directory`.
# NOTE(review): this directory differs from CONTENT_PATH, where the crawl cell
# saves its pages — confirm it is the intended input.
directory = "downloads_encoded_names"
documents, filenames, index = prepare_documents_and_index(directory)

# Code to answer the questions

In [None]:
from pychomsky.chchat import AzureOpenAIChatWrapper
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain

# Model identifier passed to AzureOpenAIChatWrapper in answer_question.
MODEL_NAME = "openai-chat-completions-gpt-3.5-turbo-0125"
# Alternative model (disabled):
#MODEL_NAME = "ebay-internal-chat-completions-ellama-3-8b-instruct"

def answer_question(question, filename_to_link, documents, filenames, index):
    """Answers ``question`` from the best-matching indexed document.

    The top search hit is placed in a prompt together with the URL it was
    downloaded from, and the prompt is sent to a fresh ConversationChain.

    Parameters:
        question (str): The user's question.
        filename_to_link (dict): Maps saved file paths (CONTENT_PATH joined
            with the file name) to their source URLs.
        documents (list): Indexed texts, as passed to ``search``.
        filenames (list): Metadata parallel to ``documents``; each entry is
            either a file name or a ``(file name, chunk_index)`` tuple.
        index: Index object passed through to ``search``.

    Returns:
        The value returned by ``ConversationChain.predict`` for the prompt.

    Raises:
        IndexError: If the search returns no documents.
        KeyError: If the top hit's file is not present in ``filename_to_link``.
    """
    results = search(question, documents, filenames, index)
    if not results:
        raise IndexError(f"No documents were retrieved for question: {question!r}")
    top_hit = results[0]

    # A chunked index stores (filename, chunk_index) tuples; only the file name
    # identifies the source page.
    source_name = top_hit['filename']
    if isinstance(source_name, tuple):
        source_name = source_name[0]

    # The crawler builds its keys with os.path.join(target_dir, name), so build
    # the lookup key the same way instead of hard-coding "/".
    source_link = filename_to_link[os.path.join(CONTENT_PATH, source_name)]

    conversation = ConversationChain(
        llm=AzureOpenAIChatWrapper( model_name=MODEL_NAME),
        memory=ConversationBufferMemory(),
        verbose=False
    )
    # NOTE(review): the prompt text contains typos ("Parapgraph", "questoin");
    # left unchanged here because it is sent to the model verbatim.
    prompt = "\n".join(["Given the following retrieved paragraph from eBay's support page:\n",
            "Parapgraph",
            top_hit['text'],
            "From the webpage:",
            source_link,
            "Summarize all the relevant data to answer the following questoin:",
            question,
            "and provide a link to the original webpage"])
    return conversation.predict(input=prompt)

In [None]:
# Example query: run retrieval plus the chat model and print the answer.
question = "what happens if the buyer returns a good item?"
print(answer_question(question, filename_to_link, documents, filenames, index))

If the buyer returns a good item, the seller may still need to issue a refund to the buyer as per the return policy. The seller can issue the refund directly from the Return Details screen after inspecting the item. The specific details and process for handling a return can be found on eBay's support page for managing returns, which can be accessed through the following link: <https://www.ebay.com/sellercenter/protections/returns>.

In [1]:
import os
import faiss
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer


def extract_text_from_html(file_path):
    """Extracts the readable text of an HTML file as newline-separated blocks.

    Script, style, navigation, footer, header, aside, form and noscript
    elements are removed first. Text is then collected from the outermost
    h1/h2/h3/p/li elements; a content element nested inside another content
    element (for example a <p> inside an <li>) is not emitted separately,
    because its text is already part of the enclosing element's text.

    Parameters:
        file_path (str): Path of the HTML file to read.

    Returns:
        str: The non-empty text blocks joined with "\\n" ("" if there are none).
    """
    content_tags = ['h1', 'h2', 'h3', 'p', 'li']

    # Read bytes and let BeautifulSoup work out the encoding, so a page that is
    # not valid UTF-8 does not raise UnicodeDecodeError here.
    with open(file_path, 'rb') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # Remove unnecessary elements (e.g., scripts, styles, navigation links)
    for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'form', 'noscript']):
        element.decompose()

    # Extract text from main content tags
    content = []
    for tag in soup.find_all(content_tags):
        # Skip nested content tags so their text is not added twice.
        if tag.find_parent(content_tags) is not None:
            continue
        text = tag.get_text(separator=" ", strip=True)
        if text:  # Only add non-empty text
            content.append(text)

    # Join the extracted text with line breaks for readability
    return "\n".join(content)


_EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
_EMBEDDER_CACHE = {}  # model name -> loaded SentenceTransformer


def _get_embedder():
    """Returns the shared SentenceTransformer, loading it only on first use."""
    if _EMBEDDING_MODEL_NAME not in _EMBEDDER_CACHE:
        _EMBEDDER_CACHE[_EMBEDDING_MODEL_NAME] = SentenceTransformer(_EMBEDDING_MODEL_NAME)
    return _EMBEDDER_CACHE[_EMBEDDING_MODEL_NAME]


def generate_embeddings(texts):
    """Generates embeddings for a list of texts using the embedding model.

    The model is loaded once and reused, instead of being re-created on every
    call (callers invoke this once per chunk).

    Parameters:
        texts (list of str): Texts to embed.

    Returns:
        The result of ``SentenceTransformer.encode(texts, convert_to_tensor=True)``.
    """
    return _get_embedder().encode(texts, convert_to_tensor=True)


def build_faiss_index(embeddings):
    """Create a flat L2 FAISS index sized to the embeddings' second dimension and add them to it."""
    # The index width comes from the embeddings' second axis.
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index


_CHUNK_TOKENIZER_NAME = "sentence-transformers/all-mpnet-base-v2"
_CHUNK_TOKENIZER_CACHE = {}  # tokenizer name -> loaded tokenizer


def _get_chunk_tokenizer():
    """Returns the shared tokenizer, loading it only on first use."""
    if _CHUNK_TOKENIZER_NAME not in _CHUNK_TOKENIZER_CACHE:
        _CHUNK_TOKENIZER_CACHE[_CHUNK_TOKENIZER_NAME] = AutoTokenizer.from_pretrained(_CHUNK_TOKENIZER_NAME)
    return _CHUNK_TOKENIZER_CACHE[_CHUNK_TOKENIZER_NAME]


def split_into_chunks(text, chunk_size=256, overlap=50):
    """
    Splits text into chunks of specified token length with overlap.

    Consecutive chunks start ``chunk_size - overlap`` tokens apart. Chunking
    stops as soon as a chunk reaches the end of the text, so no trailing chunk
    is produced that lies entirely inside the previous one.

    Parameters:
        text (str): The text to split.
        chunk_size (int): Number of tokens per chunk (must be positive).
        overlap (int): Number of tokens to overlap between chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        list of str: List of text chunks (empty for text with no tokens). Each
        chunk is the tokenizer's decoding of its tokens, so it may not match
        the original text character for character.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    tokenizer = _get_chunk_tokenizer()
    tokens = tokenizer.encode(text, add_special_tokens=False)
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(tokens), step):
        chunk_tokens = tokens[start:start + chunk_size]
        chunks.append(tokenizer.decode(chunk_tokens, skip_special_tokens=True))
        # This window already covers the tail; a further window would only
        # repeat tokens contained in this chunk.
        if start + chunk_size >= len(tokens):
            break

    return chunks



def build_index(directory_list):
    """Reads HTML files, splits their text into chunks, embeds the chunks and builds an index.

    Files in each directory are processed in sorted name order so the result is
    reproducible between runs. All chunks are embedded in one batched
    ``generate_embeddings`` call rather than one call per chunk.

    Parameters:
        directory_list (list of str, or str): Directories whose ``.html`` files
            are indexed (not searched recursively). A single path string is
            treated as a one-element list.

    Returns:
        tuple: ``(documents, filenames, index)`` where ``documents[i]`` is a
        text chunk, ``filenames[i]`` is its ``(filename, chunk_index)`` pair and
        row ``i`` of the index is its embedding.

    Raises:
        ValueError: If no chunks were produced from the given directories.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(directory_list, (str, os.PathLike)):
        directory_list = [directory_list]

    documents = []
    filenames = []  # Store (filename, chunk index) associated with each chunk

    # Process all directories.
    for directory in directory_list:
        # Process each HTML file in the directory
        for filename in sorted(os.listdir(directory)):
            if not filename.endswith('.html'):
                continue

            # Extract text content from the HTML file
            text = extract_text_from_html(os.path.join(directory, filename))
            print(f"Extracted content from {filename}:\n{text}\n")  # Print extracted content

            # Split the text into chunks and record each with its metadata
            for chunk_index, chunk in enumerate(split_into_chunks(text)):
                print("handling chunk " , str(chunk_index))
                documents.append(chunk)
                filenames.append((filename, chunk_index))  # Track filename and chunk index

    if not documents:
        raise ValueError(f"No .html content found in {list(directory_list)!r}; nothing to index.")

    # Embed every chunk in one batch and convert to a numpy array for FAISS
    embeddings = generate_embeddings(documents).cpu().detach().numpy()

    # Build FAISS index
    index = build_faiss_index(embeddings)
    return documents, filenames, index

In [None]:
import os
# Turn off tokenizers parallelism for this process (presumably to avoid the
# Hugging Face tokenizers fork warning — confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# NOTE(review): CONTENT_PATH points at the "htmls" subdirectory of this path, and
# build_index only lists the given directory itself (no recursion) — confirm this
# is the directory that actually holds the .html files.
documents, filenames, index = build_index(["demos/demo_v0/reources/sellercenter_crawl"])

Extracted content from listings.html:
Create great listings
Listings are your window to millions of potential buyers on eBay.
Listing best practices
Get your listings looking great with these basic tips. Learn how to write effective titles and descriptions, select the best categories, and much more.
Make your listings stand out
Help your potential buyers find your listings when using search and filters, and learn how you can future-proof your listings with item specifics.
Take great photos
Successful listings start with great photos. Check out our tips and tricks for getting great product images for your eBay listings.
Offer competitive prices
eBay’s Research Tools help you price your items competitively and maximize sales.
Manage your listings
Find advanced listing tools and features to help you create, update, and manage listings as your business grows.

handling chunk  create great listings listings are your window to millions of potential buyers on ebay. listing best practices get 