In [None]:
# !pip install google-generativeai

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import json
import google.generativeai as genai
from urllib.parse import urljoin

Gemini provides a free API tier for light use; see https://ai.google.dev/pricing for details.

In [None]:
# Set up Gemini API key
# PLEASE GET YOUR OWN GEMINI API KEY via https://ai.google.dev/
# Export it as the GEMINI_API_KEY environment variable before starting the
# kernel so the secret is never saved inside the notebook. The placeholder
# below is only a fallback and will be rejected by the API.
import os

GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "YOUR OWN API KEY")

# Simple Task: extract info from static website

Step 1: 
- scrape content using `requests` and `bs4`
- parse the HTML
- apply a simple filter

In [None]:
def scrape_content(url, timeout=10):
    """
    Scrape the main textual content from a static webpage.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Passed straight to ``requests.get`` so a stalled server cannot
        hang the notebook.

    Returns
    -------
    str or None
        The visible text of the first ``<main>``, ``<article>`` or ``<body>``
        element (in that order of preference), with whitespace-stripped
        fragments joined by single spaces. ``None`` if no such element exists
        or if any error occurs (the error is printed, not raised).
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove elements that carry no article text (code, styling, page chrome)
        for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
            tag.decompose()

        # Extract main content (customize based on website structure)
        content = soup.find('main') or soup.find('article') or soup.find('body')
        if content is None:
            return None

        return ' '.join(content.stripped_strings)

    except Exception as e:
        # Best-effort: report the failure and let the caller skip this URL
        print(f"Error scraping {url}: {e}")
        return None

Step 2: use LLM to get a summary

In [None]:
def summarize_with_gemini(text, api_key, model_name='gemini-pro', max_chars=30000):
    """
    Generate a 2-3 sentence summary of ``text`` using the Gemini API.

    Parameters
    ----------
    text : str
        Text to summarise. Empty or ``None`` input short-circuits to ``None``
        without contacting the API.
    api_key : str
        Gemini API key passed to ``genai.configure``.
    model_name : str, optional
        Model identifier handed to ``genai.GenerativeModel``.
        NOTE(review): confirm that the default ``'gemini-pro'`` is still
        served for your account; pass a current model id otherwise.
    max_chars : int, optional
        Only the first ``max_chars`` characters of ``text`` are sent
        (default 30000). This is a character cut, not a token count.

    Returns
    -------
    str or None
        The model's response text, or ``None`` when there is nothing to
        summarise or the API call fails (the error is printed, not raised).
    """
    if not text:
        # Nothing to summarise; avoid spending an API call on an empty prompt
        return None

    try:
        # Configure the Gemini API
        genai.configure(api_key=api_key)

        # Initialize the requested model
        model = genai.GenerativeModel(model_name)

        # Prepare the prompt (input is truncated to keep the request bounded)
        prompt = f"""Please provide a concise summary of the following text in 2-3 sentences:
        
        {text[:max_chars]}"""

        # Generate response
        response = model.generate_content(prompt)

        return response.text
    except Exception as e:
        print(f"Error with Gemini API: {e}")
        return None

Step 3: 
- Provide the website URLs and apply the functions above to each one
- Collect the summaries into a table

In [None]:
def process_urls(urls, api_key, output_path='web_summaries.csv', delay=1):
    """
    Scrape and summarise several URLs, returning the results as a table.

    Parameters
    ----------
    urls : iterable of str
        Pages to process, in order.
    api_key : str
        Gemini API key forwarded to ``summarize_with_gemini``.
    output_path : str, optional
        Where the resulting table is written as CSV
        (default ``'web_summaries.csv'``).
    delay : float, optional
        Seconds to pause after each summarised page to respect API rate
        limits (default 1).

    Returns
    -------
    pandas.DataFrame
        One row per URL whose content could be scraped, with the columns
        ``URL``, ``Summary``, ``Content Length`` and ``Timestamp``. URLs that
        yield no content are skipped. The columns are present even when no
        URL succeeds, so downstream cells can rely on the schema.
    """
    columns = ['URL', 'Summary', 'Content Length', 'Timestamp']
    results = []

    for url in urls:
        print(f"Processing: {url}")
        content = scrape_content(url)

        if not content:
            # scrape_content already reported the problem; skip this URL
            continue

        summary = summarize_with_gemini(content, api_key)

        results.append({
            'URL': url,
            'Summary': summary,
            'Content Length': len(content),
            'Timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
        })

        # Respect API rate limits
        if delay:
            time.sleep(delay)

    # Create DataFrame (fixed column order, even if empty) and save to CSV
    df = pd.DataFrame(results, columns=columns)
    df.to_csv(output_path, index=False)
    return df

Let's look at an example

In [None]:
# Pages to scrape and summarise, one URL per line
urls_to_process = """
http://www.drps.ed.ac.uk/24-25/dpt/cxcmse11615.htm
http://www.drps.ed.ac.uk/24-25/dpt/cxcmse11427.htm
""".split()

In [None]:
# Run the scrape-and-summarise pipeline over every URL, then print the full table
summary_table = process_urls(urls=urls_to_process, api_key=GEMINI_API_KEY)
print("\nSummary Table:", summary_table.to_string(), sep="\n")

# Scraping dynamic page

In [None]:
import time
import random
import requests
import pandas as pd
import google.generativeai as genai
from bs4 import BeautifulSoup
from urllib.parse import urljoin

* Step 1: Get product listings
* Step 2: Summarise description in a short sentence
* Step 3: Process the steps above on the website

In [15]:
"""
Let's define a class to do this task

!!! Please locate the website and check if other selectors or PATH can be used to find the corresponding info
"""

class GitzScraper:
    """
    Scrape product listings from gitz.bz and summarise each product's
    description in one sentence with Gemini.

    Typical use: ``GitzScraper(api_key).process_search_results(search_url)``.
    """

    def __init__(self, api_key, timeout=10, model_name='gemini-pro'):
        """
        Parameters
        ----------
        api_key : str
            Gemini API key used for summarisation.
        timeout : float, optional
            Seconds to wait for each HTTP response (default 10).
        model_name : str, optional
            Model identifier handed to ``genai.GenerativeModel``.
            NOTE(review): confirm that ``'gemini-pro'`` is still served for
            your account; pass a current model id otherwise.
        """
        self.base_url = "https://gitz.bz"
        self.api_key = api_key
        self.timeout = timeout
        self.model_name = model_name

        # Configure GenAI once; summarize_with_gemini reuses self.model
        genai.configure(api_key=self.api_key)
        self.model = genai.GenerativeModel(self.model_name)

        # Optional headers to mimic a normal browser visit
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            )
        }

    def random_delay(self):
        """Introduce a random delay to avoid hitting the server too quickly."""
        time.sleep(random.uniform(2, 5))

    def get_soup(self, url):
        """
        Fetch HTML content from a URL using Requests and
        return a BeautifulSoup object.

        Raises whatever ``requests`` raises for connection problems,
        timeouts, or non-2xx statuses; callers decide how to recover.
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        response.raise_for_status()  # Raise an error for bad statuses
        return BeautifulSoup(response.text, 'html.parser')

    def _parse_product_card(self, card):
        """Extract ``(name, price, absolute_url)`` from one product card element."""
        # Product name & link
        # e.g. <h2 class="productitem--title"><a href="...">PRODUCT NAME</a></h2>
        product_name = "N/A"
        product_url = None
        name_container = card.find('h2', class_='productitem--title')
        link_tag = name_container.find('a') if name_container else None
        if link_tag:
            product_name = link_tag.get_text(strip=True)
            href = link_tag.get("href")
            # urljoin(base, None) would return the base URL itself, which is
            # not a product page, so only build a URL when an href exists.
            if href:
                product_url = urljoin(self.base_url, href)

        # Price:
        # e.g. <div class="price--main"><span class="money">$653.60 BZD</span></div>
        price = "N/A"
        price_main = card.find('div', class_='price--main')
        price_span = price_main.find('span', class_='money') if price_main else None
        if price_span:
            price = price_span.get_text(strip=True)

        return product_name, price, product_url

    def get_product_listings(self, url):
        """
        Fetch product listings from the given page on gitz.bz.
        Please inspect the website and use suitable way to locate the pages!

        For every product card the detail page is fetched (after a random
        politeness delay) and its description is summarised with Gemini.

        Returns
        -------
        list of dict
            One dict per card with the keys ``Name``, ``Price``,
            ``Product URL`` and ``Description``. ``Description`` holds the
            one-sentence summary, or ``"N/A"`` when the product has no link,
            no description, or the summary could not be generated. An empty
            list is returned if the listing page itself cannot be fetched.
        """
        products = []
        try:
            print(f"Accessing: {url}")
            soup = self.get_soup(url)

            # Step 1: Get the product cards
            # Locate all the product cards by <li class="productgrid--item">
            product_cards = soup.find_all('li', class_='productgrid--item')
            print(f"Found {len(product_cards)} product cards.")
        except Exception as e:
            print(f"Error fetching product listings: {e}")
            return products

        for card in product_cards:
            try:
                product_name, price, product_url = self._parse_product_card(card)

                # Step 2: fetch the product detail page for the full
                #         description and summarise it with the LLM.
                summary = "N/A"
                if product_url:
                    # Throttle here, right before the extra HTTP request
                    self.random_delay()
                    description = self.get_product_description(product_url)

                    # Don't spend an API call summarising a missing description
                    if description and description != "N/A":
                        summary = (
                            self.summarize_with_gemini(description, self.api_key)
                            or "N/A"
                        )

                products.append({
                    'Name': product_name,
                    'Price': price,
                    'Product URL': product_url,
                    'Description': summary
                })
                print(f"Found product: {product_name}")

            except Exception as e:
                # One malformed card must not abort the whole listing
                print(f"Error processing a product card: {e}")
                continue

        return products

    def get_product_description(self, product_url):
        """
        Given a product URL, fetch the detail page and extract the product
        description from the first ``<div>`` carrying both the
        ``product-description`` and ``rte`` classes.

        Returns the description text, or ``"N/A"`` if the element is missing
        or the page cannot be fetched (the error is printed, not raised).
        """
        try:
            soup = self.get_soup(product_url)

            # A CSS selector matches regardless of class order or additional
            # classes, unlike an exact class-attribute string comparison.
            desc_div = soup.select_one('div.product-description.rte')
            if desc_div:
                # If the description text is nested further, you may need find() or get_text()
                return desc_div.get_text(strip=True)
            return "N/A"

        except Exception as e:
            print(f"Error fetching product description: {e}")
            return "N/A"

    def process_search_results(self, search_url, output_path="gitz_bz_product_analysis.csv"):
        """
        Main method: fetch product data from a search (or listing) page,
        for each product, summarise them,
        and save the results to a CSV file.

        Parameters
        ----------
        search_url : str
            Listing or search page to scrape.
        output_path : str, optional
            Destination of the CSV file
            (default ``"gitz_bz_product_analysis.csv"``).

        Returns
        -------
        pandas.DataFrame
            Columns ``Product Name``, ``Price`` and ``Description``; an empty
            frame if nothing was found or an error occurred.
        """
        try:
            products = self.get_product_listings(search_url)
            if not products:
                print("No products found.")
                return pd.DataFrame()

            # All network work (and its throttling) already happened in
            # get_product_listings; this loop only reshapes the records.
            results = []
            for product in products:
                print(f"Processing product: {product['Name']}")
                results.append({
                    "Product Name": product["Name"],
                    "Price": product["Price"],
                    'Description': product["Description"]
                })

            df = pd.DataFrame(results)
            df.to_csv(output_path, index=False)
            print(f"Results saved to {output_path}")
            return df

        except Exception as e:
            print(f"Error processing search results: {e}")
            return pd.DataFrame()

    def summarize_with_gemini(self, text, api_key=None):
        """
        Generate a one-sentence summary using the Gemini API.

        Parameters
        ----------
        text : str
            Text to summarise; only the first 30000 characters are sent.
            Empty or ``None`` input returns ``None`` without an API call.
        api_key : str, optional
            Kept for backward compatibility. When omitted or equal to the key
            given at construction, the already-configured ``self.model`` is
            reused. A different key reconfigures the SDK and replaces
            ``self.api_key`` and ``self.model``.

        Returns
        -------
        str or None
            The model's response text, or ``None`` on failure (the error is
            printed, not raised).
        """
        if not text:
            return None

        try:
            if api_key is not None and api_key != self.api_key:
                # Caller supplied a new key: switch the instance over to it
                self.api_key = api_key
                genai.configure(api_key=api_key)
                self.model = genai.GenerativeModel(self.model_name)

            # Prepare the prompt (input is truncated to keep the request bounded)
            prompt = f"""Please provide a concise summary of the following text in one sentence:
            
            {text[:30000]}"""

            # Generate response
            response = self.model.generate_content(prompt)

            return response.text
        except Exception as e:
            print(f"Error with Gemini API: {e}")
            return None

In [None]:
# Build the scraper, run it against one search page, and report the outcome
scraper = GitzScraper(GEMINI_API_KEY)
search_url = "https://gitz.bz/search?type=product&q=wood*+desk*"
results = scraper.process_search_results(search_url)

if results.empty:
    print("No results found.")
else:
    print("\nProduct Analysis Results:")
    print(results)

# Web Scraping with Third-Party Platform

For example, **BuildShip - Website Scraping Playground**: https://llm-web-crawler.vercel.app/

On that website, use the following settings:
- `Select Scrape Mode`: LLM Extraction
- `URL`: https://gitz.bz/search?type=product&q=desk*
- `Selector`: .productgrid--wrapper
- `Extraction Fields`: title, price, ratings, description
- `Extraction Mode`: HTML