# **COMPETITIVE PRICING ANALYSIS USING WEB SCRAPING**
##### **by Lucila Aldana Quiñonez | Marketing Data Analyst**

In [97]:
# Import libraries

import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, date
import time

# **Competitor A: GPSFarma**

In [108]:
# Configuration

# GPSFarma category listing pages to crawl (one URL per category)
category_urls_a = [
    "https://gpsfarma.com/categorias/" + slug + ".html"
    for slug in (
        "cuidado-de-la-piel/corporales",
        "cuidado-de-la-piel/faciales",
        "cuidado-de-la-piel/solares",
        "cuidado-oral",
        "cuidado-del-cabello",
    )
]

# Label stored in the "competitor" field of every GPSFarma row
competitor_a = "GPSFarma"

# Headers passed to requests.get
HEADERS = {"User-Agent": "Mozilla/5.0"}

In [109]:
# Price data cleaning

def clean_price(price_text):
    """Turn a displayed price such as "$1.234,50" into a float.

    The "$" sign, "." characters and non-breaking spaces are removed, "," is
    turned into "." and surrounding whitespace is stripped before conversion.

    Args:
        price_text: Price string as extracted from the page.

    Returns:
        float | None: The parsed number, or None when the cleaned text is not
        a valid float.
    """
    # One pass over the characters: equivalent to removing "$", "." and
    # "\xa0" and then replacing "," with ".".
    table = str.maketrans({"$": None, ".": None, "\xa0": None, ",": "."})
    normalized = price_text.translate(table).strip()

    try:
        value = float(normalized)
    except ValueError:
        value = None
    return value

In [110]:
# Product data extraction

def extract_product_data(product, category_url):
    """Build one result row from a GPSFarma product card.

    Args:
        product: Parsed product card element (looked up with ``find`` and
            ``get_text``).
        category_url: URL of the listing page the card came from; stored as
            the category, and its last path segment (without ".html") as the
            subcategory.

    Returns:
        dict | None: The product row, or None when the card has no name or
        no product link.
    """
    title_el = product.find("strong", class_="product-item-name")
    anchor_el = product.find("a", class_="product-item-link")

    product_name = title_el.get_text(" ", strip=True) if title_el else None

    # A card without a name or a link cannot be identified: skip it.
    if not (product_name and anchor_el):
        return None

    brand_el = product.find("div", class_="product-item-brand")
    price_el = product.find("span", class_="price")
    # Discount badge: an image whose alt text contains "Off".
    badge_el = product.find("img", alt=lambda x: x and "Off" in x)

    if "Out of stock" in product.get_text():
        stock_label = "Out of stock"
    else:
        stock_label = "In stock"

    last_segment = category_url.rsplit("/", 1)[-1]

    row = {
        "product_name": product_name,
        "brand": None,
        "price": None,
        "discount": None,
        "availability": stock_label,
        "category": category_url,
        "subcategory": last_segment.replace(".html", ""),
        "product_url": anchor_el.get("href"),
        "competitor": competitor_a,
        "scraping_date": datetime.today().date(),
    }

    if brand_el:
        row["brand"] = brand_el.get_text(strip=True)
    if price_el:
        row["price"] = clean_price(price_el.get_text())
    if badge_el:
        row["discount"] = badge_el.get("alt")

    return row

In [111]:
# Category-level scraping

def scrape_gpsfarma_category(category_url, max_pages=150, timeout=30):
    """Crawl a GPSFarma category page by page and collect its products.

    Pages are requested as ``<category_url>?p=<n>`` starting at 1. Crawling
    stops when a page has no product cards, when a page adds no product URL
    that was not already seen, or after ``max_pages`` pages.

    Args:
        category_url: Listing page URL of the category.
        max_pages: Upper bound on the number of pages requested.
        timeout: Seconds to wait for the server on each request. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        list[dict]: Product rows, de-duplicated by product URL.

    Raises:
        requests.HTTPError: If a page responds with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    page = 1
    products = []
    seen_urls = set()

    while page <= max_pages:
        url = f"{category_url}?p={page}"
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        product_cards = soup.find_all("li", class_="product-item")

        if not product_cards:
            break

        print(f"Scraping page {page} - products: {len(product_cards)}")

        new_products = 0

        for product in product_cards:
            product_data = extract_product_data(product, category_url)
            if not product_data:
                continue

            # Skip products already collected from an earlier page.
            if product_data["product_url"] in seen_urls:
                continue

            seen_urls.add(product_data["product_url"])
            products.append(product_data)
            new_products += 1

        # A page that only repeats known products means the listing is
        # exhausted (see the notebook output: the crawl ends this way).
        if new_products == 0:
            print("No new products, scraping has stopped.")
            break

        page += 1

    return products

In [112]:
# Complete GPSFarma scraping

# Accumulates the product rows of every GPSFarma category
all_products = []

for url in category_urls_a:
    print(f"\nCategory: {url}")
    # Crawl the category page by page (at most 150 pages)
    products = scrape_gpsfarma_category(url, max_pages=150)
    all_products.extend(products)

print(f"\nTotal products extracted: {len(all_products)}")


Category: https://gpsfarma.com/categorias/cuidado-de-la-piel/corporales.html
Scraping page 1 - products: 13
Scraping page 2 - products: 13
Scraping page 3 - products: 13
Scraping page 4 - products: 13
Scraping page 5 - products: 13
Scraping page 6 - products: 13
Scraping page 7 - products: 13
Scraping page 8 - products: 13
Scraping page 9 - products: 13
Scraping page 10 - products: 13
Scraping page 11 - products: 13
Scraping page 12 - products: 13
Scraping page 13 - products: 13
Scraping page 14 - products: 13
Scraping page 15 - products: 13
Scraping page 16 - products: 13
Scraping page 17 - products: 13
Scraping page 18 - products: 13
Scraping page 19 - products: 13
Scraping page 20 - products: 13
No new products, scraping has stopped.

Category: https://gpsfarma.com/categorias/cuidado-de-la-piel/faciales.html
Scraping page 1 - products: 13
Scraping page 2 - products: 13
Scraping page 3 - products: 13
Scraping page 4 - products: 13
Scraping page 5 - products: 13
Scraping page 6 - pro

In [113]:
# Export to CSV

# Date stamp (YYYY-MM-DD) used in the output file name
today = date.today().isoformat()

# One row per scraped product
df = pd.DataFrame(all_products)
df.to_csv("products_gpsfarma_" + today + ".csv", index=False, encoding="utf-8-sig")

print("CSV file saved successfully.")

CSV file saved successfully.


# **Competitor B: Farmaonline**

In [2]:
# Configuration

# Label stored in the "competitor" field of every Farmaonline row
competitor_b = "Farmaonline"

# Prefix prepended to the href of each product link
BASE_URL = "https://www.farmaonline.com"

# Headers passed to requests.get
HEADERS = {"User-Agent": "Mozilla/5.0"}

In [3]:
# Price data cleaning

def clean_price(price_text):
    """Turn a displayed price such as "$1.234,50" into a float.

    The "$" sign, "." characters and non-breaking spaces are removed, "," is
    turned into "." and surrounding whitespace is stripped before conversion.

    Args:
        price_text: Price string as extracted from the page; may be None or
            empty.

    Returns:
        float | None: The parsed number, or None when the input is empty or
        the cleaned text is not a valid float.
    """
    # Nothing to parse: report "no price" instead of failing on .replace().
    if not price_text:
        return None

    price_text = (
        price_text.replace("$", "")
        .replace(".", "")
        .replace(",", ".")
        .replace("\xa0", "")
        .strip()
    )
    try:
        return float(price_text)
    except ValueError:
        # Only a failed conversion means "no price"; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        return None

In [4]:
# Product data extraction

def scrape_farmaonline_first_page(url, category, subcategory=None, timeout=30):
    """Scrape the products present in the HTML of one Farmaonline listing page.

    Only the markup returned by a single GET request is parsed; no further
    pages are requested.

    Args:
        url: Listing page URL to download.
        category: Value stored in the "category" field of each row.
        subcategory: Value stored in the "subcategory" field of each row.
        timeout: Seconds to wait for the server. Without a timeout, requests
            waits indefinitely on a stalled connection.

    Returns:
        list[dict]: One row per <article> element that has both a product
        name span and a link.

    Raises:
        requests.HTTPError: If the page responds with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    product_cards = soup.find_all("article")

    products = []

    for product in product_cards:
        # A CSS selector matches spans carrying both classes in any order and
        # alongside any extra classes. Passing the two names as one class_
        # string only matches when the attribute is exactly that string.
        name_tag = product.select_one(
            "span.vtex-product-summary-2-x-productBrand"
            ".vtex-product-summary-2-x-brandName"
        )
        link_tag = product.find("a", href=True)

        # Cards without a name or a link cannot be identified: skip them.
        if not name_tag or not link_tag:
            continue

        brand_tag = product.find("span", class_="vtex-product-summary-2-x-productBrandName")
        price_tag = product.find("span", class_="vtex-product-price-1-x-sellingPriceValue")
        discount_tag = product.find("span", class_="vtex-product-price-1-x-savingsPercentage")

        # The presence of this element marks the card as out of stock.
        availability = (
            "Out of stock"
            if product.find("div", class_="farmaonline-product-summmary-elements-0-x-nope_stock")
            else "In stock"
        )

        products.append({
            "product_name": name_tag.get_text(strip=True),
            "brand": brand_tag.get_text(strip=True) if brand_tag else None,
            "price": clean_price(price_tag.get_text()) if price_tag else None,
            "discount": discount_tag.get_text(strip=True) if discount_tag else None,
            "availability": availability,
            "category": category,
            "subcategory": subcategory,
            "product_url": BASE_URL + link_tag["href"],
            "competitor": competitor_b,
            "scraping_date": datetime.today().date()
        })

    return products

In [114]:
# Category-level scraping

# "corporales" listing page; the query string requests OrderByTopSaleDESC ordering
url = "https://www.farmaonline.com/cuidado-personal/cuidado-de-la-piel/corporales?order=OrderByTopSaleDESC"

# Parse only the HTML returned by a single GET of that page
data = scrape_farmaonline_first_page(
    url=url,
    category="cuidado-de-la-piel",
    subcategory="corporales"
)

print(f"Products extracted: {len(data)}")

Products extracted: 0


#### Farmaonline uses JavaScript-based dynamic rendering (VTEX), which prevents product data extraction through traditional HTML scraping with Requests and BeautifulSoup.

# **Competitor C: Farmacity**

In [115]:
# Configuration

# Headers passed to requests.get
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

base_url = "https://www.farmacity.com"
# extract_product_data_farmacity builds product URLs from BASE_URL. Without
# this assignment it would reuse the value left by the Farmaonline section
# and prefix Farmacity links with the wrong domain.
BASE_URL = base_url
# Label stored in the "competitor" field of every Farmacity row
COMPETITOR = "Farmacity"

# Listing pages to scrape, with the category/subcategory labels stored per row
categories = [
    {
        "category": "dermocosmetica",
        "subcategory": "corporal",
        "url": "https://www.farmacity.com/dermocosmetica/corporal"
    },
    {
        "category": "dermocosmetica",
        "subcategory": "rostro",
        "url": "https://www.farmacity.com/dermocosmetica/rostro"
    },
    {
        "category": "dermocosmetica",
        "subcategory": "solar",
        "url": "https://www.farmacity.com/dermocosmetica/solar"
    },
    {
        "category": "cuidado-personal",
        "subcategory": "cuidado-oral",
        "url": "https://www.farmacity.com/cuidado-personal/cuidado-oral"
    },
    {
        "category": "dermocosmetica",
        "subcategory": "pelo",
        "url": "https://www.farmacity.com/dermocosmetica/pelo"
    }
]

In [116]:
# Price data cleaning

def clean_price(price_text):
    """Turn a displayed price such as "$1.234,50" into a float.

    The "$" sign, "." characters and non-breaking spaces are removed, "," is
    turned into "." and surrounding whitespace is stripped before conversion.

    Args:
        price_text: Price string as extracted from the page; may be None or
            empty.

    Returns:
        float | None: The parsed number, or None when the input is empty or
        the cleaned text is not a valid float.
    """
    if not price_text:
        return None

    cleaned = (
        price_text.replace("$", "")
        .replace(".", "")
        .replace(",", ".")
        .replace("\xa0", "")
        .strip()
    )
    try:
        return float(cleaned)
    except ValueError:
        # Non-numeric text means "no price"; it should not abort the scrape.
        return None

In [117]:
# Product data extraction

def extract_product_data_farmacity(product, category, subcategory):
    """Build one result row from a Farmacity search-result element.

    Args:
        product: Parsed product element (queried with ``select_one``,
            ``select`` and ``find``).
        category: Value stored in the "category" field.
        subcategory: Value stored in the "subcategory" field.

    Returns:
        dict | None: The product row, or None when the element has no name
        span or no link. Availability is always reported as "In stock".
    """
    title_el = product.select_one("span.vtex-product-summary-2-x-productBrand")
    anchor_el = product.find("a", href=True)

    # Without a name or a link the element cannot be identified.
    if not (title_el and anchor_el):
        return None

    brand_el = product.select_one(
        "div.vtex-product-summary-2-x-productBrandContainer "
        "span.vtex-product-summary-2-x-productBrandName"
    )
    badge_el = product.select_one(
        "p.farmacityar-app-highlights-admin-0-x-discountText"
    )

    # The price is assembled from the integer spans of the selling price.
    integer_spans = product.select(
        "span.vtex-product-price-1-x-sellingPriceValue "
        "span.vtex-product-price-1-x-currencyInteger"
    )
    price = None
    if integer_spans:
        digits = "".join(span.get_text(strip=True) for span in integer_spans)
        try:
            price = float(digits)
        except ValueError:
            price = None

    brand = brand_el.get_text(strip=True) if brand_el else None
    discount = badge_el.get_text(strip=True) if badge_el else None

    return {
        "product_name": title_el.get_text(strip=True),
        "brand": brand,
        "price": price,
        "discount": discount,
        "availability": "In stock",
        "category": category,
        "subcategory": subcategory,
        "product_url": BASE_URL + anchor_el["href"],
        "competitor": COMPETITOR,
        "scraping_date": datetime.today().date(),
    }

In [118]:
# Category-level scraping

def scrape_farmacity_category(url, category, subcategory, timeout=30):
    """Scrape the products present in the HTML of one Farmacity listing page.

    Only the markup returned by a single GET request is parsed.

    Args:
        url: Listing page URL to download.
        category: Value stored in the "category" field of each row.
        subcategory: Value stored in the "subcategory" field of each row.
        timeout: Seconds to wait for the server. Without a timeout, requests
            waits indefinitely on a stalled connection.

    Returns:
        list[dict]: One row per search-result element that yields data.

    Raises:
        requests.HTTPError: If the page responds with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Fail loudly on error responses (as the other scrapers in this notebook
    # do) instead of parsing an error page and reporting 0 products.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    products = soup.select("div[data-af-element='search-result']")

    results = []

    for product in products:
        data = extract_product_data_farmacity(product, category, subcategory)
        if data:
            results.append(data)

    print(f"{subcategory}: {len(results)} products extracted")
    return results

In [119]:
# Complete Farmacity scraping

# Accumulates the product rows of every Farmacity category
# (replaces the list built by earlier sections of the notebook)
all_products = []

for cat in categories:
    # Scrape the listing page of this category entry
    products = scrape_farmacity_category(
        url=cat["url"],
        category=cat["category"],
        subcategory=cat["subcategory"]
    )
    all_products.extend(products)

corporal: 7 products extracted
rostro: 7 products extracted
solar: 7 products extracted
cuidado-oral: 7 products extracted
pelo: 7 products extracted


In [120]:
# Export to CSV

# Date stamp (YYYY-MM-DD) used in the output file name
today = date.today().isoformat()

# One row per scraped product
df = pd.DataFrame(all_products)
df.to_csv("products_farmacity_" + today + ".csv", index=False, encoding="utf-8-sig")

print(f"\nTotal products extracted: {len(df)}")
print("CSV file saved successfully.")


Total products extracted: 35
CSV file saved successfully.


#### Farmacity uses JavaScript-based dynamic rendering with infinite scroll (VTEX). This script extracts only the products available in the initial HTML load, ensuring scraping stability and reproducibility. While product names and URLs can be retrieved through traditional HTML scraping, prices, brands, and discounts are dynamically loaded on the client side and are not consistently accessible.

# **Competitor D: Farmalife**

In [121]:
# Configuration

# Headers passed to requests.get
HEADERS = {
    "User-Agent": "Mozilla/5.0"
}

BASE_URL = "https://www.farmalife.com.ar"
# Search endpoint queried by scrape_farmalife_category. It was not defined
# anywhere in the notebook, so a fresh kernel failed with NameError.
BASE_API = f"{BASE_URL}/api/catalog_system/pub/products/search"
# Label stored in the "competitor" field of every Farmalife row
COMPETITOR = "Farmalife"

# Categories to scrape, with the labels stored per row
categories = [
    {
        "category": "dermo",
        "subcategory": "corporal",
        "url": "https://www.farmalife.com.ar/dermo/corporal"
    },
    {
        "category": "cuidado-personal",
        "subcategory": "crema-facial",
        "url": "https://www.farmalife.com.ar/cuidado-personal/cuidado-de-la-piel/crema-facial"
    },
    {
        "category": "dermo",
        "subcategory": "protector-solar",
        "url": "https://www.farmalife.com.ar/dermo/protector-solar"
    },
    {
        "category": "cuidado-personal",
        "subcategory": "cuidado-oral",
        "url": "https://www.farmalife.com.ar/cuidado-personal/cuidado-oral"
    },
    {
        "category": "cuidado-personal",
        "subcategory": "capilar",
        "url": "https://www.farmalife.com.ar/cuidado-personal/capilar"
    }
]

In [122]:
# Product data extraction

def extract_product_data_farmalife(product, category, subcategory):
    """Build one result row from a Farmalife product element.

    Args:
        product: Parsed product element (queried with ``select_one``).
        category: Value stored in the "category" field.
        subcategory: Value stored in the "subcategory" field.

    Returns:
        dict | None: The product row, or None when the element has no name
        span or no product link. The brand is taken as the first word of the
        product name and availability is always reported as "In stock".
    """
    name_tag = product.select_one(
        "span.vtex-product-summary-2-x-productBrand"
    )

    link_tag = product.select_one(
        "a.vtex-product-summary-2-x-clearLink"
    )

    if not name_tag or not link_tag:
        return None

    price_container = product.select_one(
        "span.vtex-product-price-1-x-sellingPriceValue"
    )

    price = None
    if price_container:
        integers = price_container.select(
            "span.vtex-product-price-1-x-currencyInteger"
        )
        fraction = price_container.select_one(
            "span.vtex-product-price-1-x-currencyFraction"
        )

        # Join every integer group. Requiring exactly the first two groups
        # left prices with a single group as None and dropped any group
        # after the second one.
        if integers:
            whole = "".join(part.get_text(strip=True) for part in integers)
            number = f"{whole}.{fraction.get_text(strip=True)}" if fraction else whole
            try:
                price = float(number)
            except ValueError:
                # Unexpected non-numeric content: report "no price".
                price = None

    discount_tag = product.select_one(
        "span.vtex-product-price-1-x-savingsPercentage--shelf"
    )

    discount = discount_tag.get_text(strip=True) if discount_tag else None

    product_name = name_tag.get_text(strip=True)

    return {
        "product_name": product_name,
        "brand": product_name.split(" ")[0],
        "price": price,
        "discount": discount,
        "availability": "In stock",
        "category": category,
        "subcategory": subcategory,
        "product_url": BASE_URL + link_tag["href"],
        "competitor": COMPETITOR,
        "scraping_date": datetime.today().date()
    }

In [123]:
# Category-level scraping

def scrape_farmalife_category(cat, timeout=30):
    """Collect the products of one Farmalife category through the search API.

    The subcategory name (dashes replaced by spaces) is sent as the ``ft``
    search term and results are requested in windows of 50 via ``_from`` /
    ``_to`` until the API returns an empty list.

    Args:
        cat: Dict with at least the "category" and "subcategory" keys.
        timeout: Seconds to wait for the server on each request. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        list[dict]: One row per product that has at least one item with a
        seller.

    Raises:
        requests.HTTPError: If a request responds with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    products = []
    step = 50
    start = 0

    while True:
        params = {
            "ft": cat["subcategory"].replace("-", " "),
            "_from": start,
            "_to": start + step - 1
        }

        response = requests.get(BASE_API, headers=HEADERS, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        # An empty page marks the end of the result set.
        if not data:
            break

        for p in data:
            items = p.get("items") or []
            sellers = (items[0].get("sellers") or []) if items else []
            # No item or no seller means there is no offer to price; skip the
            # product instead of aborting the whole category on IndexError.
            if not sellers:
                continue

            seller = sellers[0]["commertialOffer"]

            price = seller["Price"]
            list_price = seller["ListPrice"]
            discount = None

            # Discount is the rounded percentage below the list price.
            if list_price and list_price > price:
                discount = f"{round((1 - price / list_price) * 100)}%"

            products.append({
                "product_name": p["productName"],
                "brand": p.get("brand"),
                "price": price,
                "discount": discount,
                "availability": "In stock" if seller["AvailableQuantity"] > 0 else "Out of stock",
                "category": cat["category"],
                "subcategory": cat["subcategory"],
                # NOTE(review): assumes p["link"] is a site-relative path - confirm against the API response
                "product_url": f"https://www.farmalife.com.ar{p['link']}",
                "competitor": COMPETITOR,
                "scraping_date": datetime.today().date()
            })

        start += step

    print(f"{cat['category']} / {cat['subcategory']} - products: {len(products)}")
    return products

In [124]:
# Complete Farmalife scraping

# Accumulates the product rows of every Farmalife category
# (replaces the list built by earlier sections of the notebook)
all_products = []

for cat in categories:
    # Each call pages through the search results of one category entry
    all_products.extend(scrape_farmalife_category(cat))

dermo / corporal - products: 406


HTTPError: 400 Client Error: Bad Request for url: https://www.farmalife.com.ar/api/catalog_system/pub/products/search?ft=crema+facial&_from=0&_to=49

#### Farmalife uses VTEX with dynamic rendering and unstable internal endpoints, making HTML or API scraping with Requests non-reproducible and unreliable.