# Парсинг
> Многие сайты используют JS-рендеринг. Простой HTTP-запрос в таком случае получит только HTML-«каркас», без данных, которые подгружаются скриптами.

> Модуль `selenium` позволяет программно управлять реальным веб-браузером. Браузер сам выполняет JavaScript, поэтому в итоговом HTML уже есть нужные данные.

In [3]:
!pip install selenium --break-system-packages


Defaulting to user installation because normal site-packages is not writeable
Collecting selenium
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting urllib3<3.0,>=2.5.0 (from urllib3[socks]<3.0,>=2.5.0->selenium)
  Using cached urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing_extensions~=4.14.0 (from selenium)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting sortedcontainers (from trio~=0.30.0->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-p

## Пробуем `requests`

In [24]:
import requests
from bs4 import BeautifulSoup

# User-Agent of a desktop Firefox, sent instead of the default requests one.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Gecko/20100101 Firefox/143.0'
}

# Plain HTTP request: no JavaScript is executed on our side, so only the
# static HTML comes back. The timeout keeps the cell from hanging forever
# if the server never answers.
response = requests.get(
    'https://www.wildberries.ru/catalog/knigi/hudozhestvennaya-literatura/fantastika-i-fentezi',
    headers=headers,
    timeout=30,
)
print(f"Status Code: {response}")

html = response.text
# Name the parser explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(html, 'html.parser')
print(soup.prettify())

Status Code: <Response [200]>
<!DOCTYPE html>
<html class="adaptive" lang="ru" translate="no">
 <head>
  <meta charset="utf-8"/>
  <meta content="telephone=no" name="format-detection"/>
  <meta content="no-referrer-when-downgrade" name="referrer"/>
  <meta content="width=device-width,initial-scale=1" name="viewport"/>
  <meta content="Wildberries" name="application-name"/>
  <meta content="61b95d33285e2612" name="yandex-verification"/>
  <meta content="guMau7oNvBifqUhZGuuGVtiGD0HkKe8v6uwYpvj2ZIg" name="google-site-verification"/>
  <meta content="app-id=597880187" name="apple-itunes-app"/>
  <link href="/manifestv2.json" rel="manifest"/>
  <link href="/yandex-tableau-manifest.json" rel="yandex-tableau-widget"/>
  <link href="/opensearch.xml" rel="search" title="test" type="application/opensearchdescription+xml"/>
  <link href="/favicon.ico" rel="icon" type="image/vnd.microsoft.icon"/>
  <link href="/icon.svg" rel="icon" type="image/svg+xml"/>
  <meta content="website" property="og:type

## Пробуем `selenium`

In [9]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import time
import re

### Получаем payload

In [19]:
def GetSoupFromPage(driver, page : int, delay : float, max_passes : int = 50):
    """Open one catalogue page in the browser, scroll it, and return it parsed.

    The page is scrolled down in small steps, repeatedly, until the document
    height stops changing, and only then is the HTML captured.

    Args:
        driver: Selenium WebDriver used to load and scroll the page.
        page: Page number substituted into the ``page`` query parameter.
        delay: Accepted for interface compatibility; not used at present
            (every pause below is a fixed value).
        max_passes: Upper bound on scroll passes, so that a page whose
            height never stops growing cannot hang the loop.

    Returns:
        A BeautifulSoup tree built from ``driver.page_source``.
    """
    print(f"Getting payload from page {page}...")
    driver.get(f'https://www.wildberries.ru/catalog/knigi/hudozhestvennaya-literatura/fantastika-i-fentezi?page={page}&sort=popular')
    # Pause before the first height measurement.
    time.sleep(2)
    # Scroll target is 80% of the current document height (read via JS).
    last_h = 0.8 * driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_passes):
        # Walk down to the target in five equal steps.
        step = last_h / 5
        for i in range(5):
            driver.execute_script(f"window.scrollTo(0, {step * (i+1)});")
            time.sleep(0.3)

        time.sleep(0.5)
        new_h = 0.8 * driver.execute_script("return document.body.scrollHeight")
        # Height unchanged after a full pass: nothing more was appended.
        if new_h == last_h:
            break
        last_h = new_h
    # Capture the rendered document.
    html = driver.page_source
    # Explicit parser: the result no longer depends on which optional
    # parsers are installed.
    soup = BeautifulSoup(html, 'html.parser')
    print(f"Getting payload from page {page}: Done!")
    return soup

### Ищем нужное с помощью `BeautifulSoup`

In [11]:
from dataclasses import dataclass


# eq=False keeps identity-based comparison and hashing, exactly as a plain
# class would behave; the dataclass only adds a constructor and a repr.
@dataclass(eq=False)
class Product:
    """One product record collected from a catalogue page.

    Every field has a default, so ``Product()`` followed by attribute
    assignment works as well as ``Product(id_=..., title=..., ...)``.
    """
    id_ : int = 0              # numeric product identifier
    title : str = ""           # product title
    discount_price : int = 0   # price after discount
    full_price : int = 0       # price before discount
    stars : float = -1         # rating; -1 means "no rating"

In [12]:
def _tag_text(tag) -> str:
    """Return the tag's inner markup as text with non-breaking spaces removed."""
    return tag.encode_contents().decode('UTF-8').replace('\xa0', '')


def _parse_price(tag):
    """Return the integer price shown in *tag*, or None if it cannot be read."""
    if tag is None:
        return None
    # Remove all whitespace, then any trailing non-digit suffix (the price
    # text ends with a non-numeric character, presumably the currency sign).
    text = ''.join(_tag_text(tag).split())
    while text and not text[-1].isdigit():
        text = text[:-1]
    try:
        return int(text)
    except ValueError:
        return None


def addSoupToProducts(soup, products):
    """Extract products from the ``<article>`` cards of *soup*.

    A card is skipped when its id, title, price block, discounted price or
    rating element is missing or unreadable; one malformed card never aborts
    the rest of the page.

    Args:
        soup: Parsed page (BeautifulSoup tree) to search for cards.
        products: List that receives a ``Product`` for every accepted card.

    Returns:
        The number of products appended to *products*.
    """
    added = 0
    for card in soup.find_all('article'):
        # id: the attribute value minus its first character must be a number
        sid = card.get('id')
        if not sid:
            continue
        try:
            product_id = int(sid[1:])
        except ValueError:
            continue

        # Title: taken from the aria-label of the first link in the card
        title_tag = card.find('a')
        if title_tag is None:
            continue
        title = title_tag.get('aria-label')
        if not title:
            continue

        # Price block
        price_tag = card.find('span', class_="price__wrap")
        if price_tag is None:
            continue
        ## Discounted price (<ins>); without it the card is skipped
        discount_price = _parse_price(price_tag.find('ins'))
        if discount_price is None:
            continue
        ## Full price (<del>); falls back to the discounted price
        full_price = _parse_price(price_tag.find('del'))
        if full_price is None:
            full_price = discount_price

        # Rating
        stars_tag = card.find('span', class_='address-rate-mini address-rate-mini--sm')
        if stars_tag is None:
            continue
        stars_text = _tag_text(stars_tag).strip()
        try:
            # Decimal comma -> decimal point before conversion
            stars = float(stars_text.replace(',', '.')) if stars_text else -1
        except ValueError:
            stars = -1  # unreadable rating is treated like a missing one

        product = Product()
        product.id_ = product_id
        product.title = title
        product.discount_price = discount_price
        product.full_price = full_price
        product.stars = stars
        products.append(product)
        added += 1
    return added



In [23]:
# Sort by price and, on request, apply a small filter
def FiltrateProducts(products, apply_filter=False, min_price=99, max_price=10_000, min_stars=1):
    """Sort *products* by discounted price in place; optionally filter them.

    Filtering is opt-in: called with the list alone, the function only sorts,
    so existing single-argument calls keep their result.

    Args:
        products: List of objects with ``discount_price`` and ``stars``.
        apply_filter: When true, drop products outside the limits below.
        min_price: Exclusive lower bound on ``discount_price``.
        max_price: Exclusive upper bound on ``discount_price``.
        min_stars: Minimum ``stars`` value a product needs to be kept.

    Returns:
        None; *products* is modified in place.
    """
    # Sort by price
    products.sort(key=lambda p: p.discount_price)
    if not apply_filter:
        return
    # NOTE(review): keep-condition inferred as "price within range and rated
    # at least min_stars" — confirm this is the intended filter.
    # Slice assignment rebuilds the list in place instead of removing items
    # while iterating over it, which would skip elements.
    products[:] = [
        p for p in products
        if min_price < p.discount_price < max_price and p.stars >= min_stars
    ]

### Сохраняем данные в `csv` таблицу

In [14]:
import csv

In [15]:
def SaveProductsToCsv(products, filename : str):
    """Write *products* to a ``;``-separated UTF-8 CSV file with a header row.

    Args:
        products: Iterable of objects exposing ``id_``, ``title``,
            ``discount_price``, ``full_price`` and ``stars``.
        filename: Path of the file to create or overwrite.
    """
    # Report the actual target file rather than a fixed name.
    print(f"[1]     Saving to {filename}...")
    rows = 0
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter=';')
        writer.writerow(["#ID", "TITLE", "DISCOUNT PRICE", "FULL PRICE", "STARS"])
        for rows, product in enumerate(products, start=1):
            # Semicolons in titles are replaced so the title never contains
            # the column delimiter.
            writer.writerow([product.id_, product.title.replace(';', ','), product.discount_price, product.full_price, product.stars])
    print(f"[1]     Saving done! ({rows} rows)")

## Основная часть


In [16]:
# Catalogue page numbers to scrape: 1 through 5.
pages = [*range(1, 6)]
print(pages)

[1, 2, 3, 4, 5]


### Запуск selenium

In [17]:
# Selenium settings
options = Options()
options.add_argument("--headless") # run without a GUI

print('Starting browser...')
# Create the driver (this launches the browser)
driver = webdriver.Firefox(options=options)
print('Browser started!')

Starting browser...
Browser started!


### Получение данных

In [18]:
# Accumulates the products collected from every page in `pages`.
products : list[Product] = []
for page in pages:
    # Fetch the rendered page as a parsed tree
    soup = GetSoupFromPage(driver, page, delay=1)
    # Extract the data we need and append it to the list;
    # `i` is the number of products added from this page
    i = addSoupToProducts(soup, products)
    print(f"From page {page} added {i} products")

Getting payload from page 1...
Getting payload from page 1: Done!
From page 1 added 89 products
Getting payload from page 2...
Getting payload from page 2: Done!
From page 2 added 89 products
Getting payload from page 3...
Getting payload from page 3: Done!
From page 3 added 90 products
Getting payload from page 4...
Getting payload from page 4: Done!
From page 4 added 89 products
Getting payload from page 5...
Getting payload from page 5: Done!
From page 5 added 89 products


### Сохранение в файл

In [24]:
# Filter (called with the list alone, this only sorts it by price)
FiltrateProducts(products)

# Save to a machine-readable file
SaveProductsToCsv(products, 'out.csv')

print('[0] Done!')

[1]     Saving to out.csv...
[1]     Saving done! (446 rows)
[0] Done!


### Завершаем работу selenium (важно!!!)

In [25]:
# Close the browser
driver.quit()