In [1]:
# Shop identifiers; shop_parsing() appends one of them to the
# "https://sbermarket.ru/" base URL to open that shop's page.
shop_names = [
    "metro",
    "lenta",
    "auchan",
    "5ka",
    "samokat",
    "okey",
    "magnit_express",
    "perekrestok",
    "verniy_fd",
    "vkusvill_darkstore",
    "eapteka",
    "perekrestokvprok",
    "vkusvill",
    "holodilnikru",
    "technopark",
    "dixy",
    "winlab",
    "smkt",
    "podruzhkafd",
    "ulybkaradugi",
    "rivegauchefd",
    "eaptekafd",
    "sbermarket_cards",
    "zoopt",
    "ozerki_",
    "aloeapteka",
    "goldfish",
    "lenoblpharm",
    "pervayapomosh",
    "apteka36_6",
    "Nevis",
    "doctorstoletov",
    "hvalovskievody",
    "chay",
    "dekanto",
    "flawery",
    "Sexdollspb",
    "Cvety_Gollandii",
    "roza78_ru",
    "Magic_Flower",
    "Azbukatsvetov",
    "bflorum",
    "5Tsvetov",
    "Fresa_Flora_design_studio",
    "giper",
    "StudiyaBermyakovyh",
    "ByketyMigom",
    "Dos_Flores",
    "Cvetovik",
    "flowersshop",
    "buket-piter",
    "ruby",
    "gryadka-1",
    "love-flower",
    "almond-roses",
    "kaktus24",
    "blumen-bar",
    "florastation",
    "k-a-rcher",
    "tainstvennyy-sad-tsvetov",
]

In [2]:
import time
import random 
import re
import pandas as pd
from selenium.webdriver.common.by import By
from selenium import webdriver

In [3]:
# XPath locators for category titles, navigation controls and product cards.
# Each locator is a (strategy, expression) pair that is unpacked into
# driver.find_element / driver.find_elements.

# Category title rendered as a <span>; read by shop_parsing in its fallback branch.
CATEGORY_NAME_TYPE_1 = (
    By.XPATH,
    "//span[contains(@class, 'RootCatalogItem_text')]",
)

# Category title rendered as an <h3>; read by shop_parsing after the catalog button is clicked.
CATEGORY_NAME_TYPE_2 = (
    By.XPATH,
    "//h3[contains(@class, 'CategoriesMenuListTitle')]",
)

# The catalog button (data-qa='catalog-button') inside its container <div>.
BUTTON_CATALOG = (
    By.XPATH,
    "//div[contains(@class, 'styles_catalogBtnContainer')]/button[@data-qa='catalog-button']",
)

# Navigation link carrying both the NavigationLink_root and toHomeLink classes.
TO_HOME = (
    By.XPATH,
    "//a[contains(@class,'NavigationLink_root') and (contains(@class, 'toHomeLink'))]",
)

# Product title heading of a product card.
PRODUCTS = (
    By.XPATH,
    "//h3[contains(@class, 'ProductCard_title')]",
)

# Inner price <div> nested in the outer price <div> of a product card.
PRICES = (
    By.XPATH,
    "//div[contains(@class, 'CommonProductCard_price')]/div[contains(@class, 'CommonProductCard_price')]",
)

In [4]:
def click_to_element(locator, driver):
    """Find the element matching ``locator``, click it and return it.

    The lookup is delegated to ``find_element``, which also scrolls the
    element into view before the click happens.

    Args:
        locator: ``(strategy, expression)`` pair, e.g. ``(By.XPATH, "...")``.
        driver: WebDriver instance used for the lookup.

    Returns:
        The element that was clicked.
    """
    target = find_element(locator, driver)
    target.click()
    return target

In [5]:
def get_elements_texts(locator, driver):
    """Collect the ``text`` of every element matching ``locator``.

    Args:
        locator: ``(strategy, expression)`` pair, e.g. ``(By.XPATH, "...")``.
        driver: WebDriver instance used for the lookup.

    Returns:
        A list with one text value per matched element, in the order the
        driver returned them (empty list when nothing matches).
    """
    texts = []
    for node in driver.find_elements(*locator):
        texts.append(node.text)
    return texts

In [6]:
def find_element(locator, driver):
    """Locate a single element and scroll it into view.

    Args:
        locator: ``(strategy, expression)`` pair, e.g. ``(By.XPATH, "...")``.
        driver: WebDriver instance used for the lookup.

    Returns:
        The element returned by ``driver.find_element``.
    """
    scroll_script = "return arguments[0].scrollIntoView();"
    found = driver.find_element(*locator)
    # Scroll via JavaScript so the element is inside the viewport for later interaction.
    driver.execute_script(scroll_script, found)
    return found

In [7]:
def _xpath_literal(text):
    """Return ``text`` as a quoted XPath 1.0 string literal.

    XPath 1.0 has no escape character inside string literals, so a value that
    contains both quote kinds has to be assembled with ``concat()``.
    """
    if "'" not in text:
        return f"'{text}'"
    if '"' not in text:
        return f'"{text}"'
    pieces = (f"'{piece}'" for piece in text.split("'"))
    return "concat(" + ", \"'\", ".join(pieces) + ")"


def shop_parsing(shop_name, max_categories=3, pause=5):
    """Scrape product names and prices from random categories of one shop.

    Opens ``https://sbermarket.ru/<shop_name>`` in a new Chrome session, picks
    up to ``max_categories`` random categories, and collects the product titles
    and price texts shown on each category page. The browser is always closed,
    even when scraping fails.

    Args:
        shop_name: Shop identifier appended to the site URL.
        max_categories: Upper bound on the number of categories to visit.
        pause: Seconds to sleep after each navigation step.

    Returns:
        A DataFrame with the columns "Товар" (product title) and "Цена"
        (raw price text). Row indexes restart at 0 for every category; the
        frame is empty when no category could be scraped.

    Raises:
        Exception: Whatever the fallback flow raises (for example a selenium
            lookup error, or a ValueError when the number of titles and prices
            on a page differ) is propagated after the browser has been closed.
    """
    columns = ["Товар", "Цена"]
    frames = []

    def products_parsing(category_names, num, flag):
        """Open the category at ``category_names[num]`` and read its product cards."""
        # Quote the title safely; names containing an apostrophe would otherwise break the XPath.
        text = _xpath_literal(category_names[num])
        if flag:
            category_by_text = (By.XPATH, f"//h3[contains(@class, 'CategoriesMenuListTitle') and text()={text}]")
        else:
            category_by_text = (By.XPATH, f"//span[contains(@class, 'RootCatalogItem_text') and text()={text}]")
        click_to_element(category_by_text, driver)
        time.sleep(pause)
        products = get_elements_texts(PRODUCTS, driver)
        prices = get_elements_texts(PRICES, driver)
        return pd.DataFrame({"Товар": products, "Цена": prices})

    def parse_random_categories(names_locator, flag, back_locator):
        """Visit randomly chosen categories and click ``back_locator`` after each one."""
        category_names = [name for name in get_elements_texts(names_locator, driver) if name]
        category_indexes = random.sample(range(len(category_names)), min(max_categories, len(category_names)))
        for index in category_indexes:
            frames.append(products_parsing(category_names, index, flag))
            click_to_element(back_locator, driver)
            time.sleep(pause)

    driver = webdriver.Chrome()
    try:
        driver.get(f"https://sbermarket.ru/{shop_name}")
        time.sleep(pause)
        # We have to use the try/except construct because of the different behavior
        # of the site in fullscreen and windowed mode: if the catalog-button flow
        # fails, fall back to the category list and the "to home" link.
        try:
            click_to_element(BUTTON_CATALOG, driver)
            parse_random_categories(CATEGORY_NAME_TYPE_2, True, BUTTON_CATALOG)
        except Exception:
            # Frames collected before the failure are kept, as in the original flow.
            parse_random_categories(CATEGORY_NAME_TYPE_1, False, TO_HOME)
    finally:
        # Always release the browser, even if the fallback flow raises too.
        driver.quit()

    if not frames:
        return pd.DataFrame(columns=columns)
    # Concatenate once instead of growing a frame inside the loops.
    return pd.concat(frames)

In [8]:
# Empty result frame with the product-title ("Товар") and price ("Цена") columns.
data = pd.DataFrame(columns=["Товар", "Цена"])

In [9]:
# Scrape three randomly chosen shops and combine their rows into one frame.
# The per-shop frames are collected first and concatenated once, so re-running
# this cell rebuilds `data` from scratch instead of appending to the previous result.
shop_indexes = random.sample(range(len(shop_names)), 3)
shop_frames = []
for index in shop_indexes:
    shop_frames.append(shop_parsing(shop_names[index]))
data = pd.concat(shop_frames, ignore_index=True)

In [10]:
def preprocess_text(text):
    """Normalise one raw price string from the price column.

    Steps, applied in order:
      1. drop the first line (everything up to and including the first newline),
      2. replace every comma with a dot,
      3. remove every " ₽" occurrence,
      4. remove every remaining space character.

    Args:
        text: Raw price text.

    Returns:
        The cleaned price as a string (it is not converted to a number).
    """
    substitutions = (
        (r'^.*\n', ''),
        (r',', '.'),
        (r' ₽', ''),
        (r' ', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [11]:
# Clean every raw price string with preprocess_text, then show the frame as the cell output.
data['Цена'] = data['Цена'].map(preprocess_text)
data

Unnamed: 0,Товар,Цена
0,Букет Тюльпаны микс 15 шт,3751
1,Букет Тюльпаны и нарциссы,4376
2,Букет Тюльпаны микс 51 шт,12501
3,Букет Тюльпаны и нарциссы,5626
4,Букет Тюльпаны микс 25 шт,6001
...,...,...
68,Эффекс трибулус таблетки 60 шт,2281.00
69,Гинкоум ангиопротектор капсулы 40 мг 60 шт,880.00
70,Шампунь Низорал против перхоти для всех типов ...,719.00
71,Шампунь Кето плюс против перхоти для всех типо...,1119.00


In [12]:
# Save the scraped table to Price.csv in the working directory; index=False leaves the row index out of the file.
data.to_csv('Price.csv', index=False)