In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException, ElementNotInteractableException, StaleElementReferenceException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromiumService

from pandas import DataFrame, read_html, concat, ExcelFile, ExcelWriter, Series
from pandas import isna as pd_isna
from pandas import merge as pd_merge
from math import isnan as math_isnan

from random import uniform as random_uniform
# from random import randint as random_randint
from time import sleep as time_sleep

from datetime import datetime as dt

from win32clipboard import OpenClipboard, EmptyClipboard, SetClipboardText, CF_UNICODETEXT, CloseClipboard
from pyautogui import hotkey as pyt_hotkey

from re import findall as re_findall
from re import search as re_search

from tqdm.notebook import tqdm

from shutil import copyfile as shutil_copyfile



ФУНКЦИИ

In [2]:
def random_sleep(upper_bound:int, lower_bound=0):
    """Pause execution for a random number of seconds.

    The duration is drawn uniformly between ``lower_bound`` and
    ``upper_bound``; nothing is returned.
    """
    time_sleep(random_uniform(lower_bound, upper_bound))

In [3]:
def get_browser(browser_full_path: str,
                driver_full_path: str,
                wait_secs: int=10):
    """Launch a Chrome-compatible browser under Selenium control.

    Args:
        browser_full_path: Path of the browser binary to start.
        driver_full_path: Path of the chromedriver executable.
        wait_secs: Implicit wait (seconds) applied to element lookups.

    Returns:
        The started ``webdriver.Chrome`` instance with the implicit wait set.
    """
    options = webdriver.ChromeOptions()
    # NOTE(review): Chrome's documented switch is "--user-data-dir"; the
    # "--user-data" spelling below is presumably ignored by the browser, and
    # the path is machine-specific. Left unchanged here - confirm the intent
    # before switching, because pointing at a real profile changes behaviour.
    options.add_argument(r"--user-data=C:\Users\tabakaev_mv\AppData\Local\Yandex\YandexBrowser\User Data\Default")
    options.binary_location = browser_full_path
    options.add_argument("--disable-notifications")
    options.add_argument("--disable-web-security")
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    # Selenium 4 takes the driver location through a Service object; the
    # `executable_path=` keyword of webdriver.Chrome is deprecated there and
    # rejected by later 4.x releases.
    driver_service = ChromiumService(executable_path=driver_full_path)
    driver = webdriver.Chrome(service=driver_service,
                              options=options)
    driver.implicitly_wait(wait_secs)
    return driver

In [4]:
def close_browser(driver):
    """Wait for the user to press Enter, then shut the Selenium browser down.

    ``quit()`` is used rather than ``close()``: ``close()`` only closes the
    current window, while ``quit()`` ends the whole session together with
    the driver process. Any error raised while shutting down is reported
    with a printed message instead of being propagated.

    Args:
        driver: The Selenium WebDriver instance to shut down.
    """
    input('После нажатия клавиши браузер будет закрыт!')
    try:
        driver.quit()
        print('Инициализация закрытия Selenium-браузера прошла успешно!')
    except Exception:
        # Best effort: the browser may already be gone. A bare `except:` would
        # also have trapped KeyboardInterrupt / SystemExit.
        print('Ошибка закрытия Selenium-браузера')

In [5]:
def delete_nan_rows(df: DataFrame):
    """Remove rows in which every column is missing and renumber the index.

    The frame passed in is modified in place and is also returned. The
    labels of the rows being removed and the frame's shape before and after
    the removal are printed along the way.

    Args:
        df: Frame to clean.

    Returns:
        The same ``df`` object, without fully-empty rows and with a fresh
        0..n-1 index.
    """
    # Labels of the rows where all columns are null.
    fully_empty = df.isnull().all(axis=1)
    empty_row_labels = list(df.index[fully_empty])
    print(empty_row_labels)
    if len(empty_row_labels) > 0:
        # This is the number of rows to remove, not a single row.
        print(f'Необходимо удалить <{len(empty_row_labels)}> строк ДФ!')
    print()
    print(f'Размерность ДФ ДО УДАЛЕНИЯ i строк: <{df.shape}>')
    print()
    df.drop(index=empty_row_labels, inplace=True)
    # The old labels are discarded so positions and labels match again.
    df.reset_index(drop=True, inplace=True)
    print(f'Размерность ДФ ПОСЛЕ УДАЛЕНИЯ i строк: <{df.shape}>')
    return df

In [6]:
def parse_part_df(driver,
                  df_page_index:int=1,
                  df_colnames_page_index:int=0,
                  need_delete_nans:bool=True):
    """Parse one HTML table from the page currently loaded in ``driver``.

    All tables on the page are read with ``pandas.read_html``. The table at
    ``df_page_index`` supplies the data, and the table at
    ``df_colnames_page_index`` supplies the column names.

    The page should already show all required rows and be fully loaded
    before this is called.

    Args:
        driver: Selenium WebDriver whose ``page_source`` is parsed.
        df_page_index: Position of the data table among the page's tables.
        df_colnames_page_index: Position of the table whose header is used
            as the column names.
        need_delete_nans: When true, fully-empty rows are removed with
            ``delete_nan_rows``.

    Returns:
        Tuple ``(df, row_count)``.

    Raises:
        IndexError: If either index is outside the list of parsed tables.
        ValueError: If the number of column names does not match the data
            table's width (``read_html`` also raises it when the page has
            no tables).
    """
    # Local import keeps the helper self-contained.
    from io import StringIO

    # read_html expects a path/URL or a file-like object; passing the markup
    # as a plain string is deprecated in recent pandas releases.
    page_tables = read_html(StringIO(driver.page_source))
    df = DataFrame(page_tables[df_page_index])
    df.columns = list(page_tables[df_colnames_page_index].columns)
    if need_delete_nans:
        df = delete_nan_rows(df=df)
    return df, df.shape[0]

In [7]:
def get_element_title(driver, xpath, timeout = 15):
    """Return the ``title`` attribute of the element located by ``xpath``.

    Args:
        driver: Selenium WebDriver to search in.
        xpath: XPath expression locating the element.
        timeout: Maximum number of seconds to wait for the element to be
            present in the DOM.

    Returns:
        The value of the element's ``title`` attribute, or ``None`` (after
        printing a message) when the element does not appear in time.
    """
    locator = (By.XPATH, xpath)
    try:
        # until() returns the element it waited for, so a second
        # find_element call (which could race with page updates) is not needed.
        found_element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(locator)
        )
    except TimeoutException:
        print("Timed out waiting for page to load")
        return None
    return found_element.get_attribute('title')

In [8]:
def get_element_by_xpath(driver, xpath, timeout = 15):
    """Wait for the element located by ``xpath`` and return it.

    Args:
        driver: Selenium WebDriver to search in.
        xpath: XPath expression locating the element.
        timeout: Maximum number of seconds to wait for the element to be
            present in the DOM.

    Returns:
        The located element, or ``None`` (after printing a message) when it
        does not appear in time.
    """
    locator = (By.XPATH, xpath)
    try:
        # The wait already yields the located element; looking it up again
        # afterwards would be redundant and could fail if the page changed.
        return WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(locator)
        )
    except TimeoutException:
        print("Timed out waiting for page to load")
        return None

In [14]:
def get_element_text(driver,
                     css_selector: str,
                     timeout: int = 3):
    """Return the visible text of the element matching ``css_selector``.

    Args:
        driver: Selenium WebDriver to search in.
        css_selector: CSS selector locating the element.
        timeout: Maximum number of seconds to wait for the element to be
            present in the DOM.

    Returns:
        The element's ``text``, or an empty string (after printing a
        message) when the element is not found in time.
    """
    locator = (By.CSS_SELECTOR, css_selector)
    try:
        # until() returns the located element, so no second lookup is made.
        found_element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(locator)
        )
    # Several exception types must be given as a tuple:
    # `except A or B` evaluates to `except A` and never catches B.
    except (TimeoutException, NoSuchElementException):
        print("Timed out waiting for page to load")
        return ""
    return found_element.text

In [15]:
def get_bg_content(driver, css_selector_info: dict):
    """Collect the text of every element described in ``css_selector_info``.

    Args:
        driver: Selenium WebDriver with the target page already loaded.
        css_selector_info: Mapping of field name to a list whose first item
            is the CSS selector of the element to read.

    Returns:
        A new dict with the same keys; each value is a copy of the input
        list with the text returned by ``get_element_text`` appended. The
        input mapping is left untouched, so calling this again (for example
        when re-running a notebook cell) does not pile up extra entries.

    Raises:
        ValueError: If ``css_selector_info`` is not a dict.
    """
    if not isinstance(css_selector_info, dict):
        raise ValueError("Неверный тип переданного аргумента <css_selector_info>!")
    return {
        field: [*selector_items,
                get_element_text(driver=driver, css_selector=selector_items[0])]
        for field, selector_items in css_selector_info.items()
    }

In [43]:
# Field name -> one-element list holding the CSS selector of that field on a
# product page. Alternative selectors that were tried are kept as notes.
css_selectors = {
    field: [selector]
    for field, selector in (
        ("bg_name", ".product-title h1"),
        ("bg_round_duration", ".product-resume .time p span"),
        ("bg_players", ".product-resume .players p span"),
        ("bg_recommended_age", ".product-resume .age p span"),
        ("bg_languague", ".product-resume .lang p span"),
        ("bg_brand", ".product-resume .manufacturer p span"),
        ("bg_range", ".product-resume .range p"),
        ("bg_complect", ".complect .lists ul ul"),
        # tried for bg_about: ".left-content .product-block .about full-height"
        ("bg_about", ".left-content section div"),
        ("bg_price", ".right-content section div span"),
        # tried for bg_price: ".right-content .buy .flip-front flip-front--price-block .price"
        ("href_to_original", ".labels-big"),
        ("href_rules", ".rules .document a[href$='.pdf']"),
        ("bg_tags", ".tags p"),
        ("page_content", "html"),
    )
}

In [44]:
%%time
# Product page to scrape.
start_html = "https://hobbyworld.ru/kragmorta"
# The driver binary is resolved by webdriver_manager; the context manager
# shuts the browser down when the block is left.
chrome_service = ChromiumService(ChromeDriverManager().install())
with webdriver.Chrome(service=chrome_service) as driver:
    driver.get(start_html)
    result = get_bg_content(driver=driver, css_selector_info=css_selectors)
    # Keeps the browser window open until Enter is pressed.
    input()

Timed out waiting for page to load

Wall time: 2min 31s


In [46]:
# Show the scraped result, one (field, [selector, text]) pair per line.
print(*result.items(), sep="\n")

('bg_name', ['.product-title h1', 'Крагморта'])
('bg_round_duration', ['.product-resume .time p span', 'от 20 до 40 минут'])
('bg_players', ['.product-resume .players p span', '2-8 человек'])
('bg_recommended_age', ['.product-resume .age p span', '8 лет и старше'])
('bg_languague', ['.product-resume .lang p span', 'Русский'])
('bg_brand', ['.product-resume .manufacturer p span', 'Hobby World'])
('bg_range', ['.product-resume .range p', ''])
('bg_complect', ['.complect .lists ul ul', 'Сам Ригор Мортис, Грандиозный и Устрашающий Архимаг – 1 персона\n1 Библиотека Ригора Мортиса\n1 рабочий стол\n8 Гоблинов разных цветов\n8 жетонов Гоблинов разных цветов\n6 жетонов Телепортов\n6 книжных полок\n24 карты Испепеляющего Взора\n58 карт Движения\n18 карт Книг Магии\nПравила игры\n  Размер коробки: 277х194х67 мм\nРазмер карт: 63x88 мм '])
('bg_about', ['.left-content section div', 'Крагморта'])
('bg_price', ['.right-content section div span', '1 490 руб.'])
('href_to_original', ['.labels-big', '']

ЧЕРНОВИК!!!

In [None]:
# with webdriver.Chrome(service=ChromiumService(ChromeDriverManager().install())) as driver:
#     driver.get(start_html)
#     try:
#         df_1, df_1_rows = parse_part_df(driver, df_page_index=1)
#     except ValueError as e:
#         print(f"{e} Probably no tables found")
#     try:
#         df_2, df_2_rows = parse_part_df(driver, df_page_index=2)
#     except ValueError as e:
#         print(f"{e} Probably no tables found")
#     try:
#         df_3, df_3_rows = parse_part_df(driver, df_page_index=3)
#     except ValueError as e:
#         print(f"{e} Probably no tables found")
#     try:
#         df_0, df_0_rows = parse_part_df(driver, df_page_index=0)
#     except ValueError as e:
#         print(f"{e} Probably no tables found")
#     df_2, df_2_rows = parse_part_df(driver, df_page_index=2)
#     df_3, df_3_rows = parse_part_df(driver, df_page_index=3)

In [23]:
# xpath_elem = "/html/body/div[1]/section[2]/div/div[1]/section/div[2]/div[2]/div/div[1]/p/span/text()"

In [None]:
# %%time
# start_html = "https://hobbyworld.ru/kragmorta"
# browser_path = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
# chrome_driver_path = r"C:\ChromeDriver\chromedriver.exe"
# # driver = get_browser(browser_full_path=browser_path,
# #            driver_full_path=chrome_driver_path)
# # input('Для закрытия браузера жмакаем Enter!')
# with webdriver.Chrome(service=ChromiumService(ChromeDriverManager().install())) as driver:
#     driver.get(start_html)
# #     el_xpath = get_element_by_xpath(driver=driver, xpath=xpath_elem)
# #     try:
# #         print(el_xpath.text)
# #     except:
# #         print(el_path)
#     element = driver.find_element(By.CSS_SELECTOR, ".product-resume .time p span")
# #     page_source = driver.get_source()
# #     print(page_source)
# #     element = driver.find_element_by_css_selector('.product-resume .time p span')
#     print(element.text)
# #     print(element)
#     input()

In [7]:
# /html/body/div[1]/section[2]/div/div[1]/section/div[2]/div[2]/div/div[1]/p/span/text()

In [None]:
# element = driver.find_element_by_css_selector('.product-resume .time p span')