In [212]:
import datetime
from time import sleep, time

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService

import requests
from bs4 import BeautifulSoup
import csv
from pathlib import Path

In [213]:
filename = "articles_info.csv" # имя файла, в который будем сохранять результат
driver_path = "./chromedriver/chromedriver.exe" # укажите ваш путь к chromedriver, который вы загрузили ранее
base_dir= "./parse" # укажите директорию, в которую будем сохранять файл
user_agent = "Mozilla/5.0 ..." # ваш user-agent, узнать его можно тут: https://юзерагент.рф, смотреть через браузер Chrome
start_time = time() # время начала выполнения программы

In [214]:
def get_load_time(article_url, user_agent):
    #будем ждать 3 секунды, иначе выводить exception и присваивать константное значение
    try:
        # меняем значение заголовка. По умолчанию указано, что это python-код
        headers = {
            "User-Agent": user_agent
        }
        # делаем запрос по url статьи article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        )
        # получаем время загрузки страницы
        load_time = response.elapsed.total_seconds()
    except Exception as e:
        print(e)
        load_time = ">3"
    return load_time

In [215]:
def write_to_file(output_list, filename, base_dir):
    open(Path(base_dir).joinpath(filename), "w")
    for row in output_list:
        with open(Path(base_dir).joinpath(filename), "a") as csvfile:
            fieldnames = ["id", "load_time", "rank", "points", "title", "comments_count", "url"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(row)

In [216]:
def connect_to_base(browser, page_number):
    base_url = "https://news.ycombinator.com/news?p={}".format(page_number)
    for connection_attempts in range(1,4): # совершаем 3 попытки подключения
        try:
            browser.get(base_url)
            # ожидаем пока элемент table с id = 'hnmain' будет загружен на страницу
            # затем функция вернет True иначе False 
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.ID, "hnmain"))
            )
            return True
        except Exception as e:
            print(e)
            print("Error connecting to {}.".format(base_url))
            print("Attempt #{}.".format(connection_attempts))
    return False

In [217]:
def parse_html(html, user_agent):
    soup = BeautifulSoup(html, "html.parser")
    output_list = []
   
    # ищем в объекте soup object id, rank, score и title статьи
    tr_blocks = soup.find_all("tr", class_="athing")
    article = 0
    for tr in tr_blocks:
        article_id = tr.get("id") # id
        article_url = tr.find_all("a")[1]["href"]

        # иногда статья располагается не на внешнем сайте, а на ycombinator
        # тогда article_url у нее не полный, а добавочный, с параметрами.
        # например item?id=200933. Для этих случаев будем добавлять url до полного
        if "item?id=" in article_url or "from?site=" in article_url:
            article_url = f"https://news.ycombinator.com/{article_url}"
        load_time = get_load_time(article_url, user_agent)
        # иногда рейтинга может не быть, поэтому воспользуемся try

        try:
            score = soup.find(id=f"score_{article_id}").string
            title = soup.find_all('a', href=article_url)[0].string
            comments_count = soup.find_all('a', href=("item?id=" + article_id))[1].string
            
            
        except Exception as e:
            print(e)
            score = "0 points"
            title = "no title"
            comments_count = "0 comments"
           
        article_info = {
            "id": article_id,
            "load_time": load_time,
            "rank": tr.span.string,
            "points": score,
            "title": title,# tr.find(class_="titleline").string,
            "comments_count": comments_count,
            "url": article_url
        }
        
        # добавляем информацию о статье в список
        output_list.append(article_info)
        article += 1
    return output_list

вариант для однопоточной работы

In [218]:
'''start_time = time() # время начала выполнения программы

# инициализируем веб драйвер
browser = webdriver.Chrome(
    service=ChromeService(executable_path=driver_path)
)

# перебираем страницы и собираем нужную информацию
for page_number in range(10):
    print("getting page " + str(page_number) + "...")
    if connect_to_base(browser, page_number):
        sleep(5)
        output_list = parse_html(browser.page_source, user_agent)
        write_to_file(output_list, filename, base_dir)

    else:
        print("Error connecting to hacker news")

# завершаем работу драйвера
browser.close()
sleep(1)
browser.quit()
end_time = time()
elapsed_time = end_time - start_time
print("run time: {} seconds".format(elapsed_time))'''

'start_time = time() # время начала выполнения программы\n\n# инициализируем веб драйвер\nbrowser = webdriver.Chrome(\n    service=ChromeService(executable_path=driver_path)\n)\n\n# перебираем страницы и собираем нужную информацию\nfor page_number in range(10):\n    print("getting page " + str(page_number) + "...")\n    if connect_to_base(browser, page_number):\n        sleep(5)\n        output_list = parse_html(browser.page_source, user_agent)\n        write_to_file(output_list, filename, base_dir)\n\n    else:\n        print("Error connecting to hacker news")\n\n# завершаем работу драйвера\nbrowser.close()\nsleep(1)\nbrowser.quit()\nend_time = time()\nelapsed_time = end_time - start_time\nprint("run time: {} seconds".format(elapsed_time))'

вариант для многопоточной работы

In [219]:
from concurrent.futures import ThreadPoolExecutor, wait

# Обернём процедуру парсинга страницы в функцию
def run_process(page_number, filename):
    browser = webdriver.Chrome(
        service=ChromeService(executable_path=driver_path)
    )
    if connect_to_base(browser, page_number):
        sleep(5)
        output_list = parse_html(browser.page_source, user_agent)
        write_to_file(output_list, filename, base_dir)
       
        browser.quit()
    else:
        print("Error connecting to hacker news")
        browser.quit()

# Засечём время выполнения кода
start_time = time()

futures = []

# Запустим процесс парсинга на нескольких потоках одновременно
with ThreadPoolExecutor() as executor:
    for number in range(10):
        futures.append(
            executor.submit(run_process, number, filename)
        )
       
wait(futures)
end_time = time()
elapsed_time = end_time - start_time
print("Elapsed run time: {} seconds".format(elapsed_time))

Message: 
Stacktrace:
	GetHandleVerifier [0x00ED72A3+45731]
	(No symbol) [0x00E62D51]
	(No symbol) [0x00D5880D]
	(No symbol) [0x00D8B940]
	(No symbol) [0x00D8BE0B]
	(No symbol) [0x00DBD1F2]
	(No symbol) [0x00DA8024]
	(No symbol) [0x00DBB7A2]
	(No symbol) [0x00DA7DD6]
	(No symbol) [0x00D831F6]
	(No symbol) [0x00D8439D]
	GetHandleVerifier [0x011E0716+3229462]
	GetHandleVerifier [0x012284C8+3523784]
	GetHandleVerifier [0x0122214C+3498316]
	GetHandleVerifier [0x00F61680+611968]
	(No symbol) [0x00E6CCCC]
	(No symbol) [0x00E68DF8]
	(No symbol) [0x00E68F1D]
	(No symbol) [0x00E5B2C7]
	BaseThreadInitThunk [0x75DB7BA9+25]
	RtlInitializeExceptionChain [0x775CBD3B+107]
	RtlClearBits [0x775CBCBF+191]

Error connecting to https://news.ycombinator.com/news?p=6.
Attempt #1.
Message: 
Stacktrace:
	GetHandleVerifier [0x00ED72A3+45731]
	(No symbol) [0x00E62D51]
	(No symbol) [0x00D5880D]
	(No symbol) [0x00D8B940]
	(No symbol) [0x00D8BE0B]
	(No symbol) [0x00DBD1F2]
	(No symbol) [0x00DA8024]
	(No symbol) [0

In [220]:
import pandas as pd

articles_data = pd.read_csv(
    './parse/articles_info.csv',
    names=["id", "load_time", "rank", "points", "title", "comments_count", "url"],
    encoding='cp1252'
)

articles_data.head()

Unnamed: 0,id,load_time,rank,points,title,comments_count,url
0,38305787,0.719098,91.0,273 points,A game about staring into the eyes of a stranger,84 comments,https://stranger.video/
1,38311453,0.305123,92.0,76 points,"Old engineering simulation games, now open sou...",12 comments,https://github.com/sim-museum/esports-for-engi...
2,38309611,0.720494,93.0,5567 points,OpenAI's board has fired Sam Altman,2465 comments,https://openai.com/blog/openai-announces-leade...
3,38312248,0.725664,94.0,41 points,Why I haven't run an Indiegogo campaign,3 comments,https://computerengineeringforbabies.com/blogs...
4,38323100,1.148624,95.0,4 points,Giagantic AMD APU at SC23 Meet the AMD MI300A ...,discuss,https://www.servethehome.com/giagantic-amd-apu...
