### **3.1. Этап 1. Формирование экспериментального корпуса текстов**

In [1]:
import os
import time
import json
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

BASE_URL = 'https://ria.ru/lenta/'
WORD_LIMIT = 6000
PROCESSED_ARTICLES = set()

def fetch_and_parse_article(article_url):
    driver.get(article_url)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    title_tag = soup.find('div', {'class': 'article__title'})
    text_tags = soup.find_all('div', {'class': 'article__text'})
    date_time_tag = soup.find('div', {'class': 'article__info-date'})
    category_tag = soup.find('span', {'class': 'breadcrumbs__item-link-text'})
    
    data = {
        'header': title_tag.text.strip() if title_tag else None,
        'text': '\n'.join([tag.text.strip() for tag in text_tags]),
        'date': date_time_tag.text.strip() if date_time_tag else None,
        'url': article_url,
        'category': category_tag.text.strip() if category_tag else ''
    }
    
    word_count = len(data['text'].split())
    return data, word_count

def collect_new_articles(current_articles):
    new_articles = []
    for article_link in current_articles:
        href = article_link['href']
        if href not in PROCESSED_ARTICLES:
            try:
                article_data, _ = fetch_and_parse_article(href)
                new_articles.append(article_data)
                PROCESSED_ARTICLES.add(href)
            except Exception as e:
                print(f'Ошибка обработки статьи {href}: {e}')
    return new_articles

def main():
    driver.get(BASE_URL)
    wait = WebDriverWait(driver, 10)
    
    # Начинаем с начальной страницы
    initial_page_source = driver.page_source
    soup = BeautifulSoup(initial_page_source, 'html.parser')
    initial_articles = soup.select('div.list-item__content a.list-item__title')
    
    # Сначала обработаем начальные статьи
    all_articles = collect_new_articles(initial_articles)
    current_word_count = sum(len(article['text'].split()) for article in all_articles)
    
    # Нажимаем кнопку "Показать ещё" и добираемся до лимита слов
    while current_word_count < WORD_LIMIT:
        try:
            # Нажимаем кнопку подгрузки
            load_more_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'list_more')))
            load_more_button.click()
            time.sleep(2)  # Пауза для полной загрузки
            
            # Повторно считываем DOM и находим новые статьи
            updated_page_source = driver.page_source
            soup = BeautifulSoup(updated_page_source, 'html.parser')
            additional_articles = soup.select('div.list-item__content a.list-item__title')
            
            # Добавляем только новые статьи
            new_articles = collect_new_articles(additional_articles)
            all_articles.extend(new_articles)
            
            # Пересчитываем слова
            current_word_count = sum(len(article['text'].split()) for article in all_articles)
        except Exception as e:
            print(f'Ошибка в процессе подгрузки: {e}')
            break
    
    # Сохраняем статьи в JSONL
    with open('articles.jsonl', 'w', encoding='utf-8') as f:
        for article in all_articles:
            f.write(json.dumps(article, ensure_ascii=False) + "\n")
    
    driver.quit()

if __name__ == "__main__":
    main()

Ошибка в процессе подгрузки: Message: 
Stacktrace:
	GetHandleVerifier [0x0x7ff78eb230f5+79493]
	GetHandleVerifier [0x0x7ff78eb23150+79584]
	(No symbol) [0x0x7ff78e8a01ba]
	(No symbol) [0x0x7ff78e8f8067]
	(No symbol) [0x0x7ff78e8f832c]
	(No symbol) [0x0x7ff78e94be27]
	(No symbol) [0x0x7ff78e92074f]
	(No symbol) [0x0x7ff78e948b8b]
	(No symbol) [0x0x7ff78e9204e3]
	(No symbol) [0x0x7ff78e8e8e92]
	(No symbol) [0x0x7ff78e8e9c63]
	GetHandleVerifier [0x0x7ff78ede0dbd+2954061]
	GetHandleVerifier [0x0x7ff78eddb02a+2930106]
	GetHandleVerifier [0x0x7ff78edfb357+3061991]
	GetHandleVerifier [0x0x7ff78eb3d60e+187294]
	GetHandleVerifier [0x0x7ff78eb4557f+219919]
	GetHandleVerifier [0x0x7ff78eb2c294+116772]
	GetHandleVerifier [0x0x7ff78eb2c449+117209]
	GetHandleVerifier [0x0x7ff78eb12618+11176]
	BaseThreadInitThunk [0x0x7ffab8dd7374+20]
	RtlUserThreadStart [0x0x7ffaba05cc91+33]

