In [9]:
import time, re, json, random
from dataclasses import dataclass
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm.notebook import tqdm


In [10]:
def save_jsonl(json_list, output_file_path):
    """Write an iterable of JSON-serialisable objects to a JSON Lines file.

    Each item is serialised with ``json.dumps`` (non-ASCII characters are
    written as-is, not escaped) and placed on its own line. An existing file
    at ``output_file_path`` is overwritten.

    Args:
        json_list: Iterable of objects accepted by ``json.dumps``.
        output_file_path: Destination path (``str`` or ``Path``). Missing
            parent directories are created.

    Raises:
        TypeError: If an item is not JSON-serialisable.
        OSError: If the file or its parent directory cannot be created.
    """
    output_file_path = Path(output_file_path)
    # The output directory may not exist yet (e.g. on a fresh checkout);
    # create it instead of failing with FileNotFoundError at save time.
    output_file_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file_path, 'w', encoding="utf-8") as output_file:
        for sample in json_list:
            output_file.write(json.dumps(sample, ensure_ascii=False) + '\n')

def read_jsonl(read_file_path):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        read_file_path: Path of a UTF-8 encoded file holding one JSON
            document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    with open(read_file_path, encoding="utf-8") as f:
        # Blank lines (typically a trailing newline added by an editor) are
        # not JSON documents; skip them instead of raising JSONDecodeError.
        return [json.loads(line) for line in f if line.strip()]

In [11]:
# Section slug (appended to the site root to build the section URL) mapped to
# the Russian topic label that is stored on every article from that section.
ria_topics = {
    "economy": "Экономика",
    "society": "Общество/Россия",
    "science": "Наука и техника",
    "defense_safety": "Силовые структуры",
    "tourism_news": "Туризм/Путешествия",
}

In [12]:
@dataclass
class Article:
    """Fields collected for one scraped news article.

    Every field defaults to ``None`` so that an empty instance can be created
    first and then filled in attribute by attribute.
    """

    id: Optional[str] = None        # numeric article id taken from the URL
    url: Optional[str] = None       # address of the article page
    title: Optional[str] = None
    subtitle: Optional[str] = None
    topic: Optional[str] = None     # topic label (a value of ``ria_topics``)
    content: Optional[str] = None   # text of the article body
    # Publication date/time exactly as shown on the page (kept as text).
    # NOTE: inside this class body the name shadows the imported ``datetime``.
    datetime: Optional[str] = None

In [5]:
# Build the Chrome options and start the shared browser session that the
# scraping functions below use through the module-level ``driver``.
chrome_options = webdriver.ChromeOptions()
for _chrome_flag in (
    "--blink-settings=imagesEnabled=false",  # do not load images
    "headless",
    "no-sandbox",
    "disable-dev-shm-usage",
):
    chrome_options.add_argument(_chrome_flag)
driver = webdriver.Chrome(options=chrome_options)

In [15]:
# Scraper configuration.
BASE_URL = "https://ria.ru"  # site root; section slugs are appended to it
work_path = Path()           # current working directory; outputs are saved below it

# Date bounds; not referenced by the cells visible in this notebook excerpt.
start_date = datetime(year=2023, month=1, day=1)
today = datetime.now()

In [16]:
def get_topic_html(BASE_URL, topic, step=None, n_scrolls=1500, scroll_pause=0.15):
    """Open a section page, expand its feed and return the list-item elements.

    The page ``BASE_URL/topic`` is opened in the module-level ``driver``, the
    ``list-more`` element is clicked once, and the window is then scrolled
    ``n_scrolls`` times to a point 1200px above the bottom of the document
    (per the original author's note, this makes the site load further
    articles automatically). Finally the rendered HTML is parsed and every
    ``div.list-item`` inside the schema.org ``ItemList`` container is returned.

    Collection is best effort: any failure is reported with ``print`` and an
    empty list is returned instead of raising.

    Args:
        BASE_URL: Site root without a trailing slash.
        topic: Section slug appended to ``BASE_URL``.
        step: Unused. Optional so the function can be called with two
            arguments, which is how the notebook calls it.
        n_scrolls: Number of scroll iterations (default 1500).
        scroll_pause: Seconds to wait after each scroll (default 0.15).

    Returns:
        A list of BeautifulSoup elements, one per ``div.list-item`` found;
        empty when the page could not be collected.
    """
    news = []
    url = BASE_URL + '/' + topic
    try:
        driver.get(url)
        time.sleep(2)

        # Click the "more" button once (original note: adds the next 20
        # articles to the list) before relying on scroll-triggered loading.
        driver.execute_script(
            "document.getElementsByClassName('list-more')[0].click()"
        )
        time.sleep(1)

        # Scroll repeatedly so the page keeps loading more articles.
        for _ in tqdm(range(n_scrolls), leave=False):
            try:
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight - 1200)"
                )
                time.sleep(scroll_pause)
            except Exception:
                # A single failed scroll is harmless; keep going. Catching
                # Exception (not a bare except) keeps Ctrl-C / kernel
                # interrupt working during this long loop.
                pass

        # Parse the fully expanded page and collect the article list items.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        scope = soup.find(
            "div", {"class": "list", "itemtype": "http://schema.org/ItemList"}
        )
        if scope is None:
            print(f"get_topic_html: no article list found on {url}")
        else:
            news.extend(scope.find_all("div", {"class": "list-item"}))
    except Exception as exc:
        # Best effort by design, but report the failure instead of hiding it.
        print(f"get_topic_html: failed to collect {url}: {exc!r}")
    return news

In [17]:
def _article_id_from_url(url):
    """Return the run of digits directly before ``.html`` in ``url``, or None."""
    match = re.search(r"(\d+)\.html", url)
    return match.group(1) if match else None


def _source_from_url(url):
    """Return the first host label of an http(s) URL (``ria`` for ria.ru), or None."""
    match = re.match(r"https?://([^./]+)", url)
    return match.group(1) if match else None


def _find_article_node(soup, source, article_id):
    """Find the article container ``div``: text-article markup first, then video."""
    for kind in ("article", "video"):
        marker = f"article m-{kind} m-{source}"
        node = soup.find(
            "div",
            {
                "class": lambda value, marker=marker: bool(value) and marker in value,
                "data-article-id": article_id,
            },
        )
        if node is not None:
            return node
    return None


def _text_or_empty(node):
    """Return ``node.text``, or an empty string when the element is missing."""
    return node.text if node is not None else ""


async def parse_page(page, topic):
    """Open the article behind a list item and extract its fields.

    The function is a coroutine because the notebook awaits it, but it
    contains no ``await``: the page load (``driver.get``) and the one-second
    pause block until they finish.

    Args:
        page: List-item element (as returned by ``get_topic_html``) that
            contains an ``a.list-item__image`` link to the article.
        topic: Key of ``ria_topics``; the mapped label is stored on the result.

    Returns:
        A filled ``Article``, or ``None`` when the list item has no usable
        link, the URL has no ``<digits>.html`` id or no http(s) host, or the
        loaded page lacks the article container or body. Title, subtitle and
        datetime fall back to an empty string when their element is missing.

    Raises:
        KeyError: If ``topic`` is not a key of ``ria_topics``.
    """
    # Article URL from the list item.
    link = page.find("a", {"class": "list-item__image"})
    url = link.get("href") if link is not None else None
    if not url:
        print("parse_page: list item has no article link, skipping")
        return None

    # The id and the site name are both needed to locate the article
    # container on the loaded page, so give up early if either is missing.
    article_id = _article_id_from_url(url)
    source = _source_from_url(url)
    if article_id is None or source is None:
        print(f"parse_page: unsupported article URL {url!r}, skipping")
        return None

    article = Article(id=article_id, url=url, topic=ria_topics[topic])

    # Load the article page in the shared browser session.
    driver.get(url)
    time.sleep(1)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    obj = _find_article_node(soup, source, article_id)
    if obj is None:
        print(f"parse_page: no article container found on {url}, skipping")
        return None

    # Title is either a <div> or an <h1> with the same class; prefer the <div>.
    title = obj.find("div", {"class": "article__title"})
    if title is None:
        title = obj.find("h1", {"class": "article__title"})
    article.title = _text_or_empty(title)

    article.subtitle = _text_or_empty(
        obj.find("h1", {"class": "article__second-title"})
    )

    # An article without a body is not worth keeping.
    body = obj.find(
        "div", {"class": "article__body js-mediator-article mia-analytics"}
    )
    if body is None:
        print(f"parse_page: no article body found on {url}, skipping")
        return None
    article.content = body.text

    # Publication date/time is the text of the link inside the info block.
    date_block = obj.find("div", {"class": "article__info-date"})
    date_link = date_block.find("a") if date_block is not None else None
    article.datetime = _text_or_empty(date_link)

    return article

In [None]:
path_to_save = work_path.joinpath('data/ria_news')
for topic, topic_ru in ria_topics.items():
    topic_news = get_topic_html(BASE_URL, topic)
    random.shuffle(topic_news)
    iterator2 = tqdm(topic_news, total=len(topic_news))
    parsed_topic_news = [await parse_page(page, topic) for page in iterator2]
    
    save_jsonl([i.__dict__ for i in parsed_topic_news if i], path_to_save.joinpath('ria_{topic}.jsonl').__str__())
    time.sleep(60)

  0%|          | 0/1500 [00:00<?, ?it/s]