In [1]:
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from datetime import datetime
import time
from datetime import timedelta

from dataclasses import dataclass
import re

In [2]:
def save_jsonl(json_list, output_file_path):
    """Append records to a JSON Lines file, one JSON document per line.

    Args:
        json_list: iterable of JSON-serialisable objects (e.g. dicts).
        output_file_path: destination file (str or path-like). The file is
            opened in append mode, so repeated calls accumulate lines.
            Missing parent directories are created.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) in UTF-8.
    """
    # Imported here because the notebook only imports `json` in a much later
    # cell; the function should not depend on that cell having been run.
    import json
    import os

    parent_dir = os.path.dirname(os.fspath(output_file_path))
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file_path, 'a', encoding="utf-8") as output_file:
        for sample in json_list:
            output_file.write(json.dumps(sample, ensure_ascii=False) + '\n')

In [3]:
# Unified topic label -> integer class id (ids follow the listing order).
# Reference sections noted for some topics:
#   'Силовые структуры' - https://russian.rt.com/trend/334946-armiya
#   'Бывший СССР'       - https://russian.rt.com/ussr/news, https://lenta.ru/rubrics/ussr
#   'Спорт'             - https://russian.rt.com/sport/news
#   'Наука и техника'   - https://russian.rt.com/science/news
general_topics = dict(
    (topic_name, class_id)
    for class_id, topic_name in enumerate((
        'Общество/Россия',
        'Экономика',
        'Силовые структуры',
        'Бывший СССР',
        'Спорт',
        'Забота о себе',
        'Строительство',
        'Туризм/Путешествия',
        'Наука и техника',
    ))
)

In [4]:
# RIA section slug (used as a URL path segment) -> unified topic label.
# Only the sections listed here are active. Switched off for now:
#   "economy" -> 'Экономика', "society" -> 'Общество/Россия',
#   "science" -> 'Наука и техника', plus "incidents" and "culture" (no label).
ria_topics = dict(
    defense_safety='Силовые структуры',
    tourism='Туризм/Путешествия',
)

In [5]:
@dataclass
class Article:
    """Container for the fields extracted from one news article.

    Every field defaults to ``None`` so an empty instance can be created
    first and filled in attribute by attribute while a page is parsed.
    """
    # Digits that precede ".html" in the article URL.
    id: str = None
    # Link taken from the article teaser on the archive page.
    url: str = None
    # Headline text; the parser stores "" when no title element is found.
    title: str = None
    # Second headline; the parser stores "" when the page has none.
    subtitle: str = None
    # Unified topic label looked up from the section slug.
    topic: str = None
    # Text of the article body element.
    content: str = None
    # Publication date/time exactly as displayed on the page (kept as text).
    datetime: str = None

In [6]:
# One shared Chrome session, configured through command-line switches, is used
# for every page load in this notebook.
chrome_options = webdriver.ChromeOptions()
for _chrome_switch in (
    "--blink-settings=imagesEnabled=false",  # image loading turned off
    "headless",
    "no-sandbox",
    "disable-dev-shm-usage",
):
    chrome_options.add_argument(_chrome_switch)
driver = webdriver.Chrome(options=chrome_options)

## ria.ru

In [7]:
# Site root and the date window (start_date .. today) covered by the archive crawl.
BASE_URL = "https://ria.ru"
start_date = datetime(year=2023, month=6, day=1)
today = datetime.today()

In [8]:
def get_topic_html(BASE_URL, topic, step):
    """Collect article teaser elements from a topic's dated archive pages.

    Visits ``{BASE_URL}/{topic}/{YYYYMMDD}`` for every ``step``-th day between
    the module-level ``start_date`` and ``today`` using the shared Selenium
    ``driver``, expands the list, and gathers every ``div.list-item`` inside
    the schema.org ItemList container.

    Args:
        BASE_URL: site root, e.g. ``"https://ria.ru"``.
        topic: section slug used as the URL path segment.
        step: number of days between two visited archive dates.

    Returns:
        list of BeautifulSoup elements, one per article teaser. Dates whose
        page could not be loaded or parsed are skipped; the running count of
        skipped dates is shown in the progress-bar description.
    """
    total_steps = (today - start_date).days // step
    skipped_dates = 0
    news = []
    iterator = tqdm(range(total_steps), total=total_steps)
    for step_index in iterator:
        # The date is derived from the loop index, so a failure on one date
        # can never stall the walk on that same date for the rest of the loop.
        archive_date = start_date + timedelta(days=step * step_index)
        archive_url = f'{BASE_URL}/{topic}/{archive_date.strftime("%Y%m%d")}'
        try:
            driver.get(archive_url)

            # The "more" button is absent on short pages; a failed click must
            # not discard the articles that are already on the page.
            try:
                driver.execute_script(
                    "document.getElementsByClassName('list-more')[0].click()"
                )
            except Exception:
                pass

            # scroll page to automatically load more articles
            for _scroll in range(10):
                try:
                    driver.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight - 1200)"
                    )
                    time.sleep(0.25)
                except Exception:
                    pass

            soup = BeautifulSoup(driver.page_source, "html.parser")
            scope = soup.find(
                "div", {"class": "list", "itemtype": "http://schema.org/ItemList"}
            )
            if scope is None:
                raise LookupError(f"no article list found at {archive_url}")
            news += scope.find_all("div", {"class": "list-item"})
        except Exception:
            # Best-effort crawl: count the date as skipped and move on.
            # KeyboardInterrupt is deliberately not caught here.
            skipped_dates += 1
            iterator.set_description(f'skip_dates={skipped_dates}')
    return news

In [9]:
async def parse_page(page, topic):
    """Load one article in the shared browser and extract its fields.

    Declared ``async`` because callers ``await`` it, but the body contains no
    ``await``: the page load and the one-second pause run synchronously.

    Args:
        page: BeautifulSoup element of an archive teaser; it must contain an
            ``a.list-item__image`` link pointing at the article.
        topic: section slug, a key of the module-level ``ria_topics``.

    Returns:
        A populated ``Article``, or ``None`` when any step of the extraction
        fails (unknown topic, missing link, unexpected page layout, ...).
    """
    try:
        # Create article data class object
        article = Article()
        article.topic = ria_topics[topic]

        # article url
        article.url = page.find("a", {"class": "list-item__image"})["href"]

        # article id: the digits right before a literal ".html"
        article.id = re.search(r"(\d+)\.html", article.url).group(1)

        # load page
        driver.get(article.url)
        time.sleep(1)
        html = driver.page_source

        # article source: text between the 8-character scheme prefix and the
        # first dot of the URL
        source = article.url[8 : article.url.find(".")]

        # article object: try the regular article container first, then the
        # video variant
        soup = BeautifulSoup(html, "html.parser")
        obj = None
        for container_kind in ("article", "video"):
            class_marker = f"article m-{container_kind} m-{source}"
            obj = soup.find(
                "div",
                {
                    "class": lambda x, marker=class_marker: x and (marker in x),
                    "data-article-id": article.id,
                },
            )
            if obj is not None:
                break

        # process article title (either a div or an h1 carries it)
        title = obj.find("div", {"class": "article__title"})
        if not title:
            title = obj.find("h1", {"class": "article__title"})
        article.title = title.text if title else ""

        # article subtitle
        subtitle = obj.find("h1", {"class": "article__second-title"})
        article.subtitle = subtitle.text if subtitle else ""

        # article content
        article.content = obj.find(
            "div", {"class": "article__body js-mediator-article mia-analytics"}
        ).text

        # article datetime
        article.datetime = obj.find("div", {"class": "article__info-date"}).find("a").text

        return article
    except Exception:
        # Best-effort scraping: a page that cannot be parsed yields None.
        # KeyboardInterrupt / task cancellation are intentionally not caught.
        return None

In [10]:
# RIA section slug -> unified topic label; the commented-out entries are
# sections that are currently not scraped.
# NOTE(review): this re-binds `ria_topics` to the same content as the
# definition earlier in the notebook — one of the two cells looks redundant.
ria_topics = {
    #"economy": 'Экономика', 
    #"society" : 'Общество/Россия', 
    #"incidents", 
    #"science": 'Наука и техника', 
    #"culture", 
    "defense_safety": 'Силовые структуры', 
    "tourism": 'Туризм/Путешествия'
    }

In [11]:
import json
import random

In [None]:
# Directory prefix for the per-topic output files (string concatenation is
# used below, so the trailing slash matters).
# NOTE(review): absolute, machine-specific path — TODO make it configurable.
path_to_save = '/workspaces/ML_HSE/HW3/parsing/data/ria_news/'
for topic, topic_ru in ria_topics.items():
    # Collect teaser elements from the topic archive, visiting every 5th day.
    topic_news = get_topic_html(BASE_URL, topic, 5)
    # In-place shuffle; no seed is set in this cell, so the order varies per run.
    random.shuffle(topic_news)
    iterator2 = tqdm(topic_news, total=len(topic_news))
    # Top-level `await`: valid in a notebook cell, not in a plain Python script.
    parsed_topic_news = [await parse_page(page, topic) for page in iterator2]
    
    # parse_page returns None on failure, so falsy entries are dropped.
    # save_jsonl appends: re-running this cell adds to ria_<topic>.jsonl.
    save_jsonl([i.__dict__ for i in parsed_topic_news if i], path_to_save+f'ria_{topic}.jsonl')

  0%|          | 0/114 [00:00<?, ?it/s]

  0%|          | 0/3063 [00:00<?, ?it/s]

##  Lenta.ru

In [23]:
import requests as rq
import json

In [24]:
# NOTE(review): re-declares `Article` with exactly the same fields as the
# definition earlier in this notebook. The Lenta code visible in this notebook
# builds plain dicts with these same keys rather than Article instances.
@dataclass
class Article:
    """Container for the fields of one news article; all default to ``None``."""
    # Article identifier.
    id: str = None
    # Article link.
    url: str = None
    # Headline text.
    title: str = None
    # Secondary headline, if any.
    subtitle: str = None
    # Topic label.
    topic: str = None
    # Article body text.
    content: str = None
    # Publication date/time kept as text.
    datetime: str = None

In [25]:
# Lenta.ru rubric id (the `bloc` search parameter) -> topic label.
# Insertion order is the order in which rubrics are downloaded.
lenta_topics = dict([
    (1, 'Общество'),
    (4, 'Экономика'),
    (37, 'Силовые структуры'),
    (3, 'Бывший СССР'),
    (8, 'Спорт'),
    (87, 'Забота о себе'),
    (48, 'Туризм'),
    (5, 'Наука и техника'),
])

In [33]:
class lentaRu_parser:
    """Downloader for article listings from the Lenta.ru search endpoint
    (``https://lenta.ru/search/v2/process``).

    The class keeps no state; all request settings travel in ``param_dict``.
    """

    def __init__(self):
        pass

    def _get_url(self, param_dict: dict) -> str:
        """Build the search URL for a single request.

        Query-string layout (example values)::

            from=0                         offset into the result list
            size=1000                      number of articles
            sort=2                         by date (2) or by relevance (1)
            title_only=0                   exact phrase in the title
            domain=1                       (meaning unknown)
            modified%2Cformat=yyyy-MM-dd   date format
            type=1                         material kind: all (0), news (1)
            bloc=4                         rubric: economy (4), all rubrics (0)
            modified%2Cfrom=2020-01-01
            modified%2Cto=2020-11-01
            query=                         search phrase

        ``type`` and ``bloc`` are left out of the URL when their value is 0.
        """
        query_parts = [
            'from={}'.format(param_dict['from']),
            'size={}'.format(param_dict['size']),
            'sort={}'.format(param_dict['sort']),
            'title_only={}'.format(param_dict['title_only']),
            'domain={}'.format(param_dict['domain']),
            'modified%2Cformat=yyyy-MM-dd',
        ]
        if int(param_dict['type']) != 0:
            query_parts.append('type={}'.format(param_dict['type']))
        if int(param_dict['bloc']) != 0:
            query_parts.append('bloc={}'.format(param_dict['bloc']))
        query_parts += [
            'modified%2Cfrom={}'.format(param_dict['dateFrom']),
            'modified%2Cto={}'.format(param_dict['dateTo']),
            'query={}'.format(param_dict['query']),
        ]
        return 'https://lenta.ru/search/v2/process?' + '&'.join(query_parts)

    def _get_search_table(self, param_dict: dict) -> list:
        """Run one search request and return its matches as a list of dicts.

        Each dict has the keys ``id``, ``url``, ``title``, ``subtitle``
        (always ``None``), ``topic``, ``content`` and ``datetime`` (formatted
        as ``HH:MM dd.mm.YYYY`` from the ``modified`` timestamp).

        Raises:
            requests.HTTPError: the server answered with an error status.
            KeyError: a match carries a ``bloc`` id missing from ``lenta_topics``.
        """
        url = self._get_url(param_dict)
        # A timeout keeps a stalled connection from hanging the notebook.
        response = rq.get(url, timeout=60)
        response.raise_for_status()
        return [
            {
                'id': match['docid'],
                'url': match['url'],
                'title': match['title'],
                'subtitle': None,
                'topic': lenta_topics[match['bloc']],
                'content': match['text'],
                'datetime': datetime.fromtimestamp(match['modified']).strftime('%H:%M %d.%m.%Y'),
            }
            for match in response.json()['matches']
        ]

    def get_articles(self,
                     param_dict,
                     time_step = 37,):
        """Download articles window by window and append them to a JSONL file.

        The range ``dateFrom``..``dateTo`` is split into request windows of
        ``time_step`` days (the last one is clipped at ``dateTo``). The next
        window starts ``time_step + timedelta_step`` days after the previous
        window's start. After the last window everything collected is handed
        to ``save_jsonl`` under ``work_path / 'data/lenta'``.

        Args:
            param_dict: request settings. Keys read here: ``dateFrom`` and
                ``dateTo`` (``YYYY-MM-DD``), ``bloc`` (rubric id, must be a key
                of ``lenta_topics``) and optionally ``timedelta_step`` (extra
                days between windows, 0 when absent). The remaining keys
                (``from``, ``size``, ``sort``, ``title_only``, ``domain``,
                ``type``, ``query``) are consumed by ``_get_url``.
                The dict itself is not modified.
            time_step: length of one request window in days.

        Returns:
            list of article dicts gathered from all windows.

        Raises:
            ValueError: ``dateFrom`` is after ``dateTo``, or the window stride
                (``time_step + timedelta_step``) is not positive.
            KeyError: ``bloc`` is not a key of ``lenta_topics``.
        """
        param_copy = param_dict.copy()
        gap_days = param_copy.pop('timedelta_step', 0)
        time_step = timedelta(days=time_step)
        stride = time_step + timedelta(days=gap_days)
        dateFrom = datetime.strptime(param_copy['dateFrom'], '%Y-%m-%d')
        dateTo = datetime.strptime(param_copy['dateTo'], '%Y-%m-%d')
        if dateFrom > dateTo:
            raise ValueError('dateFrom should be less than dateTo')
        if stride <= timedelta(0):
            # A non-positive stride would never advance dateFrom.
            raise ValueError('time_step + timedelta_step should be positive')

        # Resolved before any request so an unknown rubric fails immediately
        # instead of after the whole download.
        topic_label = lenta_topics[int(param_dict["bloc"])]

        out = []
        while dateFrom <= dateTo:
            window_end = min(dateFrom + time_step, dateTo)
            param_copy['dateTo'] = window_end.strftime('%Y-%m-%d')
            print('Parsing articles from ' + param_copy['dateFrom'] +  ' to ' + param_copy['dateTo'])
            out += self._get_search_table(param_copy)
            dateFrom += stride
            param_copy['dateFrom'] = dateFrom.strftime('%Y-%m-%d')

        # `work_path` is a module-level path object defined outside this class.
        save_jsonl(out, work_path.joinpath('data/lenta').joinpath(f'lenta_{topic_label}.jsonl'))
        print('Finish')

        return out


In [34]:
# Search settings for the Lenta.ru download.
# NOTE(review): the query phrase limits results to matches of that phrase, and
# sort "3" is not among the values described in the parser's notes (1 or 2) —
# confirm both are intended.
query = 'РБК'
offset, size = 0, 1000
sort, title_only, domain, material = "3", "0", "1", "0"
bloc = "1"  # rubric id, i.e. the news topic
dateFrom, dateTo = '2023-01-01', "2024-12-23"
timedelta_step = 3

param_dict = {
    'query': query,
    'timedelta_step': timedelta_step,
    'from': str(offset),
    'size': str(size),
    'dateFrom': dateFrom,
    'dateTo': dateTo,
    'sort': sort,
    'title_only': title_only,
    'type': material,
    'bloc': bloc,
    'domain': domain,
}

print("param_dict:", param_dict)

param_dict: {'query': 'РБК', 'timedelta_step': 3, 'from': '0', 'size': '1000', 'dateFrom': '2023-01-01', 'dateTo': '2024-12-23', 'sort': '3', 'title_only': '0', 'type': '0', 'bloc': '1', 'domain': '1'}


In [35]:
from IPython import display

In [37]:
# Download every configured Lenta rubric in turn, using 180-day request windows.
for bloc_topic in lenta_topics.keys():
    # The shared settings dict is updated in place with the current rubric id.
    param_dict['bloc'] = bloc_topic
    parser = lentaRu_parser()
    tbl = parser.get_articles(time_step=180, param_dict=param_dict)
    print(f'DONE {lenta_topics[bloc_topic]} = {len(tbl)}')

Parsing articles from 2023-01-01 to 2023-06-30
Parsing articles from 2023-07-03 to 2023-12-30
Parsing articles from 2024-01-02 to 2024-06-30
Parsing articles from 2024-07-03 to 2024-12-23
Finish
DONE Общество = 625
Parsing articles from 2023-01-01 to 2023-06-30
Parsing articles from 2023-07-03 to 2023-12-30
Parsing articles from 2024-01-02 to 2024-06-30
Parsing articles from 2024-07-03 to 2024-12-23
Finish
DONE Экономика = 1996
Parsing articles from 2023-01-01 to 2023-06-30
Parsing articles from 2023-07-03 to 2023-12-30
Parsing articles from 2024-01-02 to 2024-06-30
Parsing articles from 2024-07-03 to 2024-12-23
Finish
DONE Силовые структуры = 291
Parsing articles from 2023-01-01 to 2023-06-30
Parsing articles from 2023-07-03 to 2023-12-30
Parsing articles from 2024-01-02 to 2024-06-30
Parsing articles from 2024-07-03 to 2024-12-23
Finish
DONE Бывший СССР = 634
Parsing articles from 2023-01-01 to 2023-06-30
Parsing articles from 2023-07-03 to 2023-12-30
Parsing articles from 2024-01-02