In [3]:
import time, re, json, random
from glob import glob
import pandas as pd
from pathlib import Path
from selenium import webdriver
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from datetime import datetime, timedelta
from dataclasses import dataclass
from nltk import jaccard_distance
from pymystem3.mystem import Mystem
import numpy as np

In [4]:
def save_jsonl(json_list, output_file_path):
    """Write a sequence of JSON-serialisable objects to a JSON Lines file.

    Each element of ``json_list`` is serialised with ``ensure_ascii=False``
    (non-ASCII text is stored verbatim) and written on its own line.
    An existing file at ``output_file_path`` is overwritten.

    NOTE: this helper is defined in several cells of the notebook; the
    definition executed last is the one in effect.

    Args:
        json_list: iterable of objects accepted by ``json.dumps``.
        output_file_path: destination path (``str`` or ``pathlib.Path``).
            Missing parent directories are created.
    """
    # Create the target folder up front so a fresh checkout without the
    # data directories does not fail with FileNotFoundError.
    Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_file_path, 'w', encoding="utf-8") as output_file:
        for sample in json_list:
            output_file.write(json.dumps(sample, ensure_ascii=False) + '\n')

def read_jsonl(read_file_path):
    """Load a JSON Lines file into a list of decoded objects.

    NOTE: this helper is defined in several cells of the notebook; the
    definition executed last is the one in effect.

    Args:
        read_file_path: path of a UTF-8 encoded file with one JSON document
            per line.

    Returns:
        list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(read_file_path, encoding="utf-8") as f:
        # Blank / whitespace-only lines (e.g. a trailing empty line) are
        # skipped instead of being passed to json.loads, which rejects them.
        return [json.loads(line) for line in f if line.strip()]

# Parsing

## ria_news

In [None]:
def save_jsonl(json_list, output_file_path):
    """Write a sequence of JSON-serialisable objects to a JSON Lines file.

    Each element of ``json_list`` is serialised with ``ensure_ascii=False``
    (non-ASCII text is stored verbatim) and written on its own line.
    An existing file at ``output_file_path`` is overwritten.

    NOTE: this helper is defined in several cells of the notebook; the
    definition executed last is the one in effect.

    Args:
        json_list: iterable of objects accepted by ``json.dumps``.
        output_file_path: destination path (``str`` or ``pathlib.Path``).
            Missing parent directories are created.
    """
    # Create the target folder up front so a fresh checkout without the
    # data directories does not fail with FileNotFoundError.
    Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_file_path, 'w', encoding="utf-8") as output_file:
        for sample in json_list:
            output_file.write(json.dumps(sample, ensure_ascii=False) + '\n')

def read_jsonl(read_file_path):
    """Load a JSON Lines file into a list of decoded objects.

    NOTE: this helper is defined in several cells of the notebook; the
    definition executed last is the one in effect.

    Args:
        read_file_path: path of a UTF-8 encoded file with one JSON document
            per line.

    Returns:
        list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(read_file_path, encoding="utf-8") as f:
        # Blank / whitespace-only lines (e.g. a trailing empty line) are
        # skipped instead of being passed to json.loads, which rejects them.
        return [json.loads(line) for line in f if line.strip()]

In [None]:
# ria.ru section slug (appended to the site URL when a topic page is opened)
# -> topic label stored on every article scraped from that section.
ria_topics = dict(
    economy='Экономика',
    society='Общество/Россия',
    science='Наука и техника',
    defense_safety='Силовые структуры',
    tourism_news='Туризм/Путешествия',
)

In [None]:
@dataclass
class Article:
    """Container for the fields extracted from a single news article.

    Every field defaults to ``None``, so an empty instance can be created
    first and filled in attribute by attribute while a page is parsed.
    """
    id: str = None  # article identifier
    url: str = None  # link to the article page
    title: str = None  # headline text
    subtitle: str = None  # secondary headline text
    topic: str = None  # topic label
    content: str = None  # article body text
    datetime: str = None  # publication date/time, kept as a string

In [None]:
# Chrome session shared (as the global `driver`) by the scraping code.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in (
    "--blink-settings=imagesEnabled=false",  # do not load images
    "headless",
    "no-sandbox",
    "disable-dev-shm-usage",
):
    chrome_options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=chrome_options)

In [None]:
# Scraper configuration.
BASE_URL = "https://ria.ru"                       # site root; topic slugs are appended to it
today = datetime.now()                            # naive local timestamp of this run
start_date = datetime(year=2023, month=1, day=1)
work_path = Path()                                # current working directory ('.')

In [None]:
def get_topic_html(BASE_URL, topic, step=None):
    """Collect the article preview blocks listed on a topic page.

    Opens ``BASE_URL + '/' + topic`` in the global Selenium ``driver``,
    clicks the first ``list-more`` element once, scrolls the page 1500 times
    so that it keeps loading further articles, and then parses the rendered
    HTML.

    Args:
        BASE_URL: site root, e.g. ``"https://ria.ru"``.
        topic: section slug appended to ``BASE_URL``.
        step: not used by the function; it is optional so that both
            two- and three-argument calls work.

    Returns:
        list of ``div.list-item`` tags found inside the schema.org
        ``ItemList`` container. The list is empty when the page could not
        be processed or the container is missing.
    """
    news = []
    try:
        driver.get(BASE_URL + '/' + topic)
        time.sleep(2)

        # Click the "list-more" control once (presumably appends the next
        # batch of articles); afterwards the page loads more on scroll.
        driver.execute_script(
            "document.getElementsByClassName('list-more')[0].click()"
        )
        time.sleep(1)

        # Scroll close to the bottom repeatedly to trigger automatic loading.
        for _ in tqdm(range(1500), leave=False):
            try:
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight - 1200)"
                )
                time.sleep(0.15)
            except Exception:
                # Best effort: a failed scroll must not abort the collection.
                # `Exception` (not a bare except) keeps Ctrl-C working.
                pass

        soup = BeautifulSoup(driver.page_source, "html.parser")
        scope = soup.find(
            "div", {"class": "list", "itemtype": "http://schema.org/ItemList"}
        )
        if scope is not None:
            news += scope.find_all("div", {"class": "list-item"})
    except Exception as exc:
        # Keep the best-effort contract (return what we have), but say why
        # a topic came back empty instead of failing silently.
        print(f"get_topic_html: failed to collect '{topic}': {exc!r}")
    return news

In [None]:
async def parse_page(page, topic):
    """Download one article and extract its fields into an ``Article``.

    The function is declared ``async`` (callers ``await`` it), but the body
    is blocking: it drives the global Selenium ``driver`` and sleeps.

    Args:
        page: BeautifulSoup tag of one list item; the article link is read
            from its ``a.list-item__image`` element.
        topic: key of ``ria_topics``; the mapped label is stored as
            ``Article.topic``.

    Returns:
        A filled ``Article``, or ``None`` when the link, the numeric id, the
        article container, the body or the date cannot be found. Callers
        are expected to filter out ``None`` results.

    Raises:
        KeyError: if ``topic`` is not a key of ``ria_topics``.
    """
    link = page.find("a", {"class": "list-item__image"})
    if link is None or not link.get("href"):
        return None

    article = Article()
    article.topic = ria_topics[topic]
    article.url = link["href"]

    # Article id = the digits right before ".html" in the url.
    id_match = re.search(r"(\d+)\.html", article.url)
    if id_match is None:
        return None
    article.id = id_match.group(1)

    # Load the article page.
    driver.get(article.url)
    time.sleep(1)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Text between the 8-character scheme prefix and the first dot of the
    # url (assumes an "https://" url — TODO confirm); it is part of the
    # container's CSS class.
    source = article.url[8 : article.url.find(".")]

    def _find_container(kind):
        """Find the article <div> whose class contains the marker for ``kind``."""
        marker = f"article m-{kind} m-{source}"
        return soup.find(
            "div",
            {
                "class": lambda x: x and (x.find(marker) > -1),
                "data-article-id": article.id,
            },
        )

    # Regular articles first, video articles as a fallback.
    obj = _find_container("article") or _find_container("video")
    if not obj:
        return None

    # Title is either a <div> or an <h1> with the same class.
    title = obj.find("div", {"class": "article__title"}) or obj.find(
        "h1", {"class": "article__title"}
    )
    article.title = title.text if title else ""

    subtitle = obj.find("h1", {"class": "article__second-title"})
    article.subtitle = subtitle.text if subtitle else ""

    body = obj.find(
        "div", {"class": "article__body js-mediator-article mia-analytics"}
    )
    date_box = obj.find("div", {"class": "article__info-date"})
    date_link = date_box.find("a") if date_box else None
    if body is None or date_link is None:
        # Without text or date the record is unusable; skip it instead of
        # raising and aborting the whole topic.
        return None

    article.content = body.text
    article.datetime = date_link.text

    # The number of views is not collected.
    return article

In [None]:
path_to_save = work_path.joinpath('data/ria_news')
for topic, topic_ru in ria_topics.items():
    topic_news = get_topic_html(BASE_URL, topic)
    random.shuffle(topic_news)
    iterator2 = tqdm(topic_news, total=len(topic_news))
    parsed_topic_news = [await parse_page(page, topic) for page in iterator2]
    
    save_jsonl([i.__dict__ for i in parsed_topic_news if i], path_to_save.joinpath('ria_{topic}.jsonl').__str__())
    time.sleep(60)

  0%|          | 0/1500 [00:00<?, ?it/s]

## lenta_news

In [None]:
def save_jsonl(json_list, output_file_path):
    """Write a sequence of JSON-serialisable objects to a JSON Lines file.

    Each element of ``json_list`` is serialised with ``ensure_ascii=False``
    (non-ASCII text is stored verbatim) and written on its own line.
    An existing file at ``output_file_path`` is overwritten.

    NOTE: this helper is defined in several cells of the notebook; the
    definition executed last is the one in effect.

    Args:
        json_list: iterable of objects accepted by ``json.dumps``.
        output_file_path: destination path (``str`` or ``pathlib.Path``).
            Missing parent directories are created.
    """
    # Create the target folder up front so a fresh checkout without the
    # data directories does not fail with FileNotFoundError.
    Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_file_path, 'w', encoding="utf-8") as output_file:
        for sample in json_list:
            output_file.write(json.dumps(sample, ensure_ascii=False) + '\n')

def read_jsonl(read_file_path):
    """Load a JSON Lines file into a list of decoded objects.

    NOTE: this helper is defined in several cells of the notebook; the
    definition executed last is the one in effect.

    Args:
        read_file_path: path of a UTF-8 encoded file with one JSON document
            per line.

    Returns:
        list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(read_file_path, encoding="utf-8") as f:
        # Blank / whitespace-only lines (e.g. a trailing empty line) are
        # skipped instead of being passed to json.loads, which rejects them.
        return [json.loads(line) for line in f if line.strip()]

In [None]:
# lenta.ru rubric id (the `bloc` search parameter) -> topic label.
# Insertion order is kept: it is the order in which rubrics are iterated.
lenta_topics = dict([
    (0, 'Общество/Россия'),
    (4, 'Экономика'),
    (37, 'Силовые структуры'),
    (3, 'Бывший СССР'),
    (8, 'Спорт'),
    (87, 'Забота о себе'),
    (48, 'Туризм/Путешествия'),
    (5, 'Наука и техника'),
])
work_path = Path()  # current working directory ('.')

In [None]:
class lentaRu_parser:
    """Client for the lenta.ru search endpoint ``/search/v2/process``.

    Relies on names defined elsewhere in the notebook: ``lenta_topics``,
    ``work_path``, ``save_jsonl`` and an HTTP module bound to ``rq``.
    """

    def __init__(self):
        pass

    @staticmethod
    def _safe_filename_part(name) -> str:
        """Replace path separators in ``name`` so it fits into a file name."""
        return str(name).replace('/', '_').replace('\\', '_')

    def _get_url(self, param_dict: dict) -> str:
        """Build the search URL that returns the JSON table of articles.

        Keys read from ``param_dict`` and the query-string field they fill:

        * ``from`` - offset of the first result
        * ``size`` - number of articles
        * ``sort`` - 2 = by date, 1 = by relevance
        * ``title_only`` - exact phrase in the title
        * ``domain`` - meaning unknown
        * ``type`` - kind of material: 0 = all materials, 1 = news;
          left out of the URL when 0
        * ``bloc`` - rubric: e.g. 4 = economy, 0 = all rubrics;
          left out of the URL when 0
        * ``dateFrom`` / ``dateTo`` - date range, ``yyyy-MM-dd``
        * ``query`` - search query (inserted as is, not URL-encoded)
        """
        has_type = int(param_dict['type']) != 0
        has_bloc = int(param_dict['bloc']) != 0

        parts = [
            'from={}'.format(param_dict['from']),
            'size={}'.format(param_dict['size']),
            'sort={}'.format(param_dict['sort']),
            'title_only={}'.format(param_dict['title_only']),
            'domain={}'.format(param_dict['domain']),
            'modified%2Cformat=yyyy-MM-dd',
        ]
        if has_type:
            parts.append('type={}'.format(param_dict['type']))
        if has_bloc:
            parts.append('bloc={}'.format(param_dict['bloc']))
        parts += [
            'modified%2Cfrom={}'.format(param_dict['dateFrom']),
            'modified%2Cto={}'.format(param_dict['dateTo']),
            'query={}'.format(param_dict['query']),
        ]
        return 'https://lenta.ru/search/v2/process?' + '&'.join(parts)

    def _get_search_table(self, param_dict: dict) -> list:
        """Run one search request and return the found articles.

        Returns:
            list of dicts with the keys ``id``, ``url``, ``title``,
            ``subtitle`` (always ``None``), ``topic``, ``content`` and
            ``datetime`` (formatted as ``'%H:%M %d.%m.%Y'``).

        Raises:
            KeyError: if a match lacks an expected field or its ``bloc`` is
                not a key of ``lenta_topics``.
        """
        url = self._get_url(param_dict)
        # NOTE(review): `rq` is not imported in the visible cells; presumably
        # it is `import requests as rq` - confirm and add it to the imports.
        r = rq.get(url)
        return [
            {
                'id': match['docid'],
                'url': match['url'],
                'title': match['title'],
                'subtitle': None,
                'topic': lenta_topics[match['bloc']],
                'content': match['text'],
                # fromtimestamp() converts to the machine's local time zone.
                'datetime': datetime.fromtimestamp(match['modified']).strftime('%H:%M %d.%m.%Y'),
            }
            for match in r.json()['matches']
        ]

    def get_articles(self,
                     param_dict,
                     time_step = 37,):
        """Download articles window by window and save them as JSON Lines.

        The range ``dateFrom``..``dateTo`` is queried in windows of
        ``time_step`` days. After each window the start date moves forward
        by ``time_step + param_dict['timedelta_step']`` days.
        NOTE(review): with ``timedelta_step`` > 1 consecutive windows do not
        touch, so some days may never be requested - confirm this is intended.

        Args:
            param_dict: search parameters (see ``_get_url``) plus
                ``timedelta_step``. The dict itself is not modified.
            time_step: window length in days.

        Returns:
            list of article dicts from all windows. The same list is written
            to ``data/lenta/lenta_<topic>.jsonl`` under ``work_path``, where
            ``<topic>`` is the ``lenta_topics`` label of ``param_dict['bloc']``
            with path separators replaced by ``_``.

        Raises:
            ValueError: if ``dateFrom`` is later than ``dateTo``.
        """
        param_copy = param_dict.copy()
        timedelta_s = param_copy.pop('timedelta_step')
        window = timedelta(days=time_step)
        advance = window + timedelta(days=timedelta_s)
        dateFrom = datetime.strptime(param_copy['dateFrom'], '%Y-%m-%d')
        dateTo = datetime.strptime(param_copy['dateTo'], '%Y-%m-%d')
        if dateFrom > dateTo:
            raise ValueError('dateFrom should be less than dateTo')

        out = []
        while dateFrom <= dateTo:
            # The last window is clipped to the requested end date.
            window_end = min(dateFrom + window, dateTo)
            param_copy['dateTo'] = window_end.strftime('%Y-%m-%d')
            print('Parsing articles from ' + param_copy['dateFrom'] +  ' to ' + param_copy['dateTo'])
            out += self._get_search_table(param_copy)
            dateFrom += advance
            param_copy['dateFrom'] = dateFrom.strftime('%Y-%m-%d')

        # Topic labels may contain '/', which would otherwise be read as a
        # directory separator and point to a folder that does not exist.
        topic_name = lenta_topics[int(param_dict["bloc"])]
        out_dir = work_path.joinpath('data/lenta')
        out_dir.mkdir(parents=True, exist_ok=True)
        out_file = out_dir.joinpath(f'lenta_{self._safe_filename_part(topic_name)}.jsonl')
        save_jsonl(out, str(out_file))
        return out

In [None]:
# Search parameters are set here
query = 'РБК'
offset, size = 0, 100000
sort, title_only, domain, material = "3", "0", "1", "0"
bloc = "0"  # topic = news rubric
dateFrom, dateTo = '2023-01-01', "2024-12-23"
timedelta_step = 3

# Request parameters in the shape the lenta.ru parser expects;
# offset and size are passed as strings.
param_dict = dict([
    ('query', query),
    ('timedelta_step', timedelta_step),
    ('from', str(offset)),
    ('size', str(size)),
    ('dateFrom', dateFrom),
    ('dateTo', dateTo),
    ('sort', sort),
    ('title_only', title_only),
    ('type', material),
    ('bloc', bloc),
    ('domain', domain),
])

In [None]:
# Download every configured lenta.ru rubric in turn, in 180-day windows.
for bloc_topic, topic_label in lenta_topics.items():
    param_dict['bloc'] = bloc_topic  # select the rubric for this run
    parser = lentaRu_parser()
    tbl = parser.get_articles(param_dict=param_dict, time_step=180)
    print(f'DONE {topic_label} = {len(tbl)}')

# Preprocessing and EDA

In [5]:
# Root folder for the data files used below. For the relative path '.',
# pathlib's `.parent` is '.' again, so this is the current working directory.
work_path = Path('.').parent

In [None]:
# Every JSON Lines file exactly one folder below data/ (data/<dir>/<file>.jsonl).
# glob() returns paths in arbitrary order; sorting makes the load order - and
# therefore which of several duplicated rows is kept later - reproducible.
all_df_path = sorted(glob(str(work_path.joinpath('data', '*/*.jsonl'))))

In [None]:
# Load every file into its own frame, stack them, then drop rows that repeat
# an url and, after that, rows that repeat a content text.
source_frames = [pd.DataFrame(read_jsonl(jsonl_path)) for jsonl_path in all_df_path]
total_df = (
    pd.concat(source_frames)
    .drop_duplicates(subset=['url'])
    .drop_duplicates(subset=['content'])
    .reset_index(drop=True)
)

In [None]:
# Number of articles per topic (count of non-null `id` values in each group).
total_df.groupby('topic').agg({'id': 'count'})

Unnamed: 0_level_0,id
topic,Unnamed: 1_level_1
Бывший СССР,634
Забота о себе,4
Наука и техника,2711
Общество,625
Общество/Россия,5346
Силовые структуры,5438
Спорт,103
Туризм,121
Туризм/Путешествия,2907
Экономика,8026


In [None]:
# Merge topic labels: old label -> label it is folded into.
topic_aliases = {
    'Общество': 'Общество/Россия',
    'Забота о себе': 'Спорт',
    'Туризм': 'Туризм/Путешествия',
}
for old_label, new_label in topic_aliases.items():
    total_df.loc[total_df['topic'] == old_label, 'topic'] = new_label

In [None]:
# Number of articles per topic (count of non-null `id` values in each group).
total_df.groupby('topic').agg({'id': 'count'})

Unnamed: 0_level_0,id
topic,Unnamed: 1_level_1
Бывший СССР,634
Наука и техника,2711
Общество/Россия,5971
Силовые структуры,5438
Спорт,107
Туризм/Путешествия,3028
Экономика,8026


In [None]:
# Share of each topic: per-topic article count divided by the number of rows.
total_df.groupby('topic').agg({'id': 'count'}) / len(total_df)

Unnamed: 0_level_0,id
topic,Unnamed: 1_level_1
Бывший СССР,0.024465
Наука и техника,0.104611
Общество/Россия,0.230407
Силовые структуры,0.20984
Спорт,0.004129
Туризм/Путешествия,0.116844
Экономика,0.309705


In [None]:
# Number of rows currently in total_df.
len(total_df)

25915

In [None]:
# Distribution of article length, measured in whitespace-separated words.
total_df.content.map(lambda x: len(x.split())).describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])

count    25915.000000
mean       251.835346
std        277.684801
min          0.000000
5%          74.000000
25%        125.000000
50%        182.000000
75%        270.000000
95%        696.300000
max       8099.000000
Name: content, dtype: float64

In [None]:
# Keep articles strictly longer than 74 and strictly shorter than 696 words;
# the bounds correspond to the 5% / 95% percentiles shown by describe() above.
content_size = total_df['content'].map(lambda text: len(text.split()))
keep_mask = content_size.gt(74) & content_size.lt(696)
total_df = total_df.loc[keep_mask].reset_index(drop=True)


In [None]:
# Number of articles per topic (count of non-null `id` values in each group).
total_df.groupby('topic').agg({'id': 'count'})

Unnamed: 0_level_0,id
topic,Unnamed: 1_level_1
Бывший СССР,621
Наука и техника,2186
Общество/Россия,5484
Силовые структуры,4509
Спорт,105
Туризм/Путешествия,2876
Экономика,7528


In [None]:
# Share of each topic: per-topic article count divided by the number of rows.
total_df.groupby('topic').agg({'id': 'count'}) / len(total_df)

Unnamed: 0_level_0,id
topic,Unnamed: 1_level_1
Бывший СССР,0.026642
Наука и техника,0.093784
Общество/Россия,0.235274
Силовые структуры,0.193445
Спорт,0.004505
Туризм/Путешествия,0.123386
Экономика,0.322965


In [None]:
# Mystem instance used below to lemmatize the article texts.
lemmatizer = Mystem()

In [None]:
def list_jaccard_score(target, texts):
    """Jaccard distance between ``target`` and every item of ``texts``.

    Both sides are converted to sets before being compared with
    ``jaccard_distance``. Despite the name, the values are distances.

    Args:
        target: iterable of hashable tokens.
        texts: iterable of token iterables to compare against ``target``.

    Returns:
        list with one distance per item of ``texts``, in the same order.
    """
    # The target set is the same for every comparison - build it once.
    target_set = set(target)
    return [jaccard_distance(target_set, set(text)) for text in texts]

In [None]:
def jaccard_duplicate(seq_of_lem_text, threshold=0.15):
    """Flag near-duplicate texts by pairwise Jaccard distance.

    Texts are visited in order. Every text that is not already flagged is
    compared with all texts after it; those at a Jaccard distance of at most
    ``threshold`` are flagged. Flagged texts are not used as a reference.

    Args:
        seq_of_lem_text: sequence of token sequences (one per text).
        threshold: maximum Jaccard distance at which a later text counts as
            a duplicate of an earlier one.

    Returns:
        Boolean ``numpy`` array of the same length as the input; ``True``
        marks a text flagged as a duplicate of an earlier text.
    """
    # Convert each text to a set once; rebuilding the sets for every pair
    # would repeat the same work O(n^2) times.
    token_sets = [set(tokens) for tokens in seq_of_lem_text]
    is_duplicate = np.zeros(len(token_sets), dtype=bool)
    for n, current in tqdm(enumerate(token_sets), total=len(token_sets)):
        if is_duplicate[n]:
            continue
        distances = np.array(
            [jaccard_distance(current, other) for other in token_sets[n + 1:]]
        )
        # Logical OR keeps flags that were set by earlier reference texts.
        is_duplicate[n + 1:] |= distances <= threshold
    return is_duplicate
        

In [None]:
# Run every article body through the lemmatizer, keeping the input order.
texts = total_df.content
iterator = tqdm(texts, total=len(texts))
lem_texts = [lemmatizer.lemmatize(text) for text in iterator]

  0%|          | 0/23309 [00:00<?, ?it/s]

In [None]:
# One flag per text: whether it was marked as a near-duplicate of an earlier text.
duplicated = jaccard_duplicate(lem_texts)

  0%|          | 0/23309 [00:00<?, ?it/s]

In [None]:
# Write the rows that were not flagged as duplicates to train/full_data.csv,
# located in the parent of work_path's absolute directory.
output_csv = work_path.absolute().parent.joinpath('train/full_data.csv')
total_df[~duplicated].to_csv(str(output_csv), index=False)