## Парсинг данных

### Лента

In [None]:
import asyncio
from datetime import date, timedelta
import time

import aiohttp
import requests
from bs4 import BeautifulSoup

# Number of concurrent text_getter worker coroutines that main() starts.
WORKER_COUNT = 20

def get_cards_urls(url):
    """Fetch one lenta.ru archive page and return the article URLs found on it.

    Args:
        url: URL of an archive listing page.

    Returns:
        list[str]: one absolute URL per ``<a class="card-full-news">`` element
        that carries an ``href``. Empty when the page has no such cards.

    Raises:
        requests.RequestException: on a network failure or timeout.
    """
    from urllib.parse import urljoin

    # Fixed pause before every request so the crawl does not hammer the site.
    time.sleep(0.2)
    # Without a timeout a stalled connection would block the crawl forever.
    res = requests.get(url, timeout=30)
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines.
    page = BeautifulSoup(res.text, "html.parser")
    cards = page.find_all("a", {"class": "card-full-news"})
    hrefs = [card.get("href") for card in cards]
    # urljoin resolves root-relative hrefs without doubling the slash and
    # leaves already-absolute hrefs (links to other sites) intact.
    return [urljoin("https://lenta.ru/", href) for href in hrefs if href]


def get_all_cards_urls(date):
    """Collect article URLs from every archive page of a single day.

    Pages ``/page/1``, ``/page/2``, ... are requested in order until a page
    yields no cards or a request fails.

    Args:
        date: object with ``year``, ``month`` and ``day`` attributes
            (the caller passes a ``datetime.date``).

    Returns:
        list[str]: article URLs gathered from all pages that were fetched
        successfully (possibly empty).
    """
    cards_urls = []
    page_num = 1
    while True:
        url = f"https://lenta.ru/news/{date.year}/{date.month:02d}/{date.day:02d}/page/{page_num}"
        print(url)
        try:
            # Each page is downloaded exactly once.
            urls = get_cards_urls(url)
        except requests.RequestException as exc:
            # Keep what was collected so far instead of losing the whole day.
            print(f"failed to fetch {url}: {exc}")
            break
        if not urls:
            # An empty page marks the end of the day's archive.
            break
        cards_urls.extend(urls)
        page_num += 1

    return cards_urls


async def save_page(url):
    """Download one article and write its URL, rubric, title and text to disk.

    The output file is ``./lentaru/<timestamp>.txt`` and contains four
    newline-separated fields: url, rubric, title, body text. Pages that lack
    the rubric link or the title element are skipped without writing a file.

    Args:
        url: article URL to download.
    """
    import os

    print("save page:", url)

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            html = await response.text()

    # Explicit parser so the result does not depend on which optional
    # parsers are installed.
    page = BeautifulSoup(html, "html.parser")
    topic_tag = page.find("a", {"class": "topic-header__rubric"})
    header_tag = page.find("span", {"class": "topic-body__title"})
    if topic_tag is None or header_tag is None:
        # Without both elements there is no label/title to store.
        print("skip page without rubric/title:", url)
        return

    topic = topic_tag.text
    header = header_tag.text
    paragraphs = page.find_all("p", {"class": "topic-body__content-text"})
    text = " ".join(p.text for p in paragraphs)

    # Make sure the target directory exists; otherwise every write fails.
    os.makedirs("./lentaru", exist_ok=True)
    filename = f"./lentaru/{time.time()}.txt"
    # The context manager closes the file even if the write raises.
    with open(filename, "w", encoding="utf8") as file:
        file.write(f"{url}\n{topic}\n{header}\n{text}")


async def url_getter(queue):
    """Producer: enqueue article URLs day by day, walking backwards in time.

    Starts at 2023-04-04. For each day the URLs are put on ``queue`` and the
    coroutine then waits until the workers have processed all of them before
    moving to the previous day. The loop has no stop condition.

    Args:
        queue: ``asyncio.Queue`` shared with the ``text_getter`` workers.
    """
    d = date(year=2023, month=4, day=4)
    while True:
        print(d)
        try:
            urls = get_all_cards_urls(d)
        except Exception as exc:
            # Start from an empty list so a failed day neither crashes on an
            # undefined name nor re-enqueues the previous day's URLs.
            print(f"failed to collect urls for {d}: {exc}")
            urls = []
        for url in urls:
            queue.put_nowait(url)
        d -= timedelta(days=1)
        # Wait for the workers to drain the queue before fetching more.
        await queue.join()


async def text_getter(queue):
    """Worker: take URLs from ``queue`` forever and save each page.

    A failure on one page is reported and the worker continues with the next
    URL. Task cancellation is not swallowed.

    Args:
        queue: ``asyncio.Queue`` of article URLs.
    """
    while True:
        url = await queue.get()
        try:
            await save_page(url)
        except Exception as exc:
            # `Exception` (not a bare except) lets asyncio.CancelledError and
            # KeyboardInterrupt propagate so the worker can be stopped.
            print(f"failed to save {url}: {exc}")
        finally:
            # Always mark the item done, otherwise queue.join() never returns.
            queue.task_done()


async def main():
    """Run the URL producer together with WORKER_COUNT page-saving workers."""
    queue = asyncio.Queue()
    await asyncio.gather(
        url_getter(queue),
        *(text_getter(queue) for _ in range(WORKER_COUNT)),
    )


# asyncio.run() raises RuntimeError when an event loop is already running in
# the current thread, which is the case inside a Jupyter kernel. In that case
# schedule the crawl on the running loop; as a plain script use asyncio.run().
try:
    running_loop = asyncio.get_running_loop()
except RuntimeError:
    asyncio.run(main())
else:
    # Keep a reference so the task is not garbage-collected while running;
    # call crawl_task.cancel() to stop the crawl.
    crawl_task = running_loop.create_task(main())

### Фонтанка

In [None]:
import requests
import datetime
import json
import time

from bs4 import BeautifulSoup

import os

# Crawl the Fontanka archive one day at a time, walking backwards from this
# date. The outer loop has no stop condition: interrupt the kernel to end it.
date = datetime.date(year=2023, month=12, day=31)

# Create the output directory up front so a scraped day is not lost to a
# FileNotFoundError when its file is written.
os.makedirs("./data", exist_ok=True)


while True:
    date_str = date.strftime("%d.%m.%Y")
    page = 0
    page_urls = []
    print("get page urls")
    while True:
        page += 1
        url = f"https://newsapi.fontanka.ru/v1/public/fontanka/services/archive/?regionId=478&page={page}&pagesize=20&date={date_str}&rubricId=all"
        print(url)
        res = requests.get(url, timeout=30)
        # Parse the response body once and reuse it.
        items = res.json()["data"]

        # Stop on both `None` and an empty list; otherwise an empty page
        # would keep this loop requesting further pages forever.
        if not items:
            break

        for item in items:
            canonical = item["urls"]["urlCanonical"]
            # Keep only articles hosted on the main site.
            if "https://www.fontanka.ru/" in canonical:
                rubrics = [x["name"] for x in item["rubrics"]]
                page_urls.append((canonical, rubrics))

    date -= datetime.timedelta(days=1)

    data = []
    for url, topics in page_urls:
        print("get text:", url)
        try:
            res = requests.get(url, timeout=30)
            soup = BeautifulSoup(res.text, "html.parser")
            ps = soup.find("section", {"itemprop": "articleBody"}).find_all("p")
            text = "".join([x.text for x in ps])
            data.append((text, topics))
        except Exception as exc:
            # Best effort: report the failed article and move on.
            print("skip:", url, exc)

    # One JSON file per day: a list of [text, rubrics] pairs.
    with open(f"./data/{date_str}.txt", 'w') as file:
        file.write(json.dumps(data))

## Обработка данных и обучение

### Обработка данных

In [2]:
import pandas as pd
import numpy as np

In [29]:
# Load the labelled corpus from data.csv into a DataFrame.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column next to
# "text" and "topic"; it looks like a saved index, so passing index_col=0
# would presumably drop it. Confirm before changing.
data = pd.read_csv("data.csv")

In [30]:
# Display the loaded frame as the cell output.
data

Unnamed: 0,text,topic
0,Комиссия по вопросам топонимики и охраны истор...,Бывший СССР
1,Компания Apple может выпустить iPhone 8 в 2018...,Наука и техника
2,В Петербурге сотрудники ОМОН «Бастион» Росгвар...,Силовые структуры
3,Группа английских ученых из Imperial College и...,Наука и техника
4,Вечером в пятницу российские миротворцы в Южно...,Бывший СССР
...,...,...
100953,Россия находится на первом месте в мире по чис...,Путешествия
100954,Вратарь «Ботафого» и сборной Бразилии Жефферсо...,Спорт
100955,"Президент России Владимир Путин в пятницу, 4 н...",Россия
100956,Акции американской компании Virgin Galactic пр...,Экономика


In [31]:
from sklearn.utils import shuffle

# Rubric name -> integer class id (0..8). Some rubrics deliberately share an
# id and are treated as one class: "Россия"/"Общество" and
# "Путешествия"/"Туризм".
TOPIC_TO_LABEL = {
    "Россия": 0,
    "Общество": 0,
    "Экономика": 1,
    "Силовые структуры": 2,
    "Бывший СССР": 3,
    "Спорт": 4,
    "Забота о себе": 5,
    "Строительство": 6,
    "Путешествия": 7,
    "Туризм": 7,
    "Наука и техника": 8,
}

X = data.copy()
# Rubrics missing from the table are passed through unchanged.
X["topic"] = X["topic"].map(lambda topic: TOPIC_TO_LABEL.get(topic, topic))

# Seed the shuffle: the later train/test split uses a fixed random_state, but
# that only makes the split reproducible if the row order is reproducible too.
X = shuffle(X, random_state=42)
Y = X["topic"]
X = X["text"]

#### Удаление стоп-слов

In [32]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import word_tokenize
from tqdm import tqdm

russian_stop_words = stopwords.words("russian")
noise = list(punctuation) + russian_stop_words + ["«", "»"]
# Set copy of `noise` for O(1) membership tests; the check below runs once
# per token of the whole corpus.
noise_lookup = set(noise)

# Tokenize every document and drop punctuation and stop-word tokens.
# NOTE(review): tokens are compared as-is (no lower-casing), so capitalised
# stop words are presumably kept. Confirm this is intended.
res = [
    " ".join(token for token in word_tokenize(v) if token not in noise_lookup)
    for v in tqdm(list(X))
]
X = pd.DataFrame(res)[0]

[nltk_data] Downloading package punkt to
[nltk_data]     /home/andreyserov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/andreyserov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████████████████████████████████████████████████████████████████████| 100958/100958 [01:51<00:00, 904.80it/s]


#### Лемматизация

In [33]:
from pymystem3 import Mystem
# A single Mystem instance is reused for the whole corpus.
m = Mystem()

# Lemmatize each document; the pieces returned by m.lemmatize are
# concatenated back into one string per document.
res = ["".join(m.lemmatize(doc)) for doc in tqdm(list(X))]
X = pd.DataFrame(res)[0]

100%|██████████████████████████████████████████████████████████████████████████| 100958/100958 [04:51<00:00, 346.20it/s]


In [34]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the documents for testing; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)
# Show the sizes of the four parts as the cell output.
len(x_train), len(y_train), len(x_test), len(y_test)

(90862, 90862, 10096, 10096)

#### TF-IDF

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# TF-IDF over n-grams of length 1 to 4. With a float min_df, terms found in
# fewer than that fraction (0.1%) of the training documents are dropped.
tfidf_vec = TfidfVectorizer(smooth_idf=True, min_df=0.001, ngram_range=(1, 4))

# Fit the vocabulary and IDF weights on the training split only, then apply
# the fitted vectorizer to the test split.
# NOTE: the names spell "tdidf"; the training cell refers to them as written.
x_train_tdidf = tfidf_vec.fit_transform(x_train)
x_test_tdidf = tfidf_vec.transform(x_test)
# Display the training matrix (shape, dtype, number of stored values).
x_train_tdidf

<90862x20860 sparse matrix of type '<class 'numpy.float64'>'
	with 11758462 stored elements in Compressed Sparse Row format>

### Обучение

In [40]:
from keras.models import Sequential
from keras.layers import Flatten, Dense
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from keras.layers import BatchNormalization, Activation, Dropout
from keras.optimizers import Nadam

# Ten identical hidden blocks: Dense(400, no bias) -> BatchNorm -> ReLU ->
# Dropout(0.1), followed by a 9-way softmax output layer.
hidden_layers = []
for _ in range(10):
    hidden_layers += [
        Dense(400, kernel_initializer="he_normal", use_bias=False),
        BatchNormalization(),
        Activation("relu"),
        Dropout(rate=0.1),
    ]
output_layer = Dense(9, activation="softmax")
model = Sequential(hidden_layers + [output_layer])

# Integer class labels are used, hence the sparse categorical loss.
optimizer = Nadam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

In [41]:
# The sparse TF-IDF matrices are converted to dense arrays for training.
# They are float64, so casting to float32 before densifying halves the memory
# of the dense copies (Keras computes in float32 by default).
# NOTE(review): the held-out test split is used as validation data, so the
# reported validation metrics are measured on the test set.
model.fit(
    x_train_tdidf.astype(np.float32).toarray(), y_train.astype(np.int32),
    epochs=5,
    validation_data=(x_test_tdidf.astype(np.float32).toarray(), y_test.astype(np.int32)),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f9e68233a90>