In [22]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, time, date
import pymorphy3
import re
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec, KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
import yfinance as yf
import numpy as np
from transformers import AutoTokenizer, BertModel, AutoModelForSequenceClassification
import torch
import moexalgo


In [23]:
df = pd.read_csv("./parser/data.tsv", sep='\t')
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50174 entries, 0 to 50173
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  50174 non-null  int64 
 1   text        50174 non-null  object
 2   date        50174 non-null  object
 3   views       50174 non-null  int64 
 4   tag_text    50174 non-null  object
 5   href        50174 non-null  object
dtypes: int64(2), object(4)
memory usage: 2.3+ MB


Unnamed: 0.1,Unnamed: 0,text,date,views,tag_text,href
0,0,Владельцы АЗС Teboil в Финляндии предупредили ...,20:23,2043,"['Россия', 'США', 'Финляндия', 'Мария Захарова']","['/location_rossiyskaya-federatsiya/', '/locat..."
1,1,"Gunvor: продажа ""Лукойлом"" зарубежных активов ...",20:01,2842,"['Дубай', 'ЛУКОЙЛ', 'Gunvor Group']","['/tag_location_Dubajj/', '/organization_LUKOJ..."
2,2,Путин подписал закон о корректировке параметро...,19:19,2150,"['Россия', 'Владимир Путин', 'Антон Силуанов']","['/location_rossiyskaya-federatsiya/', '/perso..."
3,3,Путин подписал закон о сдаче в аренду федераль...,17:29,890,"['Россия', 'Владимир Путин']","['/location_rossiyskaya-federatsiya/', '/perso..."
4,4,Орешкин возглавит российскую делегацию на самм...,17:08,699,"['ЮАР', 'Россия', 'Большая двадцатка']","['/location_Republic_of_South_Africa/', '/loca..."
...,...,...,...,...,...,...
50169,5155,Еврокомиссия заявила о необходимости диалога с...,"24 мая 2022, 18:19",8511,"['Евросоюз', 'Еврокомиссия', 'Урсула фон дер Л...","['/organization_Evropejjskijj_sojuz/', '/organ..."
50170,5156,Комитет ГД одобрил право президента вводить сп...,"24 мая 2022, 18:13",2611,"['Россия', 'Николай Журавлев', 'Анатолий Аксак...","['/location_rossiyskaya-federatsiya/', '/perso..."
50171,5157,"Предприятие ""Роскосмоса"" будет производить дет...","24 мая 2022, 18:11",5030,"['Денис Мантуров', 'Технологии', 'Тольятти', '...","['/person/denis-manturov/', '/technology/', '/..."
50172,5158,Испания вызвалась заменить поставки российског...,"24 мая 2022, 18:07",60791,"['Испания', 'Евросоюз', 'Природный газ']","['/location_Spain/', '/organization_Evropejjsk..."


Обрабатываем данные: подгоняем дату под один формат, обрабатываем текст для последующего эмбеддинга iditf. Также сделаем эмбеддинг на основе word2vec, BERT

In [24]:
DEFAULT_DAY = 4
DEFAULT_MONTH = 11
DEFAULT_YEAR = 2025

month_dict = {"января": 1,
              "февраля": 2,
              "марта": 3,
              "апреля": 4,
              "мая": 5,
              "июня": 6,
              "июля": 7,
              "августа": 8,
              "сентября": 9,
              "октября": 10,
              "ноября": 11,
              "декабря": 12}

def date_processing(date_str):
    parts = date_str.split()

    if len(parts) == 1:
        day, month, year = DEFAULT_DAY, DEFAULT_MONTH, DEFAULT_YEAR
        clock = parts[0]

    if len(parts) == 2:
        day, month, year = DEFAULT_DAY - 1, DEFAULT_MONTH, DEFAULT_YEAR
        clock = parts[1]

    if len(parts) == 3:
        day, month, year = parts[0], month_dict[parts[1][:-1]], DEFAULT_YEAR
        clock = parts[2]

    if len(parts) == 4:
        day, month, year = parts[0], month_dict[parts[1]], parts[2][:-1]
        clock = parts[3]

    hour, minute = clock.split(':')

    return datetime(int(year), int(month), int(day), int(hour), int(minute)) 


df['date'] = df['date'].apply(date_processing)

In [25]:
# Найти значения, которые встречаются только один раз
unique_values = df['date'].value_counts()
single_occurrence = unique_values[unique_values == 1].index

# Отфильтровать DataFrame
df = df[df['date'].isin(single_occurrence)]

Добавим в наши данные таргеты, будем смотреть цены на акции, через определенные периоды после выхода новостей.

In [None]:
start_date, end_date = min(df['date']), max(df['date'])
print(start_date)

market = moexalgo.Ticker('ROSN')
prices = market.candles(start=start_date, end=end_date)

last_date = prices['end'].iloc[-1]
while last_date < end_date:
    temp_prices = market.candles(start=last_date, end=datetime.now())
    prices = pd.concat([prices, temp_prices], axis=0) 
    last_date = temp_prices['end'].iloc[-1]

interval = pd.Timedelta('10 minutes')

df = df.sort_values('date')
df['30 minutes'] = df['date'] + pd.Timedelta('30 minutes')
df['1 hour'] = df['date'] + pd.Timedelta('1 hour')

timedeltas = ['30 minutes', '1 hour', 'date']
temp_time = pd.Timedelta('0 minutes')
for i in range(1, 20):
    temp_time = temp_time + pd.Timedelta('10 minutes')
    df[f'{-i*10} minutes'] = df['date'] - temp_time
    timedeltas.append(f"-{i*10} minutes")
    
df_temp = {}

for timedelta in timedeltas:
    df_temp[timedelta] = pd.merge_asof(
        left=df,
        right=prices,
        left_on=timedelta,
        right_on='end',
        direction='backward',
        tolerance=interval
    )

    df[f'close + {timedelta}'] = df_temp[timedelta]['close']

df = df.drop(labels=['30 minutes', '1 hour', '-30 minutes', '-1 hour'], axis=1)
df = df.rename(columns={"close + date": 'close'})

2022-03-08 12:07:00


KeyError: 'Requested level (-0 minutes) does not match index name (None)'

Сделаем Word2Vec эмбеддинг предложения. Дообучим готовую модель из интернета:

Сделаем BERT embedding

In [None]:
MODEL_NAME = "cointegrated/rubert-tiny-sentiment-balanced"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
text = df['text'].tolist()

def get_sentiment_score(texts, batch_size=8):
    """
    Возвращает сентимент для списка текстов:
    - sentiment_label: 0 (негатив), 1 (нейтрально), 2 (позитив)
    - sentiment_score: от -1 до +1 (чем выше — тем позитивнее)
    """
    all_scores = []
    all_labels = []

    model.eval()
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            inputs = tokenizer(batch, return_tensors="pt", truncation=True,
                               padding=True, max_length=128)
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            labels = probs.argmax(dim=1).numpy()
            # score = p_pos - p_neg → диапазон примерно от -1 до +1
            scores = (probs[:, 2] - probs[:, 0]).numpy()
            all_scores.extend(scores)
            all_labels.extend(labels)

            if i % 10000 == 0:
              print(i)

    return np.array(all_scores), np.array(all_labels)


scores, labels = get_sentiment_score(text)
df['sent_scores'] = pd.Series(scores)
df['sent_labels'] = pd.Series(labels)

df = df.drop(labels=['text', 'tag_text', 'href', 'Unnamed: 0.1', 'Unnamed: 0'], axis=1)

first_valid_idx = df['close'].iloc[::-1].first_valid_index()
print(first_valid_idx)

if first_valid_idx is not None:
    df = df.loc[:first_valid_idx]
else:
    df = df.iloc[0:0]

n = 0
current_sum = 0 

for i, row in df.iterrows():
    if pd.isna(row['close']):
        current_sum += row['sent_scores']
        n += 1
    else:
        new_score = (current_sum + row['sent_scores']) / (n + 1)
        df.loc[i, 'sent_scores'] = new_score
        
        n = 0
        current_sum = 0

df = df.dropna(subset=['close'])
df.to_csv("./data/data_proc.csv")

'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /cointegrated/rubert-tiny-sentiment-balanced/resolve/main/tokenizer_config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000023E9E165400>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: ab877194-569c-41c8-b3ee-952b7a47fa29)')' thrown while requesting HEAD https://huggingface.co/cointegrated/rubert-tiny-sentiment-balanced/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /cointegrated/rubert-tiny-sentiment-balanced/resolve/main/tokenizer_config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000023E96CA25D0>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: f6e3f284-4b37-4374-b9ec-6ffcb9f34804)')' thrown wh

0
8
16
24
32
40
48
56
64
72
80
88
96
104
112
120
128
136
144
152
160
168
176
184
192
200
208
216
224
232
240
248
256
264
272
280
288
296
304
312
320
328
336
344
352
360
368
376
384
392
400
408
416
424
432
440
448
456
464
472
480
488
496
504
512
520
528
536
544
552
560
568
576
584
592
600
608
616
624
632
640
648
656
664
672
680
688
696
704
712
720
728
736
744
752
760
768
776
784
792
800
808
816
824
832
840
848
856
864
872
880
888
896
904
912
920
928
936
944
952
960
968
976
984
992
1000
1008
1016
1024
1032
1040
1048
1056
1064
1072
1080
1088
1096
1104
1112
1120
1128
1136
1144
1152
1160
1168
1176
1184
1192
1200
1208
1216
1224
1232
1240
1248
1256
1264
1272
1280
1288
1296
1304
1312
1320
1328
1336
1344
1352
1360
1368
1376
1384
1392
1400
1408
1416
1424
1432
1440
1448
1456
1464
1472
1480
1488
1496
1504
1512
1520
1528
1536
1544
1552
1560
1568
1576
1584
1592
1600
1608
1616
1624
1632
1640
1648
1656
1664
1672
1680
1688
1696
1704
1712
1720
1728
1736
1744
1752
1760
1768
1776
1784
1792
1800
1808
1816


41494


In [17]:
df_proc

Unnamed: 0,date,views,close,close + 30 minutes,close + 1 hour,close + -30 minutes,close + -1 hour,sent_scores,sent_labels
6861,2022-06-30 12:19:00,403903,394.35,393.75,394.35,394.40,394.35,,1.0
6862,2022-06-30 12:27:00,25432,394.45,394.30,394.35,394.80,395.00,-0.000452,1.0
6863,2022-06-30 12:31:00,663,390.90,390.80,389.65,391.20,390.80,0.000141,1.0
6864,2022-06-30 12:49:00,5098,391.00,390.95,389.90,390.95,391.10,0.214055,1.0
6865,2022-06-30 13:30:00,300,391.00,390.95,389.90,390.95,391.10,0.149373,1.0
...,...,...,...,...,...,...,...,...,...
41490,2025-10-02 22:21:00,570,369.05,372.90,365.40,369.95,,-0.992507,0.0
41491,2025-10-02 22:23:00,26340,358.25,370.40,,312.00,,0.054344,1.0
41492,2025-10-02 22:51:00,1200,369.95,369.05,372.90,,,-0.018896,1.0
41493,2025-10-03 00:44:00,258,312.00,358.25,370.40,,,0.004032,1.0
