In [1]:
# Mount Google Drive inside the Colab VM (Colab-only cell).
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import os

# Make the project folder the working directory so that the relative paths
# used further down the notebook are resolved against it.
PROJECT_DIR = '/content/drive/MyDrive/DeepGap'
os.chdir(PROJECT_DIR)

In [3]:
# @title 1. Setup and Imports
# Install the transformers library if not already installed
!pip install transformers torch pandas tqdm

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.notebook import tqdm

# --- Device Configuration ---
# Run on the GPU whenever CUDA is available (as on a Colab GPU runtime) and
# fall back to the CPU otherwise; this choice dominates inference speed.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

if device_name == 'cuda':
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU Name: {gpu_name}")

Using device: cuda
GPU Name: Tesla T4


In [4]:
# # @title 2. Create a Sample `sentiment.csv` (Skip if you have your own)
# data = {
#     'title': [
#         'Tech Giant Reports Record Quarterly Earnings',
#         'Federal Reserve Hikes Interest Rates Amid Inflation Concerns',
#         'New Product Launch Exceeds Market Expectations',
#         'Company Faces Major Data Breach, Stock Tumbles',
#         'Analysts Remain Neutral on Upcoming Earnings Report',
#         'Merger Talks Between Two Leading Firms Break Down',
#         'Biotech Stock Soars on Positive Drug Trial Results',
#         'Retail Sales Slump, Signaling Economic Slowdown'
#     ],
#     'text': [
#         'The company announced a 20% increase in revenue, beating all analyst predictions. The CEO expressed strong optimism for the next quarter, citing robust demand for their new cloud services.',
#         'In a move to curb rising inflation, the central bank raised its key interest rate by 0.5 percentage points. Markets reacted negatively, with major indexes falling over 2% on the news.',
#         'The latest smartphone model was met with critical acclaim and long lines at stores worldwide. Initial sales figures suggest this will be the best-selling product in the company\'s history.',
#         'A cybersecurity firm revealed a massive breach affecting millions of customer records. The company\'s stock price plunged by over 15% in pre-market trading as investors reacted to the news.',
#         'While the company showed steady growth, experts are waiting for more details on their long-term strategy before issuing a definitive buy or sell rating. The stock is expected to trade sideways in the near term.',
#         'After weeks of negotiations, the proposed multi-billion dollar merger has been called off. Both companies cited irreconcilable differences over valuation as the primary reason for the failure.',
#         'The experimental drug for treating a rare disease showed a 95% success rate in Phase 3 trials. The company now seeks fast-track approval from regulatory bodies, sending its stock price soaring.',
#         'Data released today showed a sharp decline in consumer spending for the third consecutive month. Economists are now warning of a potential recession if the trend continues.'
#     ]
# }

# df = pd.DataFrame(data)
# # Save the dataframe to a csv file
# df.to_csv('sentiment.csv', index=False)

# print("Sample 'sentiment.csv' created successfully.")
# display(df.head())

In [5]:
# data = {
#     'title': [
#         'Технологический гигант сообщил о рекордной квартальной прибыли',
#         'ЦБ повышает ключевую ставку на фоне опасений по поводу инфляции',
#         'Запуск нового продукта превзошел ожидания рынка',
#         'Компания столкнулась с крупной утечкой данных, акции падают',
#         'Аналитики сохраняют нейтральный взгляд на предстоящий отчет',
#         'Переговоры о слиянии между двумя ведущими компаниями сорвались',
#         'Акции биотехнологической компании взлетают на фоне положительных результатов испытаний',
#         'Розничные продажи падают, сигнализируя об экономическом замедлении'
#     ],
#     'text': [
#         'Компания объявила о росте выручки на 20%, превзойдя все прогнозы аналитиков. Генеральный директор выразил сильный оптимизм относительно следующего квартала, сославшись на высокий спрос на их новые облачные сервисы.',
#         'В рамках борьбы с растущей инфляцией центральный банк повысил ключевую ставку на 0,5 процентного пункта. Рынки отреагировали негативно, основные индексы упали более чем на 2% на этой новости.',
#         'Последняя модель смартфона была встречена восторженными отзывами и длинными очередями в магазинах по всему миру. Предварительные данные о продажах показывают, что это станет самым продаваемым продуктом в истории компании.',
#         'Кибербезопасная компания выявила масштабную утечку, затронувшую миллионы записей клиентов. Цена акций компании рухнула более чем на 15% в торговле до открытия рынка, поскольку инвесторы отреагировали на новости.',
#         'Хотя компания показала стабильный рост, эксперты ждут более подробной информации о ее долгосрочной стратегии, прежде чем выдавать окончательную рекомендацию. Ожидается, что в краткосрочной перспективе акция будет торговаться в боковом диапазоне.',
#         'После недель переговоров предлагаемое слияние на миллиарды долларов было отменено. Обе компании назвали непреодолимые разногласия по оценке в качестве основной причины провала.',
#         'Экспериментальный препарат для лечения редкого заболевания показал 95% эффективность в испытаниях фазы 3. Компания теперь ищет ускоренного одобрения у регуляторных органов, что приводит к взрывному росту цены ее акций.',
#         'Опубликованные сегодня данные показали резкое снижение потребительских расходов в третий месяц подряд. Экономисты теперь предупреждают о возможной рецессии, если эта тенденция продолжится.'
#     ]
# }

# df = pd.DataFrame(data)
# # Save the dataframe to a csv file
# df.to_csv('sentiment_ru.csv', index=False)

# print("Sample 'sentiment_ru.csv' with Russian text created successfully.")
# display(df.head())

In [6]:
# @title 3. Load Sentiment Model and Tokenizer
# Alternative checkpoint (FinBERT) -- uncomment these three lines and comment
# out the active block below to switch models:
# tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
# model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
# model_name = 'FinBERT'

# Active checkpoint: tokenizer and classifier are loaded from the same Hub id.
model_name = 'rubert_tiny'
checkpoint = "mxlcw/rubert-tiny2-russian-financial-sentiment"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Place the model on the configured device (GPU or CPU) and switch it to
# evaluation mode for inference.
model.to(device).eval()

print(f"{model_name} model and tokenizer loaded successfully.")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/922 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/117M [00:00<?, ?B/s]

rubert_tiny model and tokenizer loaded successfully.


In [7]:
BATCH_SIZE = 32 # Adjust based on your GPU memory. 16 or 32 is a good start.


def _resolve_label_indices(id2label):
    """Return the (positive, neutral, negative) class indices for ``id2label``.

    Label names are compared case-insensitively. If any of the three names is
    missing from ``id2label``, a warning is emitted and the fixed layout
    neutral=0, positive=1, negative=2 is returned instead.
    """
    import warnings  # local import keeps this cell self-contained

    name_to_index = {str(name).strip().lower(): int(idx) for idx, name in id2label.items()}
    try:
        return tuple(name_to_index[name] for name in ('positive', 'neutral', 'negative'))
    except KeyError:
        warnings.warn(
            "Could not find 'positive'/'neutral'/'negative' in model.config.id2label "
            f"({id2label!r}); assuming neutral=0, positive=1, negative=2."
        )
        return (1, 0, 2)


def join_sentiment_columns(df, batch_size=None):
    """Score every row of ``df`` with the loaded sentiment model, in place.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``. The
    ``title`` and ``text`` columns are joined into ``combined_text`` (missing
    values become empty strings), classified in batches, and the following
    columns are added to ``df``:

    * ``sentiment_label`` -- name of the most probable class
    * ``sentiment_score`` -- P(positive) - P(negative)
    * ``prob_negative``, ``prob_neutral``, ``prob_positive`` -- class probabilities

    The class columns are looked up by name in ``model.config.id2label``
    rather than by fixed position, so a checkpoint that orders its classes
    differently (possibly the commented-out FinBERT alternative -- verify)
    still fills the right columns.

    Args:
        df: DataFrame with ``title`` and ``text`` columns; modified in place.
        batch_size: Texts per forward pass. Defaults to ``BATCH_SIZE``.

    Returns:
        None. An empty ``df`` gets empty result columns instead of an error.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Combine title and text for a more comprehensive analysis. NaN becomes ''
    # and astype(str) guards against non-string cells breaking the concatenation.
    titles = df['title'].fillna('').astype(str)
    bodies = df['text'].fillna('').astype(str)
    df['combined_text'] = titles + ". " + bodies
    texts_to_analyze = df['combined_text'].tolist()

    index_to_label = {int(idx): name for idx, name in model.config.id2label.items()}
    pos_idx, neu_idx, neg_idx = _resolve_label_indices(index_to_label)

    print(f"Analyzing {len(texts_to_analyze)} texts in batches of {batch_size}...")

    # --- Batch Processing ---
    batch_probabilities = []
    for start in tqdm(range(0, len(texts_to_analyze), batch_size)):
        batch_texts = texts_to_analyze[start:start + batch_size]

        # Tokenize the whole batch at once and move it to the model's device.
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # No gradients are needed for inference; skipping them saves time and memory.
        with torch.no_grad():
            logits = model(**inputs).logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)

        batch_probabilities.append(probabilities.cpu().numpy())

    if batch_probabilities:
        prob_array = np.vstack(batch_probabilities)
    else:
        # Nothing to score: keep a 0-row array wide enough for the column lookups below.
        prob_array = np.empty((0, max(pos_idx, neu_idx, neg_idx) + 1), dtype=np.float32)

    labels = [index_to_label[int(i)] for i in prob_array.argmax(axis=1)]
    # Single-number sentiment: positive counts +1, neutral 0, negative -1.
    scores = prob_array[:, pos_idx] - prob_array[:, neg_idx]

    # --- Finalize DataFrame ---
    df['sentiment_label'] = labels
    # Assigned as a list of scalars, the same way the per-row loop used to build it.
    df['sentiment_score'] = list(scores)
    df['prob_negative'] = prob_array[:, neg_idx]
    df['prob_neutral'] = prob_array[:, neu_idx]
    df['prob_positive'] = prob_array[:, pos_idx]

    print("\nAnalysis complete!")

In [8]:
# Folder, relative to the working directory, holding the per-ticker CSV files.
_data_dir_parts = ('data', 'sentiment')
DATA_DIR = os.path.join(*_data_dir_parts)

In [None]:
# df = pd.read_csv(os.path.join(DATA_DIR, 'result_polus.csv'))
# df

In [None]:
# join_sentiment_columns(df)
# df

In [None]:
# df.loc[df['sentiment_label'] == 'negative', :]

In [None]:
# df = pd.read_csv('sentiment_ru.csv')
# join_sentiment_columns(df)
# df

In [13]:
# Tickers for which a '<TICKER>_news.csv' file exists in DATA_DIR
# (the suffix is stripped to leave just the ticker symbol).
file_list = os.listdir(DATA_DIR)
news_suffix = '_news.csv'
[name[:-len(news_suffix)] for name in file_list if name.endswith(news_suffix)]

['VTBR',
 'MOEX',
 'MAGN',
 'CBOM',
 'FESH',
 'IRKT',
 'POSI',
 'MVID',
 'AFLT',
 'ALRS',
 'AFKS',
 'BSBP',
 'BELU',
 'SNGSP',
 'CHMF',
 'PHOR',
 'MTSS',
 'RUAL',
 'RTKM',
 'HYDR']

In [14]:
# --- Configuration ---
# CSV_FILE_PATH = 'sentiment.csv'

# --- Data Loading and Preprocessing ---
# Single-file variant, kept for reference:
# df = pd.read_csv(CSV_FILE_PATH)

# Batch variant: score every '<TICKER>_news.csv' in DATA_DIR and write the
# result next to it as '<TICKER>_sentiment.csv'. Tickers whose output file
# already exists are skipped, so re-running the cell only handles new inputs.
news_suffix = '_news.csv'
for fname in os.listdir(DATA_DIR):
  if fname.endswith(news_suffix):
    ticker = fname[:-len(news_suffix)]
    sentiment_path = os.path.join(DATA_DIR, ticker + '_sentiment.csv')
    if not os.path.exists(sentiment_path):
      df = pd.read_csv(os.path.join(DATA_DIR, fname))
      join_sentiment_columns(df)  # adds the sentiment columns to df in place
      df.to_csv(sentiment_path)

Analyzing 242 texts in batches of 32...


  0%|          | 0/8 [00:00<?, ?it/s]


Analysis complete!
Analyzing 161 texts in batches of 32...


  0%|          | 0/6 [00:00<?, ?it/s]


Analysis complete!
Analyzing 210 texts in batches of 32...


  0%|          | 0/7 [00:00<?, ?it/s]


Analysis complete!
Analyzing 42 texts in batches of 32...


  0%|          | 0/2 [00:00<?, ?it/s]


Analysis complete!
Analyzing 28 texts in batches of 32...


  0%|          | 0/1 [00:00<?, ?it/s]


Analysis complete!
Analyzing 20 texts in batches of 32...


  0%|          | 0/1 [00:00<?, ?it/s]


Analysis complete!
Analyzing 49 texts in batches of 32...


  0%|          | 0/2 [00:00<?, ?it/s]


Analysis complete!
Analyzing 40 texts in batches of 32...


  0%|          | 0/2 [00:00<?, ?it/s]


Analysis complete!
Analyzing 173 texts in batches of 32...


  0%|          | 0/6 [00:00<?, ?it/s]


Analysis complete!
Analyzing 180 texts in batches of 32...


  0%|          | 0/6 [00:00<?, ?it/s]


Analysis complete!
Analyzing 104 texts in batches of 32...


  0%|          | 0/4 [00:00<?, ?it/s]


Analysis complete!
Analyzing 152 texts in batches of 32...


  0%|          | 0/5 [00:00<?, ?it/s]


Analysis complete!
Analyzing 21 texts in batches of 32...


  0%|          | 0/1 [00:00<?, ?it/s]


Analysis complete!
Analyzing 135 texts in batches of 32...


  0%|          | 0/5 [00:00<?, ?it/s]


Analysis complete!
Analyzing 106 texts in batches of 32...


  0%|          | 0/4 [00:00<?, ?it/s]


Analysis complete!
Analyzing 99 texts in batches of 32...


  0%|          | 0/4 [00:00<?, ?it/s]


Analysis complete!
Analyzing 223 texts in batches of 32...


  0%|          | 0/7 [00:00<?, ?it/s]


Analysis complete!
Analyzing 143 texts in batches of 32...


  0%|          | 0/5 [00:00<?, ?it/s]


Analysis complete!
Analyzing 87 texts in batches of 32...


  0%|          | 0/3 [00:00<?, ?it/s]


Analysis complete!
Analyzing 90 texts in batches of 32...


  0%|          | 0/3 [00:00<?, ?it/s]


Analysis complete!
