In [3]:
# =========================================
# 0. PACKAGES & CONSTANTS
# =========================================
import warnings, requests, numpy as np, pandas as pd
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, confusion_matrix
from sentence_transformers import SentenceTransformer
from catboost import CatBoostClassifier

warnings.simplefilter("ignore", RuntimeWarning)

FIN_CSV  =  "/kaggle/input/dataset/investing_news.csv"
GEO_CSV  =  "/kaggle/input/topperday/top_per_day_news.csv"

PCA_DIM  = 64          # per block → 128 total

In [4]:
import re
def load_fin(path: Path) -> pd.DataFrame:
    """Parse Investing.com dump even when title contains commas."""
    rows = []
    with open(path, encoding="utf-8-sig") as fh:
        next(fh)  # skip header
        for line in fh:
            line = line.strip().strip("\ufeff")
            m = re.match(r"^(.*?),(https?://[^,]+),(.*)$", line)
            if not m:
                continue
            title, url, raw = m.groups()
            date_match = re.search(r"\d{2}\.\d{2}\.\d{4}", raw)
            if not date_match:
                continue
            dt = pd.to_datetime(date_match.group(), format="%d.%m.%Y", errors="coerce")
            if pd.isna(dt):
                continue
            rows.append((dt.normalize(), title.strip(), url))
    df = pd.DataFrame(rows, columns=["date", "title", "url"])
    return df.assign(src="fin", weight=1.0)




def load_geo(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, encoding="utf-8-sig")
    # Expect columns: date,title,url,sim  (sim∈[0,1])
    if "date" not in df.columns:
        raise ValueError("В geo‑CSV нет столбца 'date'. Проверьте заголовки.")
    df["date"] = pd.to_datetime(df["date"], errors="coerce").dt.normalize()
    df = df.dropna(subset=["date", "title"]).copy()
    df["weight"] = df["sim"].clip(0, 1).fillna(0.0)
    return df.loc[:, ["date", "title", "url", "weight"]].assign(src="geo")


fin_news = load_fin(FIN_CSV)
geo_news = load_geo(GEO_CSV)
print(f"FIN {len(fin_news):,} | GEO {len(geo_news):,}")

news = pd.concat([fin_news, geo_news], ignore_index=True)


FIN 1,343 | GEO 2,311


In [5]:

# =========================================
# 2. SENTENCE EMBEDDINGS & SIMPLE SENTIMENT
# =========================================
st_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

POS = ("рост","прибавил","увелич","рекорд")
NEG = ("падени","снизил","упал","минимум")

news["sent"] = news["title"].str.lower().apply(
    lambda t: np.tanh((sum(w in t for w in POS) - sum(w in t for w in NEG))/2)
)
news["emb"] = list(st_model.encode(news["title"].tolist(), batch_size=128, show_progress_bar=False))

# =========================================
# 3. DAILY AGGREGATION (separate FIN / GEO)
# =========================================

def aggregate_block(df: pd.DataFrame) -> pd.DataFrame:
    """Return daily DF with weighted mean embedding & sentiment."""
    def agg_day(day_df):
        w = day_df["weight"].to_numpy()
        sent_w = np.average(day_df["sent"], weights=w)
        emb_stack = np.average(np.vstack(day_df["emb"]), axis=0, weights=w)
        return pd.Series({"sent_mean": sent_w, "n": len(day_df), "emb": emb_stack})
    out = df.groupby("date").apply(agg_day)
    return out

fin_day = aggregate_block(news[news.src=="fin"].copy())
geo_day = aggregate_block(news[news.src=="geo"].copy())

print(f"fin_day rows {len(fin_day)}, geo_day {len(geo_day)}")

# =========================================
# 4. PCA‑64 PER BLOCK
# =========================================

def pca_stack(day_df: pd.DataFrame, prefix: str) -> pd.DataFrame:
    emb_mat = np.vstack(day_df["emb"].values)
    pca = PCA(n_components=PCA_DIM, random_state=42).fit(emb_mat)
    comps = pca.transform(emb_mat)
    cols = [f"{prefix}p{i}" for i in range(PCA_DIM)]
    return pd.concat([day_df.drop(columns=["emb"]), pd.DataFrame(comps, index=day_df.index, columns=cols)], axis=1)

fin_day = pca_stack(fin_day, "fin_")
geo_day = pca_stack(geo_day, "geo_")

# =========================================
# 5. SHOCK / LAG / ROLL FEATURES PER BLOCK
# =========================================
for df, pref in [(fin_day,"fin_"),(geo_day,"geo_")]:
    roll = df["sent_mean"].rolling(30, min_periods=10)
    z = (df["sent_mean"] - roll.mean())/roll.std(ddof=0)
    df[f"{pref}shock"] = (z.abs()>1.8).astype(int)
    df[f"{pref}shock_mag"] = z.abs().fillna(0)
    for lag in [1,2,3]:
        df[f"{pref}sent_lag{lag}"] = df["sent_mean"].shift(lag)
        df[f"{pref}shock_lag{lag}"] = df[f"{pref}shock"].shift(lag)
    df[f"{pref}sent_roll7"] = df["sent_mean"].rolling(7).mean()


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  out = df.groupby("date").apply(agg_day)
  out = df.groupby("date").apply(agg_day)


fin_day rows 724, geo_day 2311


In [7]:
# =========================================
# 6. CANDLES & TECHNICALS
# =========================================

def candles(sec:str,start:str,end:str):
    url = f"https://iss.moex.com/iss/engines/stock/markets/shares/securities/{sec}/candles.json"
    pars = {"from":start, "till":end, "interval":24, "start":0}
    keep=["end","open","high","low","close","value"]
    frames=[]
    while True:
        js=requests.get(url,params=pars,timeout=20).json()["candles"]
        if not js["data"]: break
        cols=[c.lower() for c in js["columns"]]
        frames.append(pd.DataFrame(js["data"],columns=cols)[keep])
        pars["start"]+=len(frames[-1])
    df=pd.concat(frames,ignore_index=True)
    df.columns=["date","open","high","low","close","volume"]
    df["date"]=pd.to_datetime(df["date"]).dt.normalize()
    return df.set_index("date").sort_index()

START, END = "2014-01-01", "2025-05-21"
prices = candles("MOEX",START,END)
prices["ret1"] = prices.close.pct_change()
prices["sma5"] = prices.close.rolling(5).mean()
prices["sma20"] = prices.close.rolling(20).mean()
prices["vol_z"] = (prices.volume - prices.volume.rolling(20).mean())/prices.volume.rolling(20).std(ddof=0)
prices["atr14"] = (prices.high - prices.low).rolling(14).mean()/prices.close
prices = prices.dropna()


In [8]:
# =========================================
# 7. MERGE + SHIFT + TARGET
# =========================================
feat = prices.join(fin_day, how="left").join(geo_day, how="left", rsuffix="_r").fillna(0)
# shift all news-derived cols by +1 trading day
news_cols = [c for c in feat.columns if c.startswith("fin_") or c.startswith("geo_")]
feat[news_cols] = feat[news_cols].shift(1)

feat["vol_next"] = ((prices.high - prices.low)/prices.close).shift(-1)
thr = feat.vol_next.median()
feat["vol_cls"] = (feat.vol_next > thr).astype(int)

feat["weekday"] = feat.index.weekday
feat["month"] = feat.index.month
feat = feat.dropna()
print(f"dataset {feat.shape} — high‑σ share {feat.vol_cls.mean():.3f}")


dataset (2852, 164) — high‑σ share 0.500


  feat["vol_next"] = ((prices.high - prices.low)/prices.close).shift(-1)
  feat["vol_cls"] = (feat.vol_next > thr).astype(int)
  feat["weekday"] = feat.index.weekday
  feat["month"] = feat.index.month


In [9]:
# =========================================
# 8. CATBOOST CV
# =========================================
X = feat.drop(columns=["vol_next","vol_cls"])
y = feat.vol_cls
cat_idx = [i for i,c in enumerate(X.columns) if c in ("weekday","month")]

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import precision_recall_curve, classification_report
import numpy as np

cv = TimeSeriesSplit(5)
probas_all, y_all = [], []

for tr, te in cv.split(X):
    model = CatBoostClassifier(
        iterations=400, depth=7, learning_rate=0.05,
        colsample_bylevel=0.8, random_seed=42, verbose=0,
        loss_function="Logloss", auto_class_weights="Balanced")
    model.fit(X.iloc[tr], y.iloc[tr], cat_features=cat_idx)

    probas = model.predict_proba(X.iloc[te])[:, 1]   # P(class=high)
    probas_all.append(probas)
    y_all.append(y.iloc[te].values)


In [10]:

# ---------- 1. подбираем τ на валидационных фолдах ----------
probas_all = np.concatenate(probas_all)
y_all      = np.concatenate(y_all)

precision, recall, thr = precision_recall_curve(y_all, probas_all)

target_rec = 0.65                          # вашу цель можно менять
idx = np.where(recall >= target_rec)[0][-1]
tau = thr[idx]
print(f"Выбранный τ = {tau:.2f}  при recall ≥ {recall[idx]:.3f}")

# ---------- 2. применяем τ и считаем метрики ----------
y_pred = (probas_all >= tau).astype(int)
print(classification_report(y_all, y_pred, digits=3,
                            target_names=["low","high"]))


Выбранный τ = 0.33  при recall ≥ 0.650
              precision    recall  f1-score   support

         low      0.642     0.513     0.570      1305
        high      0.523     0.650     0.580      1070

    accuracy                          0.575      2375
   macro avg      0.582     0.582     0.575      2375
weighted avg      0.588     0.575     0.575      2375

