In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [27]:
# Directory holding the input CSVs. Kept in one place so the notebook can be
# pointed at another machine's copy of the data by editing a single line.
DATA_DIR = 'C:/Users/olafe/Downloads/forecast_data/forecast_data'

# Candle and news tables; the "_2" files are the second data split used later
# in the notebook as the test-period inputs.
candles = pd.read_csv(f'{DATA_DIR}/candles.csv')
candles_2 = pd.read_csv(f'{DATA_DIR}/candles_2.csv')
news = pd.read_csv(f'{DATA_DIR}/news.csv')
news_2 = pd.read_csv(f'{DATA_DIR}/news_2.csv')

In [28]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import pickle
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# library deprecation notices, not just noisy ones.
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator.
np.random.seed(42)

# Model input columns, in the order they are passed to the scaler/regressors.
FEATS = [
    'momentum_5',
    'volatility_5',
    'price_range',
    'news_count',
]

def create_features(df):
    """Return a sorted copy of ``df`` with per-ticker price features added.

    The input is not modified. The copy has ``begin`` parsed to datetimes, is
    ordered by ``ticker`` then ``begin``, and gets a fresh 0..n-1 index.

    Added columns:
        momentum_5   -- 5-row percentage change of ``close`` within a ticker.
        volatility_5 -- rolling std (window 5, at least 1 observation) of the
                        1-row percentage change of ``close`` within a ticker.
        price_range  -- ``(high - low) / close`` for the row.

    Missing values in the three added columns are replaced with 0.0.
    """
    frame = df.copy()
    frame['begin'] = pd.to_datetime(frame['begin'])
    frame = frame.sort_values(['ticker', 'begin']).reset_index(drop=True)

    frame['momentum_5'] = frame.groupby('ticker')['close'].pct_change(5)

    one_step_return = frame.groupby('ticker')['close'].pct_change()
    rolling_std = (
        one_step_return
        .groupby(frame['ticker'])
        .rolling(5, min_periods=1)
        .std()
    )
    # The grouped rolling result carries (ticker, row) as index; dropping the
    # ticker level lets the assignment align on the original row labels.
    frame['volatility_5'] = rolling_std.reset_index(level=0, drop=True)

    frame['price_range'] = (frame['high'] - frame['low']) / frame['close']

    engineered = ['momentum_5', 'volatility_5', 'price_range']
    frame[engineered] = frame[engineered].fillna(0.0)
    return frame

def add_news_count(df, news):
    """Return a copy of ``df`` with a ``news_count`` column.

    ``news_count`` is the number of rows in ``news`` whose ``publish_date``
    falls on the same calendar day as the row's ``begin``; the count is over
    all news rows for that day, not split by ticker. Days without news get 0.

    When ``news`` is ``None`` or empty, every row gets ``news_count = 0.0``.
    ``df['begin']`` must already be a datetime column. Neither input is
    modified.
    """
    out = df.copy()
    if news is None or len(news) == 0:
        out['news_count'] = 0.0
        return out

    # One row per calendar day with the number of news items published on it.
    publication_day = pd.to_datetime(news['publish_date']).dt.normalize()
    per_day = (
        pd.DataFrame({'date': publication_day})
        .groupby('date')
        .size()
        .reset_index(name='news_count')
    )

    # Temporary join key: the candle timestamp truncated to midnight.
    out['date'] = out['begin'].dt.normalize()
    merged = out.merge(per_day, on='date', how='left')
    merged['news_count'] = merged['news_count'].fillna(0.0)
    return merged.drop(columns=['date'])

def create_targets(df, horizons=(1,20)):
    """Return a copy of ``df`` with forward-return target columns.

    For each horizon ``h`` a column ``target_return_{h}d`` is added holding
    ``close[t + h] / close[t] - 1``, where ``t + h`` is the row ``h`` positions
    later *within the same ticker*. The last ``h`` rows of every ticker have no
    future price and get NaN.

    The look-ahead is done with a per-ticker shift, so the result is correct
    even when rows of different tickers are interleaved. (Shifting the whole
    column instead only works when each ticker's rows are contiguous.)

    Rows are expected to be in chronological order within each ticker; this
    function does not sort. A missing ``close`` on either end of the window
    yields a NaN target (no forward filling is applied).

    Parameters
    ----------
    df : DataFrame with at least ``ticker`` and ``close`` columns.
    horizons : iterable of int
        Look-ahead distances, in rows.
    """
    out = df.copy()
    close_by_ticker = out.groupby('ticker')['close']
    for h in horizons:
        # Price h rows ahead for the same ticker; NaN where it does not exist.
        future_close = close_by_ticker.shift(-h)
        out[f'target_return_{h}d'] = future_close / out['close'] - 1
    return out

def fit(train_candles, train_news=None, split_date='2024-09-08', model_path='model.pkl'):
    """Train one linear regression per horizon and pickle the fitted bundle.

    Steps: keep candles with ``begin`` on or before ``split_date``, build the
    features, news counts and 1-/20-step forward-return targets, drop rows that
    lack either target, standardise the ``FEATS`` columns, and fit a
    ``LinearRegression`` for each horizon on the scaled features.

    The MAE printed per horizon is computed on the training rows themselves.

    Parameters
    ----------
    train_candles : DataFrame of candles; not modified.
    train_news : optional DataFrame of news passed to ``add_news_count``.
    split_date : last date (inclusive) of candles used for training.
    model_path : file the pickle is written to. The payload is a dict with
        keys ``'features'``, ``'scaler'`` and ``'models'`` (``'reg_1'``,
        ``'reg_20'``).

    Returns
    -------
    None
    """
    print("[FIT] Подготовка данных...")

    horizons = (1, 20)
    target_cols = [f'target_return_{h}d' for h in horizons]

    history = train_candles.copy()
    history['begin'] = pd.to_datetime(history['begin'])
    in_sample = history.loc[history['begin'] <= pd.to_datetime(split_date)].copy()

    dataset = create_features(in_sample)
    dataset = add_news_count(dataset, train_news)
    dataset = create_targets(dataset)

    # Rows missing any forward return cannot serve as supervised examples.
    df_train = dataset.dropna(subset=target_cols).reset_index(drop=True)
    print(f"[FIT] Обучающих строк: {len(df_train)}")

    X = df_train[FEATS].values
    scaler = StandardScaler()
    scaler.fit(X)
    Xs = scaler.transform(X)

    models = {}
    for h, target in zip(horizons, target_cols):
        y = df_train[target].values
        reg = LinearRegression()
        reg.fit(Xs, y)
        models[f'reg_{h}'] = reg
        residuals = y - reg.predict(Xs)
        mae = float(np.abs(residuals).mean())
        print(f"[FIT] Горизонт {h} дней MAE: {mae:.6f}")

    bundle = {'features': FEATS, 'scaler': scaler, 'models': models}
    with open(model_path, 'wb') as f:
        pickle.dump(bundle, f)
    print(f"[FIT] Модель сохранена: {model_path}")

def predict(test_candles, test_news=None, model_path='model.pkl', output_path='submission2.csv'):
    """Score the latest candle of every ticker and write the submission CSV.

    The pickled bundle at ``model_path`` supplies the feature list, the scaler
    and the ``'reg_1'`` / ``'reg_20'`` regressors. Features are built from
    ``test_candles`` (plus ``test_news``), the row with the latest ``begin`` of
    each ticker is scored, and the 20 output columns ``p1``..``p20`` are a
    linear interpolation from the 1-step prediction (``p1``) to the 20-step
    prediction (``p20``), clipped to [-0.5, 0.5] and rounded to 6 decimals.

    Parameters
    ----------
    test_candles : DataFrame of candles; not modified.
    test_news : optional DataFrame of news passed to ``add_news_count``.
    model_path : pickle file to load the fitted bundle from.
    output_path : CSV file the submission is written to (without the index).

    Returns
    -------
    DataFrame with columns ``ticker``, ``p1`` .. ``p20``, one row per ticker.
    """
    print("[PREDICT] Создание сабмита...")

    quotes = test_candles.copy()
    quotes['begin'] = pd.to_datetime(quotes['begin'])

    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # load model files that this notebook (or another trusted source) produced.
    with open(model_path, 'rb') as f:
        bundle = pickle.load(f)
    feature_names = bundle['features']
    scaler = bundle['scaler']
    regressors = bundle['models']

    featured = add_news_count(create_features(quotes), test_news)

    # Most recent candle of each ticker.
    latest_labels = featured.groupby('ticker')['begin'].idxmax()
    latest_rows = featured.loc[latest_labels].reset_index(drop=True)

    Xs = scaler.transform(latest_rows[feature_names].values)
    short_term = regressors['reg_1'].predict(Xs)
    long_term = regressors['reg_20'].predict(Xs)

    # Blend weights run from 0 (pure 1-step forecast) to 1 (pure 20-step
    # forecast); the result has one row per ticker and one column per step.
    n_steps = 20
    weights = np.linspace(0, 1, n_steps)[None, :]
    paths = (1 - weights) * short_term[:, None] + weights * long_term[:, None]
    paths = np.round(np.clip(paths, -0.5, 0.5), 6)

    # Submission layout: ticker followed by p1..p20 (returns only).
    columns = {'ticker': latest_rows['ticker']}
    for step in range(n_steps):
        columns[f'p{step + 1}'] = paths[:, step]
    submission = pd.DataFrame(columns)

    submission.to_csv(output_path, index=False)
    print(f"[PREDICT] Сабмит сохранен: {output_path}")
    print(f"Размер сабмита: {submission.shape}")
    print("\nПервые 5 строк:")
    print(submission.head())
    return submission

# ===== NOTEBOOK ENTRY POINT =====
# Trains on `candles` (rows before the split date), predicts from `candles_2`
# (rows on/after the split date) and writes the submission file. Relies on the
# frames loaded by an earlier cell: `candles`, `candles_2` and, optionally,
# `news` / `news_2`.
if __name__ == "__main__":
    print("🚀 ЗАПУСК САБМИТА...")

    # Date separating the training period from the test period.
    split_date = '2024-09-08'
    # Single source of truth for the output file: used both for writing the
    # CSV and for the final report, so the two cannot drift apart.
    output_path = 'submission2.csv'

    # Parse timestamps in place so the date comparisons below work.
    candles['begin'] = pd.to_datetime(candles['begin'])
    candles_2['begin'] = pd.to_datetime(candles_2['begin'])

    # candles -> training rows (strictly before the split date)
    train_candles = candles[candles['begin'] < split_date].copy()
    # candles_2 -> test rows (on or after the split date)
    test_candles = candles_2[candles_2['begin'] >= split_date].copy()

    print(f"Train данных: {len(train_candles)} строк")
    print(f"Test данных: {len(test_candles)} строк")

    # Use the news tables only if both were loaded earlier in the session.
    if 'news' in locals() and 'news_2' in locals():
        train_news = news
        test_news = news_2
        print("Новости подключены")
    else:
        train_news = None
        test_news = None
        print("Новости не используются")

    # 1. Train the model
    print("\n" + "="*50)
    fit(train_candles, train_news, split_date=split_date)

    # 2. Build the submission
    print("\n" + "="*50)
    submission = predict(test_candles, test_news, output_path=output_path)

    print("\n САБМИТ ГОТОВ!")
    print(f"  Колонки: {list(submission.columns)}")
    print(f"  Всего колонок: {len(submission.columns)}")
    print(f"  Тикеров: {len(submission['ticker'].unique())}")
    print(f"  Файл: {output_path}")

🚀 ЗАПУСК САБМИТА...
Train данных: 20009 строк
Test данных: 1745 строк
Новости подключены

[FIT] Подготовка данных...
[FIT] Обучающих строк: 19629
[FIT] Горизонт 1 дней MAE: 0.014303
[FIT] Горизонт 20 дней MAE: 0.073874
[FIT] Модель сохранена: model.pkl

[PREDICT] Создание сабмита...
[PREDICT] Сабмит сохранен: submission3.csv
Размер сабмита: (19, 21)

Первые 5 строк:
  ticker        p1        p2        p3        p4        p5        p6  \
0   AFLT  0.002135  0.003174  0.004212  0.005251  0.006289  0.007327   
1   ALRS  0.002009  0.003081  0.004152  0.005223  0.006295  0.007366   
2   CHMF  0.001793  0.002912  0.004031  0.005149  0.006268  0.007387   
3   GAZP  0.001678  0.002761  0.003844  0.004927  0.006010  0.007093   
4   GMKN  0.002430  0.003413  0.004395  0.005378  0.006360  0.007342   

         p7        p8        p9  ...       p11       p12       p13       p14  \
0  0.008366  0.009404  0.010443  ...  0.012520  0.013558  0.014597  0.015635   
1  0.008438  0.009509  0.010580  ...  