In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [27]:
# Directory that holds the competition CSV files. It is kept in one place so
# the notebook can be pointed at another machine's copy by editing a single line.
DATA_DIR = 'C:/Users/olafe/Downloads/forecast_data/forecast_data'

# Two candle files and two news files; the driver cell at the bottom of the
# notebook decides which ones are used for training and which for prediction.
candles = pd.read_csv(f'{DATA_DIR}/candles.csv')
candles_2 = pd.read_csv(f'{DATA_DIR}/candles_2.csv')
news = pd.read_csv(f'{DATA_DIR}/news.csv')
news_2 = pd.read_csv(f'{DATA_DIR}/news_2.csv')

In [28]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import pickle
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global random generator.
np.random.seed(42)

# Names of the feature columns fed to the scaler and the regressors, in the
# order they are stacked into the design matrix.
FEATS = [
    'momentum_5',
    'volatility_5',
    'price_range',
    'news_count',
]

def create_features(df):
    """Add per-ticker price features to a candles frame.

    The input is copied, ``begin`` is parsed to datetime and the rows are
    sorted by ``(ticker, begin)`` with a fresh RangeIndex.

    Added columns:
        momentum_5   -- 5-row percentage change of ``close`` within a ticker.
        volatility_5 -- rolling std (window 5, min_periods=1) of the 1-row
                        percentage change of ``close`` within a ticker.
        price_range  -- ``(high - low) / close``.

    Missing values in the three columns are replaced by 0.0. Infinite values
    (produced when a ``close`` used as a denominator is zero) are treated as
    missing as well, so the returned features are always finite.

    Parameters:
        df: DataFrame with columns ``ticker``, ``begin``, ``close``,
            ``high`` and ``low``.

    Returns:
        A new, sorted DataFrame with the feature columns appended.
    """
    out = df.copy()
    out['begin'] = pd.to_datetime(out['begin'])
    out = out.sort_values(['ticker', 'begin']).reset_index(drop=True)

    non_finite = [np.inf, -np.inf]
    closes = out.groupby('ticker')['close']

    out['momentum_5'] = closes.pct_change(5)

    # Drop infinities before the rolling window so that a single zero close
    # cannot contaminate the std of the neighbouring rows.
    daily_ret = closes.pct_change().replace(non_finite, np.nan)
    out['volatility_5'] = (
        daily_ret.groupby(out['ticker'])
        .rolling(5, min_periods=1)
        .std()
        # The rolling result is indexed by (ticker, row); drop the ticker
        # level so the assignment aligns on the row index.
        .reset_index(level=0, drop=True)
    )

    out['price_range'] = (out['high'] - out['low']) / out['close']

    # fillna alone leaves +/-inf in place; downstream scaling and regression
    # reject non-finite input, so map inf -> NaN -> 0.0.
    for col in ('momentum_5', 'volatility_5', 'price_range'):
        out[col] = out[col].replace(non_finite, np.nan).fillna(0.0)
    return out

def add_news_count(df, news):
    """Attach a ``news_count`` column: number of news items per calendar day.

    News rows are counted per normalized ``publish_date`` (all rows of the
    news frame, regardless of any other column) and joined onto the candles
    by the normalized ``begin`` date. Days without news get 0.0. When
    ``news`` is None or empty, every row gets 0.0.

    Parameters:
        df: candles frame whose ``begin`` column is already datetime.
        news: DataFrame with a ``publish_date`` column, or None.

    Returns:
        A new DataFrame; the inputs are not modified.
    """
    out = df.copy()

    # No news available: the feature is a constant zero.
    if news is None or len(news) == 0:
        out['news_count'] = 0.0
        return out

    articles = news.copy()
    articles['date'] = pd.to_datetime(articles['publish_date']).dt.normalize()
    per_day = articles.groupby('date').size().reset_index(name='news_count')

    # Temporary join key, removed again before returning.
    out['date'] = out['begin'].dt.normalize()
    merged = out.merge(per_day, on='date', how='left')
    merged['news_count'] = merged['news_count'].fillna(0.0)
    return merged.drop(columns=['date'])

def create_targets(df, horizons=(1,20)):
    """Add forward-return target columns, one per horizon.

    For each horizon ``h`` a column ``target_return_{h}d`` is added holding
    ``close[t + h] / close[t] - 1`` computed within each ticker, where
    ``t + h`` means "h rows later for the same ticker". The last ``h`` rows
    of every ticker have no future value and receive NaN.

    The look-ahead is taken with a per-ticker shift, so the result does not
    depend on how rows of different tickers are arranged relative to each
    other. Rows are expected to already be in chronological order within a
    ticker; this function does not sort.

    Parameters:
        df: DataFrame with ``ticker`` and ``close`` columns.
        horizons: iterable of positive integer look-ahead distances (rows).

    Returns:
        A copy of ``df`` with the target columns appended.
    """
    out = df.copy()
    closes = out.groupby('ticker')['close']
    for h in horizons:
        # Shift inside each ticker group: a plain Series.shift would move
        # values across ticker boundaries.
        future_close = closes.shift(-h)
        out[f'target_return_{h}d'] = future_close / out['close'] - 1.0
    return out

def fit(train_candles, train_news=None, split_date='2024-09-08', model_path='model.pkl'):
    """Train the return models and pickle them to ``model_path``.

    Candles with ``begin`` on or before ``split_date`` are turned into
    features (``create_features`` + ``add_news_count``) and targets
    (``create_targets``). Rows lacking either the 1-row or the 20-row target
    are dropped. The ``FEATS`` columns are standardized and one
    ``LinearRegression`` is fitted per horizon; the in-sample MAE of each is
    printed.

    The pickle holds a dict with keys ``'features'``, ``'scaler'`` and
    ``'models'`` (``'reg_1'`` and ``'reg_20'``). Nothing is returned.
    """
    print("[FIT] –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö...")

    horizons = (1, 20)
    target_cols = [f'target_return_{h}d' for h in horizons]

    # Keep only the history up to and including the split date.
    history = train_candles.copy()
    history['begin'] = pd.to_datetime(history['begin'])
    history = history[history['begin'] <= pd.to_datetime(split_date)].copy()

    frame = create_targets(add_news_count(create_features(history), train_news))

    # A row is usable only when every horizon has a known future return.
    incomplete = frame[target_cols].isna().any(axis=1)
    train_rows = frame.loc[~incomplete].reset_index(drop=True)
    print(f"[FIT] –û–±—É—á–∞—é—â–∏—Ö —Å—Ç—Ä–æ–∫: {len(train_rows)}")

    design = train_rows[FEATS].values
    scaler = StandardScaler().fit(design)
    Xs = scaler.transform(design)

    models = {}
    for h, col in zip(horizons, target_cols):
        y = train_rows[col].values
        reg = LinearRegression().fit(Xs, y)
        models[f'reg_{h}'] = reg
        # Error on the training rows themselves, not a held-out estimate.
        mae = float(np.abs(y - reg.predict(Xs)).mean())
        print(f"[FIT] –ì–æ—Ä–∏–∑–æ–Ω—Ç {h} –¥–Ω–µ–π MAE: {mae:.6f}")

    bundle = {'features': FEATS, 'scaler': scaler, 'models': models}
    with open(model_path, 'wb') as fh:
        pickle.dump(bundle, fh)
    print(f"[FIT] –ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞: {model_path}")

def predict(test_candles, test_news=None, model_path='model.pkl', output_path='submission2.csv'):
    """Predict returns for the latest row of each ticker and write a submission CSV.

    The pickled bundle written by ``fit`` is loaded from ``model_path``.
    Features are built for ``test_candles``; for every ticker the row with
    the greatest ``begin`` is scored by the ``reg_1`` and ``reg_20`` models.
    The 20 submission columns ``p1``..``p20`` are a linear blend between the
    two predictions (``p1`` is the ``reg_1`` value, ``p20`` the ``reg_20``
    value), clipped to [-0.5, 0.5] and rounded to 6 decimals.

    Returns:
        The submission DataFrame (``ticker`` plus ``p1``..``p20``), which is
        also written to ``output_path`` without the index.
    """
    print("[PREDICT] –°–æ–∑–¥–∞–Ω–∏–µ —Å–∞–±–º–∏—Ç–∞...")

    quotes = test_candles.copy()
    quotes['begin'] = pd.to_datetime(quotes['begin'])

    # CAUTION: unpickling runs arbitrary code; only load model files that
    # were produced by a trusted run of fit().
    with open(model_path, 'rb') as fh:
        bundle = pickle.load(fh)
    feats, scaler, models = bundle['features'], bundle['scaler'], bundle['models']

    enriched = add_news_count(create_features(quotes), test_news)

    # Keep the most recent row of every ticker.
    newest = enriched.groupby('ticker')['begin'].idxmax()
    latest = enriched.loc[newest].reset_index(drop=True)

    Xs = scaler.transform(latest[feats].values)
    short_term = models['reg_1'].predict(Xs)
    long_term = models['reg_20'].predict(Xs)

    # Blend across the 20 steps; rows are tickers, columns are steps.
    weights = np.linspace(0, 1, 20)[None, :]
    band = (1 - weights) * short_term[:, None] + weights * long_term[:, None]
    band = np.round(np.clip(band, -0.5, 0.5), 6)

    # Submission layout: ticker followed by p1..p20 (returns only).
    columns = {'ticker': latest['ticker']}
    for step in range(20):
        columns[f'p{step+1}'] = band[:, step]
    submission = pd.DataFrame(columns)

    submission.to_csv(output_path, index=False)
    print(f"[PREDICT] –°–∞–±–º–∏—Ç —Å–æ—Ö—Ä–∞–Ω–µ–Ω: {output_path}")
    print(f"–†–∞–∑–º–µ—Ä —Å–∞–±–º–∏—Ç–∞: {submission.shape}")
    print("\n–ü–µ—Ä–≤—ã–µ 5 —Å—Ç—Ä–æ–∫:")
    print(submission.head())
    return submission

# ===== NOTEBOOK ENTRY POINT =====
if __name__ == "__main__":
    # Run the full train -> predict pipeline on the frames loaded above.
    print("üöÄ –ó–ê–ü–£–°–ö –°–ê–ë–ú–ò–¢–ê...")

    # Parse the timestamps once so both frames can be filtered by date.
    candles['begin'] = pd.to_datetime(candles['begin'])
    candles_2['begin'] = pd.to_datetime(candles_2['begin'])

    # Date separating the training history from the test window.
    split_date = '2024-09-08'

    # Training rows: `candles` strictly before the split date.
    train_candles = candles.loc[candles['begin'] < split_date].copy()
    # Test rows: `candles_2` on or after the split date.
    test_candles = candles_2.loc[candles_2['begin'] >= split_date].copy()

    print(f"Train –¥–∞–Ω–Ω—ã—Ö: {len(train_candles)} —Å—Ç—Ä–æ–∫")
    print(f"Test –¥–∞–Ω–Ω—ã—Ö: {len(test_candles)} —Å—Ç—Ä–æ–∫")

    # Use the news frames only when both have been loaded in this session.
    if not ('news' in locals() and 'news_2' in locals()):
        train_news = test_news = None
        print("–ù–æ–≤–æ—Å—Ç–∏ –Ω–µ –∏—Å–ø–æ–ª—å–∑—É—é—Ç—Å—è")
    else:
        train_news, test_news = news, news_2
        print("–ù–æ–≤–æ—Å—Ç–∏ –ø–æ–¥–∫–ª—é—á–µ–Ω—ã")

    # Step 1: train the models and save them to the default model path.
    print("\n" + "=" * 50)
    fit(train_candles, train_news, split_date=split_date)

    # Step 2: build the submission from the saved models.
    print("\n" + "=" * 50)
    submission = predict(test_candles, test_news)

    # Short summary of what was produced.
    print("\n –°–ê–ë–ú–ò–¢ –ì–û–¢–û–í!")
    print(f"  –ö–æ–ª–æ–Ω–∫–∏: {submission.columns.tolist()}")
    print(f"  –í—Å–µ–≥–æ –∫–æ–ª–æ–Ω–æ–∫: {submission.shape[1]}")
    print(f"  –¢–∏–∫–µ—Ä–æ–≤: {len(pd.unique(submission['ticker']))}")
    print("  –§–∞–π–ª: submission2.csv")

üöÄ –ó–ê–ü–£–°–ö –°–ê–ë–ú–ò–¢–ê...
Train –¥–∞–Ω–Ω—ã—Ö: 20009 —Å—Ç—Ä–æ–∫
Test –¥–∞–Ω–Ω—ã—Ö: 1745 —Å—Ç—Ä–æ–∫
–ù–æ–≤–æ—Å—Ç–∏ –ø–æ–¥–∫–ª—é—á–µ–Ω—ã

[FIT] –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö...
[FIT] –û–±—É—á–∞—é—â–∏—Ö —Å—Ç—Ä–æ–∫: 19629
[FIT] –ì–æ—Ä–∏–∑–æ–Ω—Ç 1 –¥–Ω–µ–π MAE: 0.014303
[FIT] –ì–æ—Ä–∏–∑–æ–Ω—Ç 20 –¥–Ω–µ–π MAE: 0.073874
[FIT] –ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞: model.pkl

[PREDICT] –°–æ–∑–¥–∞–Ω–∏–µ —Å–∞–±–º–∏—Ç–∞...
[PREDICT] –°–∞–±–º–∏—Ç —Å–æ—Ö—Ä–∞–Ω–µ–Ω: submission3.csv
–†–∞–∑–º–µ—Ä —Å–∞–±–º–∏—Ç–∞: (19, 21)

–ü–µ—Ä–≤—ã–µ 5 —Å—Ç—Ä–æ–∫:
  ticker        p1        p2        p3        p4        p5        p6  \
0   AFLT  0.002135  0.003174  0.004212  0.005251  0.006289  0.007327   
1   ALRS  0.002009  0.003081  0.004152  0.005223  0.006295  0.007366   
2   CHMF  0.001793  0.002912  0.004031  0.005149  0.006268  0.007387   
3   GAZP  0.001678  0.002761  0.003844  0.004927  0.006010  0.007093   
4   GMKN  0.002430  0.003413  0.004395  0.005378  0.006360  0.007342   

         p7        p8