Загрузка данных

In [7]:
from alpaca.data.timeframe import TimeFrame
from alpaca.data.historical import CryptoHistoricalDataClient
from alpaca.data.requests import CryptoBarsRequest
import datetime

import os

# Alpaca credentials are read from the environment so that no secret is stored
# in the notebook; each name is None when its variable is unset.
# SECURITY: the key pair that was previously hard-coded in this cell has been
# exposed and should be revoked and regenerated.
API_KEY = os.environ.get("ALPACA_API_KEY")
SECRET_KEY = os.environ.get("ALPACA_SECRET_KEY")

# The client is built without credentials, exactly as before: this cell has
# never passed API_KEY / SECRET_KEY to it.
client = CryptoHistoricalDataClient()

# Daily bars for ten crypto pairs between 2020-07-01 and 2025-07-01.
# NOTE(review): start/end are naive datetimes — confirm which timezone the API assumes.
request_params = CryptoBarsRequest(
  symbol_or_symbols=["BTC/USD", 'ETH/USD', 'USDT/USD', 'XRP/USD', 'SOL/USD', 'USDC/USD', 'DOGE/USD', 'BCH/USD', 'LINK/USD', 'AVAX/USD'],
  timeframe=TimeFrame.Day,
  start=datetime.datetime(2020, 7, 1),
  end=datetime.datetime(2025, 7, 1)
)

# Despite its name, btc_bars holds the response for every requested symbol.
btc_bars = client.get_crypto_bars(request_params)

# Bars as a DataFrame; later cells read the 'symbol' and 'timestamp' index levels from it.
df = btc_bars.df

Добавление признаков сезона и дня недели

In [8]:
import pandas as pd

# One timestamp per row, taken from the 'timestamp' level of df's index.
ts = pd.to_datetime(df.index.get_level_values('timestamp'))

# Boolean is_<Weekday> indicator columns. The reindex pins the Monday..Sunday
# column order and adds an all-False column for any weekday absent from the data.
weekday_columns = [
    'is_' + day_name
    for day_name in ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday')
]
dow_dummies = pd.get_dummies(ts.day_name(), prefix='is', dtype=bool)
dow_dummies = dow_dummies.reindex(columns=weekday_columns, fill_value=False)

def season(month):
    """Return the season name for a month number.

    12, 1, 2 -> 'winter'; 3-5 -> 'spring'; 6-8 -> 'summer';
    any other value -> 'autumn'.
    """
    season_months = (
        ('winter', (12, 1, 2)),
        ('spring', (3, 4, 5)),
        ('summer', (6, 7, 8)),
    )
    for name, months in season_months:
        if month in months:
            return name
    # Everything not listed above falls through to autumn.
    return 'autumn'

# Boolean is_<season> indicator columns in a fixed winter..autumn order;
# a season that never occurs in the data still gets an all-False column.
season_labels = list(map(season, ts.month))
season_dummies = pd.get_dummies(season_labels, prefix='is', dtype=bool)
season_dummies = season_dummies.reindex(
    columns=['is_winter', 'is_spring', 'is_summer', 'is_autumn'],
    fill_value=False,
)

# Both indicator frames were built positionally from df's rows, so give them
# df's index before joining on it.
for indicator_frame in (dow_dummies, season_dummies):
    indicator_frame.index = df.index

df = df.join(dow_dummies)
df = df.join(season_dummies)

Добавление индекса страха и жадности

In [9]:
import requests

def get_fear_greed_data():
    """Download the Fear & Greed index from alternative.me (requested with limit=0).

    Returns:
        pandas.DataFrame with two columns: 'timestamp' (datetime.date values
        obtained by reading the API timestamps as epoch seconds) and
        'fear&greed' (the 'value' field cast to int).

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status.
        KeyError: if the JSON payload has no 'data' entry.
    """
    url = "https://api.alternative.me/fng/?limit=0&format=json"
    # Bound the wait so a stalled connection cannot hang the notebook forever.
    response = requests.get(url, timeout=30)
    # Surface HTTP errors here instead of as a confusing JSON/KeyError below.
    response.raise_for_status()
    records = response.json()['data']
    fng = pd.DataFrame(records)
    # Timestamps are cast to int, read as epoch seconds, then reduced to dates.
    fng['timestamp'] = pd.to_datetime(fng['timestamp'].astype(int), unit='s').dt.date
    fng['fear&greed'] = fng['value'].astype(int)
    return fng[['timestamp', 'fear&greed']]

fng_df = get_fear_greed_data()

# Flatten the (symbol, timestamp) index so the bar timestamp becomes a column,
# and derive a plain calendar date from it to use as the merge key.
df = df.reset_index()
df['date'] = df['timestamp'].dt.date

# Left-merge keeps every bar. Both frames have a 'timestamp' column, so pandas
# suffixes them _x (bars) and _y (index data); keep only the bar timestamp
# and restore the original two-level index.
df = (
    df.merge(fng_df, how='left', left_on='date', right_on='timestamp')
      .drop(columns=['timestamp_y', 'date'])
      .rename(columns={'timestamp_x': 'timestamp'})
      .set_index(['symbol', 'timestamp'])
)

Подготовка к предобработке

In [10]:
import numpy as np

# Order rows by symbol, then chronologically within each symbol.
df = df.sort_index(level=["symbol", "timestamp"])

# Split boundaries (UTC midnight): rows up to TRAIN_END are used for training,
# rows after it up to VAL_END for validation, and everything later for testing.
TRAIN_END = pd.Timestamp(year=2024, month=12, day=31, tz="UTC")
VAL_END = pd.Timestamp(year=2025, month=4, day=30, tz="UTC")

# Numeric columns that are log-transformed and standardised further down.
FEATURE_COLS = [
    "open",
    "high",
    "low",
    "close",
    "volume",
    "trade_count",
    "vwap",
    "fear&greed",
]

Заполнение пропуска

In [11]:
# Fill any missing fear&greed value on 2024-10-26 with 50.
date_to_fix = pd.Timestamp('2024-10-26', tz='UTC')
ts_index = df.index.get_level_values('timestamp').normalize()
mask_date = ts_index == date_to_fix
# Restrict the assignment to rows on that date whose value is actually NaN.
missing_on_date = mask_date & df['fear&greed'].isna().to_numpy()
df.loc[missing_on_date, 'fear&greed'] = 50
# Cell result: True if any NaN is left anywhere in df.
df.isna().any().any()


False

Замена нулей скользящим средним

In [14]:
# Show every row in which at least one numeric column is exactly zero.
num_cols = df.select_dtypes(include=['number']).columns
zero_mask = df[num_cols].eq(0).any(axis=1)
df_zeros = df.loc[zero_mask]
df_zeros

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume,trade_count,vwap,is_Monday,is_Tuesday,is_Wednesday,is_Thursday,is_Friday,is_Saturday,is_Sunday,is_winter,is_spring,is_summer,is_autumn,fear&greed
symbol,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1


In [13]:
# Zeros in these columns are treated as missing values and imputed per symbol.
# NOTE(review): the centred window also reads the *next* row, i.e. a future
# value relative to the row being filled — confirm this look-ahead is acceptable.
for col in ['volume', 'trade_count', 'vwap']:
    df[col] = df[col].replace(0, np.nan)
    df[col] = df.groupby(level='symbol')[col].transform(
        lambda x: (
            # First choice: mean of the non-missing values in the centred
            # 3-row window (previous, current, next row).
            x.fillna(x.rolling(window=3, min_periods=1, center=True).mean())
            # Fallback: inside a run of three or more missing rows the window
            # is entirely NaN, so the mean is NaN too. Carry the nearest value
            # of the same symbol forward, then backward, so no NaN reaches
            # the log transform and the scalers.
            .ffill()
            .bfill()
        )
    )


Обрабатываем выбросы логарифмированием

In [15]:
# Replace every feature column with its natural logarithm, column by column.
df[FEATURE_COLS] = df[FEATURE_COLS].apply(np.log)

Делим выборку и масштабируем количественные признаки

In [17]:
from sklearn.preprocessing import StandardScaler

idx_ts = df.index.get_level_values("timestamp")

# Chronological split: train <= TRAIN_END < val <= VAL_END < test.
in_train = idx_ts <= TRAIN_END
in_val = (idx_ts > TRAIN_END) & (idx_ts <= VAL_END)
in_test = idx_ts > VAL_END

train = df[in_train]
val   = df[in_val]
test  = df[in_test]

feature_scalers = {}
train_scaled_list = []
val_scaled_list   = []
test_scaled_list  = []

# One scaler per symbol, fitted on that symbol's training rows only and then
# applied unchanged to the symbol's rows in all three splits.
for sym, g_train in train.groupby(level="symbol"):
    sc_feat = StandardScaler().fit(g_train[FEATURE_COLS])
    feature_scalers[sym] = sc_feat

    def scale_split(split_df):
        """Return a scaled copy of the current symbol's rows from split_df."""
        # drop_level=False keeps the (symbol, timestamp) index intact.
        rows = split_df.xs(sym, level="symbol", drop_level=False).copy()
        rows[FEATURE_COLS] = sc_feat.transform(rows[FEATURE_COLS])
        return rows

    split_targets = (
        (train, train_scaled_list),
        (val, val_scaled_list),
        (test, test_scaled_list),
    )
    for split_df, scaled_parts in split_targets:
        scaled_parts.append(scale_split(split_df))

# Stitch the per-symbol pieces back together in index order.
train_scaled = pd.concat(train_scaled_list).sort_index()
val_scaled   = pd.concat(val_scaled_list).sort_index()
test_scaled  = pd.concat(test_scaled_list).sort_index()
