In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path

In [4]:
import os
import sys

# Put the project checkout on sys.path so the project-local imports in the
# following cell resolve. The location can be overridden with the
# RL_PORTFOLIO_ROOT environment variable; the default is the original path.
PROJECT_ROOT = os.environ.get(
    "RL_PORTFOLIO_ROOT",
    "/dsmlp/home-fs04/19/019/riling/scalable_rl_portfolio_management",
)
# Guard so that re-running this cell does not append duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


In [5]:
from tics.tic_config import tics_176, tics_grouped
from agent.data_downloader import short_name_sha256

In [6]:
# Ticker list imported from tics.tic_config.
tics = tics_176

# Training and test window boundaries as ISO date strings.
# The test window starts on the same date the training window ends.
train_start_date, train_end_date = '2009-01-01', '2020-07-01'
test_start_date, test_end_date = '2020-07-01', '2021-10-01'

In [7]:
# Full date span used in the data file names below:
# from the start of training to the end of testing.
start, end = train_start_date, test_end_date

In [7]:
def compute_portfolio_ohlcv_with_values(
    tics_df: pd.DataFrame,
    weights_df: pd.DataFrame,
    value_df: pd.DataFrame,
    name: str
) -> pd.DataFrame:
    """
    Build a daily OHLCV series for one portfolio from its constituents.

    For every date in ``weights_df`` except the first and the last:
      - open   = previous day's account value (``cap_prev``)
      - shares = previous day's stock weights * stock capital / today's open
      - high   = cash + sum(shares * today's high)
      - low    = cash + sum(shares * today's low)
      - close  = today's account value
      - volume = sum(shares * today's volume)
    where cash = ``cap_prev`` * previous day's first weight entry and
    stock capital = ``cap_prev`` - cash.

    Parameters
    ----------
    tics_df : pd.DataFrame
        Per-ticker market data with columns ``date``, ``tic``, ``open``,
        ``high``, ``low``, ``volume`` and ``day``.
    weights_df : pd.DataFrame
        Columns ``dates`` (or ``date``) and ``weights``. Each ``weights`` cell
        is a whitespace-separated string such as ``"[0.1 0.4 0.5]"`` whose
        first entry is the cash weight; the remaining entries are matched
        positionally to the tickers of a day sorted by ``tic``.
    value_df : pd.DataFrame
        Columns ``date`` and ``account_value``.
    name : str
        Identifier written to the ``tic`` column of the result.

    Returns
    -------
    pd.DataFrame
        Columns ``date, open, high, low, close, volume, tic, day``; one row
        per processed date. Dates whose previous account value is missing
        are skipped.

    Raises
    ------
    KeyError
        If a processed date from ``weights_df`` is absent from ``value_df``.
    ValueError
        If a processed date has no market data, or the number of stock
        weights differs from the number of tickers on that date.

    Notes
    -----
    The input frames are not modified.
    """
    out_columns = ['date', 'open', 'high', 'low', 'close', 'volume', 'tic', 'day']

    # Convert dates on copies: `assign` returns a new frame, so the caller's
    # DataFrames keep their original `date` columns.
    weights = weights_df.rename(columns={'dates': 'date'})
    weights = weights.assign(date=pd.to_datetime(weights['date']))
    market = tics_df.assign(date=pd.to_datetime(tics_df['date']))
    values = value_df.assign(date=pd.to_datetime(value_df['date']))

    # Account value per date plus the previous row's value (used as "open").
    val = (
        values[['date', 'account_value']]
        .drop_duplicates()
        .sort_values('date')
        .set_index('date')
    )
    val['cap_prev'] = val['account_value'].shift(1)

    # Parse "[w0 w1 ...]" into a float array.
    def parse_weights(s: str):
        return np.array([float(x) for x in s.strip('[]').split()])

    wdf = (
        weights
        .assign(w_arr=weights['weights'].apply(parse_weights))
        .sort_values('date')
        .reset_index(drop=True)
    )

    # Split the market data by date once (rows pre-sorted by ticker) instead
    # of re-filtering the whole frame on every loop iteration.
    market_by_date = {
        day_date: frame
        for day_date, frame in market.sort_values('tic').groupby('date')
    }

    records = []
    # Skip the first date (no previous weights) and the last date.
    for i in range(1, len(wdf) - 1):
        date = wdf.at[i, 'date']
        cap_prev = val.at[date, 'cap_prev']
        if pd.isna(cap_prev):
            continue

        # Cash/stock split uses the previous day's weights; entry 0 is cash.
        prev_w_all = wdf.at[i - 1, 'w_arr']
        cash_cap = cap_prev * prev_w_all[0]
        stock_cap = cap_prev - cash_cap

        day = market_by_date.get(date)
        if day is None:
            raise ValueError(f"no market data for {date} (portfolio {name!r})")

        weights_stock = prev_w_all[1:]
        if len(weights_stock) != len(day):
            # Without this check a length-1 side would silently broadcast.
            raise ValueError(
                f"{len(weights_stock)} stock weights but {len(day)} tickers "
                f"on {date} (portfolio {name!r})"
            )

        opens = day['open'].values
        highs = day['high'].values
        lows = day['low'].values
        vols = day['volume'].values

        # Shares bought at today's open.
        # NOTE(review): the stock weights are scaled by `stock_cap`, i.e. the
        # capital net of cash. If the weight vector sums to 1 *including* the
        # cash entry, the positions add up to less than `stock_cap` - confirm
        # that the stock weights are normalised among themselves.
        shares = (weights_stock * stock_cap) / opens

        records.append({
            'date':   date,
            'open':   cap_prev,
            'high':   cash_cap + shares.dot(highs),
            'low':    cash_cap + shares.dot(lows),
            'close':  val.at[date, 'account_value'],
            'volume': shares.dot(vols),
            'tic':    name,
            'day':    day['day'].iloc[0]
        })

    # Explicit columns keep the schema stable when no rows were produced.
    return pd.DataFrame(records, columns=out_columns)

In [8]:
# tics_data_file = Path("data") / 'sub' / f"{short_name_sha256('_'.join(tics_grouped[0]))}_{start}_{end}.csv"
# weights_data_file = Path("weights") / f"{short_name_sha256('_'.join(tics_grouped[0]))}_{start}_{end}.csv"

In [9]:
# tics_df = pd.read_csv(tics_data_file) 
# weights_df = pd.read_csv(weights_data_file)

In [10]:
# tics_df.head()

In [11]:
# weights_df.head()

In [12]:
# df = compute_weighted_df_simple(tics_df, weights_df)

In [13]:
# Build one portfolio-level OHLCV frame per ticker group.
dfs = []
tics = []  # flat list of every ticker seen across the groups
# Iterate over the groups themselves rather than a hard-coded count, so the
# cell stays correct if `tics_grouped` changes size.
for group in tics_grouped:
    tics.extend(group)
    name = short_name_sha256('_'.join(group))
    # The three inputs share one file name and differ only in their directory.
    file_name = f"{name}_{start}_{end}.csv"
    tics_data_file = Path("data") / 'sub' / file_name
    weights_data_file = Path("weights") / file_name
    # NOTE(review): directory name kept exactly as in the original code
    # ("acount_value") - presumably it matches the folder on disk; confirm
    # before renaming.
    value_data_file = Path("acount_value") / file_name
    tics_df = pd.read_csv(tics_data_file).drop_duplicates()
    weights_df = pd.read_csv(weights_data_file).drop_duplicates()
    value_df = pd.read_csv(value_data_file).drop_duplicates()
    dfs.append(compute_portfolio_ohlcv_with_values(tics_df, weights_df, value_df, name))

In [10]:
# Rebind `tics` to the short name of each ticker group, in group order,
# and display the resulting list.
tics = [short_name_sha256('_'.join(group)) for group in tics_grouped]
tics

['8190e4275b4db67d',
 '6439ac4bce64a477',
 '8d1de7fe38361930',
 '4fd6d6fada6e8ad6',
 'ab78604f0f30eea2',
 '15694139a6dae4a8']

In [11]:
# Short name of the current `tics` list joined with "_"; the same expression
# is used in the weighted_avg CSV file name in the cells below.
short_name_sha256('_'.join(tics))

'a677ab9f104974c6'

In [12]:
# Requires `dfs` from the per-group loop cell; the NameError recorded for this
# cell is what happens when that cell has not been run in the current kernel.
# The file name is derived from `tics`, which the loop cell rebinds to the flat
# ticker list - re-run the cell that rebuilds `tics` as group short names
# before this one so the name matches the one read back below.
weighted_avg_file = Path("data") / "weighted_avg" / f"{short_name_sha256('_'.join(tics))}_{start}_{end}.csv"
# Create the output directory on first use so to_csv does not fail when it is missing.
weighted_avg_file.parent.mkdir(parents=True, exist_ok=True)
pd.concat(dfs).to_csv(weighted_avg_file, index=False)

NameError: name 'dfs' is not defined

In [None]:
# Read the combined portfolio file back and preview the first six rows
# ordered by date, then by portfolio name.
weighted_avg_path = Path("data") / "weighted_avg" / f"{short_name_sha256('_'.join(tics))}_{start}_{end}.csv"
df = pd.read_csv(weighted_avg_path)
df.sort_values(['date', 'tic']).head(6)

Unnamed: 0,date,open,high,low,close,volume,tic,day
15790,2009-03-18,1055987.6,1091739.0,1007241.0,1084510.6,469848100000.0,15694139a6dae4a8,2
9474,2009-03-18,1045677.9,1059972.0,993802.5,1072509.8,735394800000.0,4fd6d6fada6e8ad6,2
3158,2009-03-18,1009651.94,1120186.0,956173.8,1094467.9,6891846000000.0,6439ac4bce64a477,2
0,2009-03-18,1037264.4,1043336.0,980812.9,1056196.0,727901700000.0,8190e4275b4db67d,2
6316,2009-03-18,1032019.44,1027429.0,976994.4,1042918.5,2336070000000.0,8d1de7fe38361930,2
12632,2009-03-18,1036860.25,1028091.0,976349.7,1060278.5,292477100000000.0,ab78604f0f30eea2,2
