In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path

In [4]:
import sys
sys.path.append("/dsmlp/home-fs04/19/019/riling/scalable_rl_portfolio_management")


In [5]:
from tics.tic_config import tics_176, tics_grouped
from agent.data_downloader import short_name_sha256

In [6]:
tics = tics_176
train_start_date= '2009-01-01'
train_end_date= '2020-07-01'
test_start_date= '2020-07-01'
test_end_date= '2021-10-01'

In [7]:
start = train_start_date
end = test_end_date

In [8]:
def compute_weighted_df_simple(tics_df: pd.DataFrame, weights_df: pd.DataFrame, name: str) -> pd.DataFrame:
    """
    1. 解析空格分隔的权重字符串（如 "[0.1 0.2 0.3]"），去掉第一个元素；
    2. 跳过 weights_df 的最后一行；
    3. 对剩余权重重新归一化（sum=1）；
    4. 按 tic 排序并计算加权平均，结果 'tic' 列统一为传入的 name。
    """
    records = []
    
    def parse_space_weights(s: str) -> np.ndarray:
        parts = s.strip('[]').split()
        arr = np.array([float(x) for x in parts])
        return arr[1:]
    
    # 解析并去掉首元素
    weights_df = weights_df.copy()
    weights_df['w_arr'] = weights_df['weights'].apply(parse_space_weights)
    
    # 跳过最后一行
    for _, row in weights_df.iloc[:-1].iterrows():
        date = row['dates']
        w = row['w_arr']
        
        # 归一化
        w = w / w.sum()
        
        df_day = tics_df[tics_df['date'] == date].sort_values('tic').reset_index(drop=True)
        if len(w) != len(df_day):
            raise ValueError(f"{date}: 权重长度 {len(w)} != 股票数量 {len(df_day)}")
        
        rec = {
            'date': date,
            'close':  np.dot(df_day['close'],   w),
            'high':   np.dot(df_day['high'],    w),
            'low':    np.dot(df_day['low'],     w),
            'open':   np.dot(df_day['open'],    w),
            'volume': np.dot(df_day['volume'],  w),
            'tic':    name,
            'day':    df_day['day'].iloc[0]
        }
        records.append(rec)
    
    return pd.DataFrame(records)


In [9]:
# tics_data_file = Path("data") / 'sub' / f"{short_name_sha256('_'.join(tics_grouped[0]))}_{start}_{end}.csv"
# weights_data_file = Path("weights") / f"{short_name_sha256('_'.join(tics_grouped[0]))}_{start}_{end}.csv"

In [10]:
# tics_df = pd.read_csv(tics_data_file) 
# weights_df = pd.read_csv(weights_data_file)

In [11]:
# tics_df.head()

In [21]:
# weights_df.head()

In [22]:
# df = compute_weighted_df_simple(tics_df, weights_df)

In [23]:
dfs = []
tics = []
for i in range(6):
    tics += tics_grouped[i]
    name = short_name_sha256('_'.join(tics_grouped[i]))
    tics_data_file = Path("data") / 'sub' / f"{name}_{start}_{end}.csv"
    weights_data_file = Path("weights") / f"{name}_{start}_{end}.csv"
    tics_df = pd.read_csv(tics_data_file) 
    weights_df = pd.read_csv(weights_data_file)
    dfs.append(compute_weighted_df_simple(tics_df, weights_df, name))

In [24]:
tics = []
for group in tics_grouped:
    tics.append(short_name_sha256('_'.join(tics)))
tics

['e3b0c44298fc1c14',
 '8e2d0c61b0acc423',
 '56b6ea2e42d34e3e',
 '1e11175f3bd186fb',
 'd725558425436f33',
 '51e648daa4892612']

In [25]:
pd.concat(dfs).to_csv(Path("data") / "weighted_avg" / f"{short_name_sha256('_'.join(tics))}_{start}_{end}.csv", index=False)

In [27]:
df = pd.read_csv(Path("data") / "weighted_avg" / f"{short_name_sha256('_'.join(tics))}_{start}_{end}.csv")

In [31]:
df.groupby('date').count().max().max()

6

In [30]:
df.sort_values('date').head()

Unnamed: 0,date,close,high,low,open,volume,tic,day
0,2009-03-17,17.285871,25.062188,23.804299,24.062443,8016490.0,8190e4275b4db67d,1
6318,2009-03-17,15.297194,20.241047,19.431726,19.692479,15129140.0,8d1de7fe38361930,1
12636,2009-03-17,15.60538,20.138504,19.313036,19.558396,97627340.0,ab78604f0f30eea2,1
9477,2009-03-17,14.238356,18.192834,17.25594,17.414609,7752128.0,4fd6d6fada6e8ad6,1
3159,2009-03-17,8.595474,9.79699,9.342103,9.578043,2426194.0,6439ac4bce64a477,1


In [12]:
short_name_sha256('_'.join(tics_grouped[5]))

'15694139a6dae4a8'