In [14]:
import os
import pandas as pd

# Folder locations: the pool that is scanned for CSV files, and the test folder
source_dir = "/home/jesse/Projects/Self_Learning/RL_Testing/Pool"
test_dir = "/home/jesse/Projects/Self_Learning/RL_Testing/Test"

# Keep only the directory entries whose name ends in ".csv".
# Note: these are bare file names, not full paths.
csv_files = list(filter(lambda name: name.endswith(".csv"), os.listdir(source_dir)))

# Peek at the first few names (debugging aid)
csv_files[:5]


['AMIX.csv', 'TOI.csv', 'BSVN.csv', 'COMM.csv', 'SATL.csv']

In [16]:
import os
import pandas as pd

import shutil

# Source pool of per-ticker CSV files and the folder that receives the selected ones
source_dir = "/home/jesse/Projects/Self_Learning/RL_Testing/Pool"
target_dir = "/home/jesse/Projects/Self_Learning/RL_Testing/Test"

# Window that every selected ticker must cover
start_date = pd.to_datetime("2014-01-02")
end_date = pd.to_datetime("2024-12-09")

valid_tickers = []

# All CSV file names found in the pool folder
csv_files = [f for f in os.listdir(source_dir) if f.endswith(".csv")]

for file in csv_files:
    file_path = os.path.join(source_dir, file)

    # Only the dates decide whether a ticker qualifies, so read just that column.
    # Blank dates parse to NaT and are dropped instead of breaking the check.
    dates = pd.to_datetime(pd.read_csv(file_path, usecols=["date"])["date"]).dropna()
    if dates.empty:
        # A file without any usable date cannot cover the window.
        continue

    # Business-day calendar spanning the ticker's first and last recorded dates.
    # Days missing in between are tolerated (they would simply be forward-filled),
    # so the ticker qualifies when both window endpoints fall on this calendar.
    business_days = pd.date_range(dates.min(), dates.max(), freq="B")

    if start_date in business_days and end_date in business_days:
        valid_tickers.append(file)

# Report how many tickers cover the whole window
print(f"符合条件的股票数量: {len(valid_tickers)}")

# Copy the qualifying files into the target folder, creating it first so the
# copy does not fail on a fresh checkout where the folder does not exist yet.
os.makedirs(target_dir, exist_ok=True)

for file in valid_tickers:
    shutil.copy(os.path.join(source_dir, file), os.path.join(target_dir, file))

print(f"已复制 {len(valid_tickers)} 只股票到 {target_dir}")


符合条件的股票数量: 1377
已复制 1377 只股票到 /home/jesse/Projects/Self_Learning/RL_Testing/Test


In [17]:
import shutil

# Create the destination folder if it is not there yet
os.makedirs(test_dir, exist_ok=True)

# Copy each qualifying ticker file from the pool into the test folder
for file in valid_tickers:
    src_path = os.path.join(source_dir, file)
    dst_path = os.path.join(test_dir, file)
    shutil.copy(src_path, dst_path)

print(f"已复制 {len(valid_tickers)} 只符合条件的股票文件到 {test_dir}")


已复制 1377 只符合条件的股票文件到 /home/jesse/Projects/Self_Learning/RL_Testing/Test


In [18]:
# Build one wide table of close prices: a `date` column plus one column per ticker.
# Each ticker contributes a date-indexed Series; they are aligned in a single
# concat at the end instead of growing a DataFrame with one outer merge per file,
# which gets slower with every ticker added.
close_series = []

for file in valid_tickers:
    file_path = os.path.join(test_dir, file)

    # Only these two columns are needed for the merged table
    df = pd.read_csv(file_path, usecols=["date", "close"])

    # Make sure `date` is a real datetime column before comparing against the window
    df["date"] = pd.to_datetime(df["date"])

    # Restrict to the [start_date, end_date] window (rows with a missing date drop out)
    in_window = (df["date"] >= start_date) & (df["date"] <= end_date)

    # A date repeated within one file would make the alignment ambiguous;
    # keep the last row recorded for that date.
    df_filtered = df.loc[in_window].drop_duplicates(subset="date", keep="last")

    # The ticker symbol is the file name without its extension
    ticker = os.path.splitext(file)[0]
    close_series.append(df_filtered.set_index("date")["close"].rename(ticker))

if not close_series:
    # Fail with a clear message rather than an obscure error further down.
    raise ValueError("valid_tickers 为空，没有可合并的 close 价格数据")

# Outer-align all tickers on date, order chronologically, and expose `date` as a column
close_prices = (
    pd.concat(close_series, axis=1)
    .rename_axis("date")
    .sort_index()
    .reset_index()
)

# Persist the merged close-price table
output_file = "/home/jesse/Projects/Self_Learning/RL_Testing/close_prices.csv"
close_prices.to_csv(output_file, index=False)

print(f"合并后的 close 价格数据已保存至 {output_file}")


合并后的 close 价格数据已保存至 /home/jesse/Projects/Self_Learning/RL_Testing/close_prices.csv


In [19]:
# Load the merged CSV back from disk and show its first five rows as a sanity check
df_result = pd.read_csv(filepath_or_buffer=output_file)
df_result.head(n=5)


Unnamed: 0,date,COMM,CPIX,QUBT,GILD,ARCB,AAPL,ANGO,INSG,SPOK,...,RCKY,GOOD,CHKP,APYX,SASR,STRR,LTRX,SFNC,NECB,VISL
0,2014-01-02,18.77,5.14,1.2,58.0685,31.1719,17.4159,17.34,23.1,8.84119,...,11.6472,9.03914,64.38,2.08,20.6606,26.2,1.579,14.5731,5.43,2246400.0
1,2014-01-03,19.19,5.08,1.2,57.3808,31.1719,17.032,17.33,23.7,8.92349,...,11.7269,9.13959,64.29,2.07,20.867,27.018,1.55,14.5965,5.29,2433600.0
2,2014-01-06,19.3,5.11,1.4,56.5479,30.9508,17.1258,17.11,23.6,8.88897,...,12.0893,9.14527,63.92,2.1,20.5117,26.051,1.6,14.5478,5.33,2332800.0
3,2014-01-07,19.03,5.12,1.6,56.1925,31.2356,17.0031,17.34,25.1,8.8642,...,12.1217,9.21823,63.94,2.04,20.7541,25.827,1.6,14.5965,5.42,2275200.0
4,2014-01-08,18.99,5.1,1.2,56.7174,30.7209,17.1118,17.4,24.1,8.76775,...,12.1779,9.18411,64.57,2.16,20.6966,27.241,1.67,14.6256,5.45,2232000.0
