In [1]:
import os
import time
from functools import reduce
import numpy as np
import pandas as pd

**在载入数据时，提前filter好'2017-06-30' 至 '2018-12-31'间的数据，如果filter后的数据中有个股数据“完全”缺失，则完全剔除该个股**

In [2]:

def load_data():
    # 读取excel及h5数据，所有数据提前query所需范围
    file_path1 = os.path.join(os.path.abspath("作业相关数据.xlsx"))
    file_path2 = os.path.join(os.path.abspath( "eodprices.h5"))
    # 周度交易日历
    weekly_trade_cal = pd.read_excel(file_path1, sheet_name="作业二_交易日历", usecols="A", skiprows=3, index_col=0).query("'2017-06-30' <= Date <= '2018-12-31'")
    weekly_trade_cal.index.name = "trading_day"
    # 指数数据
    index_data = pd.read_excel(file_path1, sheet_name="作业二_指数数据", skiprows=3, index_col=0, usecols="A, B, C").query("'2017-06-30' <= Date <= '2018-12-31'").copy()
    # 股票数据
    stock_data = pd.read_hdf(file_path2)
    stock_data.rename(columns={"S_INFO_WINDCODE": "symbol", "TRADE_DT": "trading_day", "S_DQ_ADJCLOSE": "close"}, inplace=True)
    # 利用pivot_table整合数据
    stock_data = stock_data.pivot_table(index="trading_day", columns="symbol", values="close")
    stock_data.index = pd.to_datetime(stock_data.index)
    stock_data = stock_data.query("'2017-06-30' <= trading_day <= '2018-12-31'").copy()
    # 如果'2017-06-30' 至 '2018-12-31'间，该个股数据完全缺失，则剔除
    stock_data.dropna(how="all", axis=1, inplace=True)

    return stock_data, index_data, weekly_trade_cal

**根据report_date进行filter，如果filter数据后某些个股数据存在任何缺失（未上市），则将其剔除**

**波动率计算时，假定一年有50周交易**

**动量计算时，假定一个月有21个交易日**

In [3]:
def calculation(stock_data, index_data, weekly_trade_cal, report_date):
    start_time = time.time()

    # 提取个股列表中往前一年的数据
    part_stock_data = stock_data.query(f"'{pd.Timestamp(report_date) - pd.DateOffset(years=1)}' <= trading_day <= '{report_date}'").copy()
    part_index_data = index_data.loc[part_stock_data.index]
    # 剔除提取后数据不完整的股票.
    part_stock_data.dropna(axis=1, inplace=True)
    part_stock_data_weekly = part_stock_data.merge(weekly_trade_cal, on='trading_day')
    # 假定一年有50周交易，计算std（波动率因子）
    stddev = (part_stock_data_weekly.pct_change().std() * np.sqrt(50)).to_frame("Stddev")
    stddev.index.name = "symbol"

    # 计算动量因子
    mom1 = part_stock_data.pct_change(1 * 21).iloc[-1].to_frame("Mom1")
    mom2 = part_stock_data.pct_change(3 * 21).iloc[-1].to_frame("Mom2")
    mom3 = part_stock_data.pct_change(6 * 21).iloc[-1].to_frame("Mom3")

    # 计算Beta
    beta = part_stock_data.pct_change().dropna().merge(part_index_data.pct_change().dropna(), on="trading_day").corr().loc[stddev.index, index_data.columns]
    beta.rename(columns={"000300.SH": "Beta1", "000905.SH": "Beta2"}, inplace=True)
    beta.index.name = "symbol"

    #将所有大类因子整合
    results = reduce(lambda x, y: pd.merge(x, y, on="symbol"), [stddev, mom1, mom2, mom3, beta])

    end_time = time.time()
    print(f"calculation time cost: {end_time - start_time:.2f}s")

    return results

In [4]:
if __name__ == '__main__':
    stock_data, index_data, weekly_trade_cal = load_data()
    
    results1 = calculation(stock_data, index_data, weekly_trade_cal, "2018-06-30")
    results2 = calculation(stock_data, index_data, weekly_trade_cal, "2018-12-31")
    
    #转成excel
    results1.to_excel("作业二20180630.xlsx")
    results2.to_excel("作业二20181231.xlsx")

calculation time cost: 4.59s
calculation time cost: 5.10s


**整体运行预计在15秒以内**