In [22]:
import sys
import os
from mfm.MFM import MFM
import pandas as pd 
import numpy as np
from pymongo import MongoClient

MONGO_CONNECTION_STRING = "mongodb://localhost:27017/"
DB_NAME = "barra_financial_data"

client = MongoClient(MONGO_CONNECTION_STRING)
db = client[DB_NAME] # <--- 获取数据库连接对象

def load_collection_to_df(db, collection_name: str, query: dict, projection: dict) -> pd.DataFrame:
    """加载经过筛选和投影的集合数据。"""
    print(f"正在从 '{collection_name}' 加载数据...")
    collection = db[collection_name]
    cursor = collection.find(query, projection)
    df = pd.DataFrame(list(cursor))
    print(f"-> 成功加载 {len(df):,} 行数据。")
    return df

In [None]:
####导入数据

data = load_collection_to_df(db, collection_name='barra_factors', query={}, projection={'_id':0})
naidx = 1*np.sum(pd.isna(data), axis = 1)>0
data = data[~naidx]
data.index = range(len(data))


####行业数据
industry_info = load_collection_to_df(db, collection_name='sw_industry_info_for_factors', query={}, projection={'_id':0})
industry = np.array([1*(data.industry.values == x) for x in industry_info.code.values]).T
industry = pd.DataFrame(industry, columns = list(industry_info.industry_names.values))
data = pd.concat([data.iloc[:,:4], industry, data.iloc[:,5:]], axis = 1)


model = MFM(data, 21, 10)
(factor_ret, specific_ret, R2) = model.reg_by_time()
nw_cov_ls = model.Newey_West_by_time(q = 2, tao = 252)                 #Newey_West调整
er_cov_ls = model.eigen_risk_adj_by_time(M = 100, scale_coef = 1.4)    #特征风险调整
vr_cov_ls, lamb = model.vol_regime_adj_by_time(tao = 42) 

正在从 'barra_factors' 加载数据...
-> 成功加载 68,385 行数据。
正在从 'sw_industry_info_for_factors' 加载数据...
-> 成功加载 21 行数据。
Cross Section Regression, Date: 2021/01/20, 46 Stocks, 21 Industry Facotrs, 10 Style Facotrs

  factors_tran = factors @ R
  factors_tran = factors @ R
  factors_tran = factors @ R
  pure_factor_portfolio_weight = R @ np.linalg.pinv(factors_tran.T @ W @ factors_tran) @ factors_tran.T @ W  #纯因子组合权重
  pure_factor_portfolio_weight = R @ np.linalg.pinv(factors_tran.T @ W @ factors_tran) @ factors_tran.T @ W  #纯因子组合权重
  pure_factor_portfolio_weight = R @ np.linalg.pinv(factors_tran.T @ W @ factors_tran) @ factors_tran.T @ W  #纯因子组合权重
  return _core_matmul(x1, x2)
  return _core_matmul(x1, x2)
  return _core_matmul(x1, x2)
  factor_ret = pure_factor_portfolio_weight @ self.ret                        #纯因子收益
  factor_ret = pure_factor_portfolio_weight @ self.ret                        #纯因子收益
  factor_ret = pure_factor_portfolio_weight @ self.ret                        #纯因子收益
  pure_factor_portfolio_exposure = pure_factor_portfolio_weight @ factors     #纯因子组合在各个因子上的暴露
  pure_factor_portfolio_exposure = pure_factor_portfolio_weight @ factors     #纯因子组合在各个因子上的暴露
  pure_factor_portfolio_ex

Cross Section Regression, Date: 2025/10/21, 50 Stocks, 21 Industry Facotrs, 10 Style Facotrs


[=====                                             ] 11.26%   date: 2020-11-20

In [15]:
factor_ret

Unnamed: 0,country,石油石化,非银金融,机械设备,银行,房地产,通信,国防军工,医药生物,基础化工,...,size,beta,momentum,residual_volatility,non_linear_size,book_to_price_ratio,liquidity,earnings_yield,growth,leverage
2020-04-09,-0.000492,0.021185,0.004607,0.001525,-0.013276,-0.035050,0.001405,0.049694,0.011286,0.000657,...,6.521899,3.664725,-535.766434,-225.933456,0.036229,-14.965402,-1.280614,230.784289,-2.966364,0.000838
2020-04-10,-0.001265,0.034765,-0.008483,0.015074,-0.002494,0.016342,0.011498,0.024517,0.003446,-0.007701,...,4.655482,10.369161,1426.005614,-204.418991,0.021483,-19.391564,-6.979532,125.739208,-6.817489,-0.000182
2020-04-13,0.012485,-0.001474,-0.007876,-0.022671,0.005888,0.037682,-0.021839,-0.026694,-0.010430,0.008465,...,-0.962837,1.678472,-1494.834955,-230.927130,-0.004001,0.746510,4.050928,-81.507327,3.806116,-0.002427
2020-04-14,-0.005137,-0.003614,0.002369,-0.005601,-0.001631,-0.005944,-0.008283,0.006039,-0.000967,-0.000054,...,1.159778,-6.904150,-39.246956,-174.160223,-0.026624,4.781551,0.460942,-10.710761,3.680957,-0.000283
2020-04-15,0.000572,-0.002624,0.000145,-0.003500,-0.002872,0.044539,-0.021464,0.026645,-0.002864,-0.011408,...,0.485249,-3.216897,47.718132,-388.374462,-0.026454,12.179558,1.726411,-92.905411,4.066955,-0.002123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-10-15,0.012893,-0.006731,0.006832,-0.010510,0.007455,-0.029745,-0.006650,-0.002693,0.005821,-0.045929,...,4.175658,1.640256,-7100.908714,36.970795,-0.121360,-7.231287,-11.736255,-0.000449,5.902314,0.000280
2025-10-16,-0.011741,0.016168,-0.008785,-0.009457,0.002746,0.044907,0.017074,-0.009627,-0.012412,0.012751,...,0.303656,-7.541354,10278.014393,-19.103001,0.050274,0.682697,5.961892,0.001881,0.308520,-0.000748
2025-10-17,0.006141,0.011155,0.019497,-0.005371,-0.005273,-0.008507,-0.003671,0.000789,0.007701,-0.016726,...,4.730884,8.055530,-1749.259492,186.898850,-0.068600,-1.237477,-7.931162,-0.000591,2.933647,-0.000685
2025-10-20,0.010921,-0.003557,-0.002345,0.015290,-0.000908,0.006294,-0.008614,0.018853,0.012333,-0.000159,...,4.654477,13.051872,593.273461,0.835997,-0.025578,-3.099113,-5.932125,0.000657,3.291221,0.000350


In [16]:
vr_cov_ls[-1]

Unnamed: 0,country,石油石化,非银金融,机械设备,银行,房地产,通信,国防军工,医药生物,基础化工,...,size,beta,momentum,residual_volatility,non_linear_size,book_to_price_ratio,liquidity,earnings_yield,growth,leverage
country,6.433873e-05,6e-06,1.193158e-05,2e-06,2.241805e-06,-1.653503e-05,9e-06,-5e-06,-1.249939e-05,-6e-06,...,0.002598,0.051157,-2.396482,0.236268,3.7e-05,-0.003565,-0.005202,-0.002634,-0.001327,3.043315e-06
石油石化,5.571029e-06,0.000122,-2.968797e-05,-1.4e-05,3.448102e-06,3.487358e-05,-1.3e-05,-2.6e-05,-4.575355e-05,3e-06,...,-0.016305,0.009665,-0.1974546,-0.011269,-0.000192,-0.014503,0.011243,0.007468,0.007506,-1.673636e-05
非银金融,1.193158e-05,-3e-05,0.0001045508,1e-06,1.227354e-07,-3.138012e-05,1e-05,-1.8e-05,-3.32787e-06,-1.1e-05,...,-0.002772,-0.057608,-17.47642,0.505143,3.9e-05,-0.0295,-0.011693,-0.001199,-0.007895,8.841021e-06
机械设备,1.912557e-06,-1.4e-05,1.112073e-06,0.00011,-3.112245e-05,-5.259681e-05,2.7e-05,5.8e-05,1.196239e-05,2.7e-05,...,0.015767,-0.012981,-1.605337,0.115951,0.000174,0.036778,-0.002632,0.001775,0.007901,1.106812e-05
银行,2.241805e-06,3e-06,1.227354e-07,-3.1e-05,8.698761e-05,1.823468e-05,-3.1e-05,-8.4e-05,-9.262883e-05,-5.1e-05,...,-0.014951,-0.02333,8.571628,0.012432,-1.2e-05,-0.094479,0.009903,-0.002112,0.000588,6.199528e-06
房地产,-1.653503e-05,3.5e-05,-3.138012e-05,-5.3e-05,1.823468e-05,0.0006111164,-3e-05,-9.7e-05,-0.0001098683,-1.8e-05,...,-0.025717,0.049486,25.11386,0.148343,0.000107,-0.067945,-0.000787,-0.002974,-0.011662,-5.963024e-05
通信,9.49965e-06,-1.3e-05,9.953349e-06,2.7e-05,-3.092364e-05,-2.988803e-05,0.000164,4.7e-05,1.901013e-05,2e-06,...,0.02272,0.040529,-6.536877,-0.02328,0.000185,0.029452,-0.030661,0.003045,-0.007542,6.717908e-06
国防军工,-4.574889e-06,-2.6e-05,-1.819423e-05,5.8e-05,-8.368132e-05,-9.698319e-05,4.7e-05,0.000328,0.0001112504,4e-05,...,0.042758,0.069331,-4.745177,-0.551827,0.000157,0.119265,-0.012325,0.006206,-0.008223,1.071562e-05
医药生物,-1.249939e-05,-4.6e-05,-3.32787e-06,1.2e-05,-9.262883e-05,-0.0001098683,1.9e-05,0.000111,0.0003844977,4.9e-05,...,0.021968,0.039914,-13.91204,-0.517213,-0.00012,0.117111,-0.0127,0.002922,0.003831,-2.394794e-07
基础化工,-5.735431e-06,3e-06,-1.126373e-05,2.7e-05,-5.05896e-05,-1.849036e-05,2e-06,4e-05,4.900343e-05,0.000189,...,0.004467,0.020582,-4.81708,0.148951,-3.9e-05,0.059575,0.000567,0.00497,0.009476,1.091062e-06


In [19]:
R2

Unnamed: 0,R2
2020-04-09,0.919952
2020-04-10,0.749177
2020-04-13,0.826531
2020-04-14,0.869658
2020-04-15,0.841662
...,...
2025-10-15,0.857387
2025-10-16,0.826028
2025-10-17,0.586851
2025-10-20,0.781741


In [29]:
# --- 【新增功能】保存输出结果 ---
print("\n\n===================================保存计算结果===================================")


# 2. 保存因子收益率
factor_ret.to_csv('results/factor_returns.csv')
print(f"- 因子收益率已保存到: results/factor_returns.csv")

# 3. 保存R平方
R2.to_csv('results/r_squared.csv')
print(f"- 模型R平方已保存到: results/r_squared.csv")




# 4. 合并并保存个股特异性收益率
specific_ret_list = []
for i in range(len(specific_ret)):
    long_format_series = specific_ret[i].stack()
    # 4. 重置索引并重命名列，得到最终结果
    final_df = long_format_series.reset_index()
    final_df.columns = ['date', 'ts_code', 'specific_ret']
    specific_ret_list.append(final_df)

specific_ret_df = pd.concat(specific_ret_list)
specific_ret_df.to_csv('results/specific_returns.csv')
print(f"- 最终因子协方差矩阵已保存到: results/specific_returns.csv")
# 5. 保存最后一次的因子协方差矩阵
if vr_cov_ls:
    final_cov_matrix = vr_cov_ls[-1]
    final_cov_matrix.to_csv('results/final_vol_regime_adj_covariance.csv')
    print(f"- 最终因子协方差矩阵已保存到: results/final_vol_regime_adj_covariance.csv")

# 6. 保存波动调节乘数 Lambda
if lamb:
    lambda_series = pd.Series(lamb, index=model.sorted_dates, name='lambda')
    lambda_series.to_csv('results/volatility_multiplier_lambda.csv')
    print(f"- 波动调节乘数Lambda已保存到: results/volatility_multiplier_lambda.csv")

print("\n所有结果保存完毕。")



- 因子收益率已保存到: results/factor_returns.csv
- 模型R平方已保存到: results/r_squared.csv
- 最终因子协方差矩阵已保存到: results/specific_returns.csv
- 最终因子协方差矩阵已保存到: results/final_vol_regime_adj_covariance.csv
- 波动调节乘数Lambda已保存到: results/volatility_multiplier_lambda.csv

所有结果保存完毕。


In [30]:
specific_ret_df

Unnamed: 0,date,ts_code,specific_ret
0,2020-04-09,000001.SZ,0.006181
1,2020-04-09,000002.SZ,0.008578
2,2020-04-09,000063.SZ,-0.005907
3,2020-04-09,000100.SZ,0.012333
4,2020-04-09,000157.SZ,0.018766
...,...,...,...
283,2025-09-19,688271.SH,-0.019365
284,2025-09-19,688396.SH,-0.006911
285,2025-09-19,688472.SH,-0.034700
286,2025-09-19,688599.SH,-0.021031
