In [1]:
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from sklearn.feature_selection import RFECV

class CustomRegressor(BaseEstimator, RegressorMixin):
    """AdaBoost-over-decision-tree regressor tuned with Bayesian search.

    In ``'train'`` mode ``fit`` runs this pipeline:

    1. Bayesian search for the best decision tree on all features.
    2. Bayesian search for the best AdaBoost ensemble built on that tree.
    3. When ``X`` has at least two columns: RFECV feature selection driven by
       the ensemble from step 2, then steps 1-2 again on the selected
       features, using the shallower ``dt_params_dist_2`` tree space.

    In ``'test'`` mode nothing is trained; ``fit`` only stores the already
    fitted model handed in through ``ada_best``.

    Parameters
    ----------
    mode : str
        ``'train'`` or ``'test'``.
    random_state : int
        Seed forwarded to the tree, the ensemble and the search object.
    index : index-like
        Row labels assigned to the frame returned by ``predict``.
    dt_params_dist_1, ada_params_dist, dt_params_dist_2 : dict, optional
        Search spaces for the first-stage tree, the AdaBoost ensemble and the
        second-stage tree. ``None`` selects the built-in defaults.
    bayes_search_param : dict, optional
        Extra ``BayesSearchCV`` parameters. ``None`` selects the defaults.
    RFECV_param : dict, optional
        Extra ``RFECV`` parameters. ``None`` selects the defaults. An
        ``'estimator'`` entry of ``None`` is replaced by a placeholder that
        ``fit`` overrides with the first-stage ensemble.
    """

    @staticmethod
    def _default_params():
        """Return a fresh copy of the default search spaces and settings."""
        return {
            'dt_params_dist_1': {
                'criterion': ['friedman_mse', 'absolute_error'],
                'max_depth': Integer(2, 10),
                'min_samples_split': Integer(2, 20),
                'max_features': Categorical(['sqrt', 'log2', None]),
            },
            'ada_params_dist': {
                'loss': ['linear', 'square'],
                'n_estimators': Integer(10, 100),
                'learning_rate': Real(1e-2, 1e0, prior='log-uniform'),
            },
            'dt_params_dist_2': {
                'criterion': ['friedman_mse', 'absolute_error'],
                # Used to distil the second-stage ensemble, so kept shallower.
                'max_depth': Integer(2, 5),
                'min_samples_split': Integer(2, 20),
                'max_features': Categorical(['sqrt', 'log2', None]),
            },
            'bayes_search_param': {
                'n_iter': 20,
                # NOTE(review): the original note credits a larger
                # n_initial_points with reducing the "The objective has been
                # evaluated at this point before" warning, but the key set
                # here is n_points - confirm which one was intended.
                'n_points': 5,
                'cv': 5,
                'scoring': 'neg_mean_squared_error',
                'n_jobs': -1,
                'verbose': 1,
            },
            'RFECV_param': {
                'estimator': None,
                'step': 0.05,
                'cv': 5,
                'scoring': 'neg_mean_squared_error',
                'min_features_to_select': 10,
            },
        }

    def __init__(self, mode, random_state, index,
                 dt_params_dist_1=None,
                 ada_params_dist=None,
                 dt_params_dist_2=None,
                 bayes_search_param=None,
                 RFECV_param=None):
        # Defaults are built per instance instead of living in the signature,
        # so no mutable dict is shared between instances.
        defaults = self._default_params()

        self.mode = mode
        self.random_state = random_state
        self.index = index

        # Stored in every mode: BaseEstimator.get_params()/clone()/repr read
        # one attribute per constructor parameter.
        self.dt_params_dist_1 = defaults['dt_params_dist_1'] if dt_params_dist_1 is None else dt_params_dist_1
        self.ada_params_dist = defaults['ada_params_dist'] if ada_params_dist is None else ada_params_dist
        self.dt_params_dist_2 = defaults['dt_params_dist_2'] if dt_params_dist_2 is None else dt_params_dist_2
        self.bayes_search_param = defaults['bayes_search_param'] if bayes_search_param is None else bayes_search_param
        self.RFECV_param = defaults['RFECV_param'] if RFECV_param is None else RFECV_param

        # Only 'train' mode needs the template estimators; 'test' mode receives
        # an already fitted model in fit().
        if self.mode == 'train':
            self.dt = DecisionTreeRegressor(random_state=self.random_state)
            self.ada = AdaBoostRegressor(estimator=None, random_state=self.random_state)

            # estimator/search_spaces are placeholders that fit() overrides on
            # a clone. set_params takes keyword arguments, so the dict has to
            # be unpacked.
            self.bayes_search = BayesSearchCV(
                estimator=self.dt,
                search_spaces=self.dt_params_dist_1,
                random_state=self.random_state
                ).set_params(**self.bayes_search_param)

            # RFECV cannot be constructed without an estimator. Compare with
            # `is None` rather than truthiness: ensembles define __len__, which
            # fails on an unfitted model.
            rfecv_params = {key: value for key, value in self.RFECV_param.items() if key != 'estimator'}
            rfecv_estimator = self.RFECV_param.get('estimator')
            if rfecv_estimator is None:
                rfecv_estimator = self.ada
            self.rfecv = RFECV(estimator=rfecv_estimator).set_params(**rfecv_params)

    def fit(self, X, y, ada_best=None):
        """Train the model ('train' mode) or adopt a fitted one ('test' mode).

        Parameters
        ----------
        X : array-like with a ``shape`` attribute, (n_samples, n_features)
        y : array-like
            Targets. A pandas object is converted to NumPy and a single-column
            2-D target is flattened to 1-D.
        ada_best : fitted estimator, optional
            Required in 'test' mode; ignored in 'train' mode.

        Returns
        -------
        self
            Returned so calls can be chained.

        Raises
        ------
        ValueError
            In 'test' mode when ``ada_best`` is None.
        """
        if isinstance(y, (pd.DataFrame, pd.Series)):
            y = y.to_numpy()
        # Flatten (m, 1) targets to (m,), whichever container they came in.
        if getattr(y, 'ndim', 1) == 2 and y.shape[1] == 1:
            y = y.ravel()

        if self.mode == 'train':
            # Every template is cloned before fitting so the objects created
            # in __init__ are never fitted themselves.
            # Stage 1a: best decision tree on all features.
            dt_1_bayes_search = clone(self.bayes_search).set_params(
                estimator=clone(self.dt), search_spaces=self.dt_params_dist_1)
            dt_1_bayes_search.fit(X, y)
            dt_1_best = dt_1_bayes_search.best_estimator_

            # Stage 1b: best AdaBoost ensemble on top of that tree.
            ada_1 = clone(self.ada).set_params(estimator=dt_1_best)
            ada_1_bayes_search = clone(self.bayes_search).set_params(
                estimator=ada_1, search_spaces=self.ada_params_dist)
            ada_1_bayes_search.fit(X, y)
            self.ada_best = ada_1_bayes_search.best_estimator_

            if X.shape[1] >= 2:
                from sklearn.pipeline import Pipeline

                # Stage 2: feature selection driven by the stage-1 ensemble.
                rfecv = clone(self.rfecv).set_params(estimator=self.ada_best)
                rfecv.fit(X, y)
                X_selected = rfecv.transform(X)

                # Stage 2a: best (shallower) tree on the selected features.
                dt_2_bayes_search = clone(self.bayes_search).set_params(
                    estimator=clone(self.dt), search_spaces=self.dt_params_dist_2)
                dt_2_bayes_search.fit(X_selected, y)
                dt_2_best = dt_2_bayes_search.best_estimator_

                # Stage 2b: best AdaBoost ensemble on the selected features.
                ada_2 = clone(self.ada).set_params(estimator=dt_2_best)
                ada_2_bayes_search = clone(self.bayes_search).set_params(
                    estimator=ada_2, search_spaces=self.ada_params_dist)
                ada_2_bayes_search.fit(X_selected, y)

                # The stage-2 ensemble only accepts the selected columns, but
                # predict() is handed the full feature matrix. Bundle the
                # fitted selector with the model so the stored object applies
                # the selection itself, also after being pickled and reloaded
                # for 'test' mode.
                self.ada_best = Pipeline([
                    ('select', rfecv),
                    ('ada', ada_2_bayes_search.best_estimator_),
                ])

        elif self.mode == 'test':
            if ada_best is None:
                raise ValueError("mode='test' requires a fitted model passed as ada_best")
            self.ada_best = ada_best

        return self

    def predict(self, X):
        """Predict with the stored model; return a frame indexed by ``self.index``."""
        y_pred = pd.DataFrame(self.ada_best.predict(X))
        y_pred.index = self.index

        return y_pred

    # Analogue of a transformer's fit_transform.
    def fit_predict(self, X, y, ada_best=None):
        """Fit, predict on the same ``X`` and return ``(y_pred, fitted_model)``."""
        self.fit(X, y, ada_best)
        y_pred = self.predict(X)

        return y_pred, self.ada_best

In [2]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import xgboost as xgb

class Modeler():

    def __init__(self, mode='train', factors_datas_names=[
        'factors_data',
        'fundamentals_data',
        'macros_data',
        'money_flows_data',
        'securities_margins_data',
        'industries_data',
        'indexes_data'
    ],
    other_datas_names=[
        'bank_stocks_info',
        'returns_data',
        'FCF_discounted_model_params_data'
    ],
    random_state=20240301
    ):
        self.mode = mode
        self.random_state = random_state
        # 将所有factors_datas以外的数据都定义为类属性
        for other_data_name in other_datas_names:
            other_data = pickle.load(open(f'{other_data_name}_{mode}.pkl', mode='rb+'))
            if other_data_name == 'bank_stocks_info':
                self.industry_stocks_info = other_data
            elif other_data_name == 'returns_data':
                self.returns_data = other_data
            elif other_data_name == 'FCF_discounted_model_params_data':
                self.FCF_discounted_model_params_data = other_data

        self.index = self.FCF_discounted_model_params_data['r_wacc'].index
        self.columns = self.FCF_discounted_model_params_data['r_wacc'].columns

        # 合并多个因子数据表
        factors_datas = {}
        # 导入数据
        for factors_data_name in factors_datas_names:
            factors_data = pickle.load(open(f'{factors_data_name}_{mode}.pkl', mode='rb+'))
            factors_datas[factors_data_name] = factors_data
        
        self.factors_datas = factors_datas
        pickle.dump(obj=factors_datas, file=open(file=f'factors_datas_{self.mode}.pkl', mode='wb+'), protocol=4)

    # 检查各因子是否为空表
    def check_factor_data_nan(self):
        factor_data_nan_dict = {}
        for factors_data_name, factors_data in tqdm(self.factors_datas.items(), desc='handling factors datas missing values progress'):
            factor_data_nan_dict[factors_data_name] = []
            for factor_data_name, factor_data in factors_data.items():
                if not factor_data.any().any():
                    factor_data_nan_dict[factors_data_name].append(factor_data_name)

        return factor_data_nan_dict
        

    # 定义标准化因子风险归类数据表
    def standardize_factors_risks_data(self):
        # 导入手动划分的因子风险归类数据表factors_risks_data，其有三列，分别是因子中文名称、因子代码和因子所属风险
        factors_risks_data = pd.read_csv('factors_risks_data.csv')

        # 检查factors_risks_data中是否有缺失的因子代码
        factors_codes_missing = []
        for _, factors_data in self.factors_datas.items():
            for factor_data_name in factors_data.keys():
                if factor_data_name not in factors_risks_data['factor_code'].values:
                    factors_codes_missing.append(factor_data_name)

        # 检查各因子代码是否有重复值

        # 定义因子代码列表
        factors_codes = []
        for factors_data in self.factors_datas.values():
            for factor_data_name in factors_data.keys():
                factors_codes.append(factor_data_name)

        # 定义列表重复值查找函数
        def find_duplicates(lst):
            duplicates = []
            unique_elements = set()

            for item in lst:
                if item in unique_elements:
                    duplicates.append(item)
                else:
                    unique_elements.add(item)

            return duplicates
        
        # 检查因子代码列表是否有重复值
        factors_codes_duplicated = find_duplicates(factors_codes)
        
        # 检查factors_risks_data是否有多余的因子代码
        factors_codes_excessive = []
        for factor_code in factors_risks_data['factor_code'].values:
            if factor_code not in factors_codes:
                factors_codes_excessive.append(factor_code)

        # 将factors_risks_data中的risk列标准化为0-1变量
        factors_risks_data = factors_risks_data.set_index('factor_code')
        factors_risks_data_standardized = pd.DataFrame(0, columns=['default_risk', 'liquidity_risk', 'market_risk'], index=factors_risks_data.index)
        for factor_code in factors_risks_data.index.tolist():
            if 'Default Risk' in factors_risks_data.loc[factor_code, 'risk']:
                factors_risks_data_standardized.loc[factor_code, 'default_risk'] = 1
            if 'Liquidity Risk' in factors_risks_data.loc[factor_code, 'risk']:
                factors_risks_data_standardized.loc[factor_code, 'liquidity_risk'] = 1
            if 'Market Risk' in factors_risks_data.loc[factor_code, 'risk']: 
                factors_risks_data_standardized.loc[factor_code, 'market_risk'] = 1

        self.factors_risks_data_standardized = factors_risks_data_standardized
        
        return factors_codes_missing, factors_codes_excessive, factors_codes_duplicated, factors_risks_data_standardized
    
    def _fill_nan_col(self, factor_data_without_type1_missing, factor_data):
        """Fill the columns that are entirely NaN in the subset, in place.

        For each such column, its mean row-wise percentile rank in the full
        table ``factor_data`` is computed, and the subset's row-wise quantile
        at that level is used as the fill value.

        Parameters
        ----------
        factor_data_without_type1_missing : pandas.DataFrame
            Subset of ``factor_data`` rows; modified in place via ``update``.
        factor_data : pandas.DataFrame
            Full table used to compute the percentile ranks.

        Returns
        -------
        pandas.DataFrame
            The same ``factor_data_without_type1_missing`` object.
        """
        # Column labels of the subset whose values are all missing.
        missing_stocks_codes = factor_data_without_type1_missing.columns[factor_data_without_type1_missing.isnull().all()]
        # Row-wise percentile rank of every column in the full table, restricted
        # to the all-missing columns.
        missing_stocks_codes_quantiles = factor_data.rank(pct=True, axis=1)[missing_stocks_codes]
        # Mean percentile rank per all-missing column (Series indexed by column label).
        missing_stocks_codes_quantiles_mean = missing_stocks_codes_quantiles.mean()
        # Row-wise quantiles of the subset evaluated at those mean ranks.
        # NOTE(review): presumably one result row per requested quantile and one
        # column per subset row - confirm against the pandas version in use.
        fill_values = factor_data_without_type1_missing.quantile(missing_stocks_codes_quantiles_mean, axis=1)

        # Reshape and relabel so the fill values line up with the subset:
        # transpose, then label the columns with the all-missing column names.
        fill_values = fill_values.T
        fill_values.columns = missing_stocks_codes

        # Write the fill values into the subset. update() is used because
        # assigning one frame into another raised a warning even when
        # converted to an array first (per the original author's note).
        factor_data_without_type1_missing.update(fill_values)

        return factor_data_without_type1_missing

    # 缺失值处理
    def _handle_missing_values(self, factor_data):
        """Impute in-listing gaps of one factor table; keep off-listing gaps NaN.

        Missing values come in two kinds. Type 1: the stock was not listed on
        that date (before its start date or after its end date), so no value
        exists. Type 2: the stock was listed but the value was not disclosed
        or collected. Type 1 gaps are left as NaN; type 2 gaps are filled.

        Why IterativeImputer (original author's rationale): simple mean or
        median filling ignores both the time-series nature of the panel and
        the correlation between stocks, and can bias results when many values
        are missing; KNN imputation captures cross-stock correlation but not
        the time dimension, and is slow on large panels. Imputing everything
        and then deleting type 1 values afterwards is also avoided, because
        the imputer would be trained on values that should not exist. Instead
        the imputer is fitted only on dates without any type 1 gap and then
        applied to the whole table.

        Parameters
        ----------
        factor_data : pandas.DataFrame
            One factor table. Not modified; a copy is processed.
            NOTE(review): assumed to share ``self.index`` / ``self.columns``,
            since the result is relabelled with them - confirm.

        Returns
        -------
        pandas.DataFrame
            Imputed table labelled with ``self.index`` and ``self.columns``,
            with type 1 positions reset to NaN.

        Raises
        ------
        ValueError
            If no date has every stock listed, so there is nothing to fit on.
        """
        # Work on a copy: the placeholder columns written below must not leak
        # into the caller's raw table.
        factor_data = factor_data.copy()

        # mask[date, stock] is True while the stock is listed.
        mask = pd.DataFrame(index=self.index, columns=self.columns)
        # Cross-sectional median per row, computed once before any column is filled.
        medians = factor_data.median(axis=1)
        for stock_code in mask.columns:
            # Listing window of this stock.
            start_date = self.industry_stocks_info.loc[stock_code, 'start_date']
            end_date = self.industry_stocks_info.loc[stock_code, 'end_date']
            mask[stock_code] = (mask.index >= start_date) & (mask.index <= end_date)

            # A column with no value at all would be ignored by the imputer,
            # so seed it with the cross-sectional medians.
            if factor_data[stock_code].isnull().all():
                factor_data[stock_code] = medians

        # Dates on which every stock is listed, i.e. rows free of type 1 gaps.
        indexes_without_type1_missing = mask.all(axis=1)
        # Explicit copy: _fill_nan_col updates this frame in place.
        factor_data_without_type1_missing = factor_data.loc[indexes_without_type1_missing].copy()
        if factor_data_without_type1_missing.empty:
            raise ValueError('no date has every stock listed; cannot fit the imputer')

        # Columns still entirely NaN within the training rows are filled from
        # their mean percentile rank in the full table.
        factor_data_without_type1_missing = self._fill_nan_col(factor_data_without_type1_missing, factor_data)

        # Iterative imputation: filled values are fed back into training and
        # re-estimated until convergence (tol) or at most max_iter=50 rounds.
        imputer = IterativeImputer(
            random_state=self.random_state,
            # XGBoost regressor as the per-feature estimator.
            estimator=xgb.XGBRegressor(),
            max_iter=50,
            tol=1e-3
            )

        # Fit on the rows without type 1 gaps, then fill the whole table.
        imputer.fit(factor_data_without_type1_missing)
        factor_data_imputed = imputer.transform(factor_data)
        factor_data_imputed = pd.DataFrame(factor_data_imputed)
        factor_data_imputed.index = self.index
        factor_data_imputed.columns = self.columns

        # Put NaN back wherever the stock was not listed, so only type 2 gaps
        # end up filled.
        factor_data_imputed[~mask] = np.nan

        return factor_data_imputed
    
    # Fama-French-3分位数差值处理
    def _process_ff3_quantile_difference(self, factor_data):
        '''FF3处理形成截面股价收益率:
        合理性:如果您的研究目的是探究因子对股价收益率的截面预测能力,并且假设股价收益率的截面分布与因子的截面分布相关,那么按照FF3处理形成截面股价收益率是合适的。
        优点:这种方法能够消除股价收益率的极值影响,使得截面股价收益率的分布更加稳定,便于研究因子的预测能力。
        缺点:这种方法忽略了个股市值的影响,可能无法反映市场整体的收益率变化。'''
        panal_factor_data_quantiles = factor_data.quantile([0.3, 0.7], axis=1)
        panal_factor_data = panal_factor_data_quantiles.loc[0.7] - panal_factor_data_quantiles.loc[0.3]
        return panal_factor_data
    
    # 企业价值加权处理
    def _average_by_enterprise_value(self, factor_data):
        '''按个股市值加权形成截面股价收益率:
        合理性:如果您的研究目的是探究因子对市场整体收益率的预测能力,并且假设个股的市值反映了其在市场中的重要性,那么按个股市值加权形成截面股价收益率是合适的。
        优点:这种方法考虑了个股市值的影响,能够反映市场整体的收益率变化,更接近实际的投资组合收益。
        缺点:这种方法可能受到大市值股票的主导,小市值股票的影响可能被掩盖。
        就本文的银行板块研究目的来说，选择按个股市值加权形成截面股价收益率。'''
        weighted_factor_data = self.enterprise_value_weights * factor_data
        panal_factor_data = weighted_factor_data.sum(axis=1)

        panal_factor_data = pd.DataFrame(panal_factor_data)
        panal_factor_data.index = self.index
        #panal_factor_data.columns = [factor_data_name] 不重置列名，防止与其他panal_factor_data运算时因为列名不一致而出现两行缺失值

        return panal_factor_data

    # 数据清理，即缺失值处理和Fama-French-3分位数差值处理
    def clean_and_average_factors_datas(self):
        self.enterprise_value_weights = self.FCF_discounted_model_params_data['panal_enterprise_value_weights']
        industry_factors_datas = self.factors_datas.copy()

        for factors_data_name, factors_data in self.factors_datas.items():
            for factor_data_name, factor_data in tqdm(factors_data.items(), desc='handling factors data progress'):
                if factors_data_name != 'macros_data': #factor_data_name == 'PEG': 
                    # 缺失值处理
                    try:
                        factor_data_imputed = self._handle_missing_values(factor_data)
                    except:
                        print(factor_data_name, factors_data_name)
                        raise

                    # 企业价值加权处理
                    industry_factor_data = self._average_by_enterprise_value(factor_data_name, factor_data_imputed)
                    # 对r_waac以enterprise_value在截面的权重加权求和求出enterprise_value加权r_wacc，
                    #分别求出circulating_value加权r_E和Debts加权r_D，再以截面总circulating_value和总Debts在截面的总enterprise_value的权重加权得到enterprise_value加权r_wacc
                    # 二者过程等价
                    industry_factors_datas[factors_data_name][factor_data_name] = industry_factor_data
                
        self.industry_factors_datas = industry_factors_datas
        
        return industry_factors_datas
    
    def clean_and_average_r_waac_data(self):
        self.enterprise_value_weights = self.FCF_discounted_model_params_data['panal_enterprise_value_weights']
        r_waac = self.FCF_discounted_model_params_data['r_wacc']
        industry_r_waac_data = {}

        r_waac_imputed = self._handle_missing_values(r_waac)

        industry_r_waac = self._average_by_enterprise_value(r_waac_imputed)

        industry_r_waac_data['r_waac'] = industry_r_waac

        self.industry_r_waac_data = industry_r_waac_data

        return industry_r_waac_data

    # 取得系统性风险和非系统性风险风险溢价
    def get_common_and_idiosyncratic_risks_premium(self):
        # 载入数据
        r_wacc = self.industry_r_waac_data['r_waac']
        interest_rate_1m = self.FCF_discounted_model_params_data['interest_rate_1m']
        market_returns = self.returns_data['market_returns']

        # 定义r_waac风险溢价
        industry_risk_premium = r_wacc - interest_rate_1m
        market_risk_premium_common = market_returns - interest_rate_1m
        
        cur = CustomRegressor(mode=self.mode, random_state=self.random_state, index=self.index)
        if self.mode == 'train':
            self.industry_risk_premium_common, self.risks_premiums_totals_ada_best = cur.fit_predict(market_risk_premium_common, industry_risk_premium)
        elif self.mode == 'test':
            self.industry_risk_premium_common, self.risks_premiums_totals_ada_best = cur.fit_predict(market_risk_premium_common, industry_risk_premium, self.risks_premiums_totals_ada_best)
        self.industry_risk_premium_idiosyncratic = industry_risk_premium - self.industry_risk_premium_common

        return self.industry_risk_premium_common, self.industry_risk_premium_idiosyncratic, self.risks_premiums_totals_ada_best
    
    def transform_factors_datas_from_dict_to_df(self):
        industry_factors_dfs = pd.DataFrame(index=self.index)

        for industry_factors_data in self.industry_factors_datas.values():
            for factor_code, industry_factor_data in industry_factors_data.items():
                industry_factors_dfs[factor_code] = industry_factor_data
        
        self.industry_factors_dfs = industry_factors_dfs
        return industry_factors_dfs

    
    def get_common_and_idiosyncratic_risks_premiums_components(self): 
        risks_premiums_components = {'common_risk': {}, 'idiosyncratic_risk': {}}
        risks_premiums_components_ada_best = {'common_risk': {}, 'idiosyncratic_risk': {}}
        for risks_premiums_total, risks_premiums_total_name in zip(
            [self.industry_risk_premium_common, self.industry_risk_premium_idiosyncratic],
            ['common_risk', 'idiosyncratic_risk']
        ):
            for risks_premiums_component_name in ['default_risk', 'liquidity_risk','market_risk']:
                risks_premiums_component_factors_codes = self.factors_risks_data_standardized.loc[self.factors_risks_data_standardized[risks_premiums_component_name] == 1, 'factor_code'].tolist()
                risks_premiums_component_factors_dfs = self.industry_factors_dfs[risks_premiums_component_factors_codes]
                
                cur = CustomRegressor(mode=self.mode, random_state=self.random_state, index=self.index)
                if self.mode == 'train':
                    risks_premiums_component, risks_premiums_component_ada_best = cur.fit_predict(risks_premiums_component_factors_dfs, risks_premiums_total)
                elif self.mode == 'test':
                    risks_premiums_component, risks_premiums_component_ada_best = cur.fit_predict(risks_premiums_component_factors_dfs, risks_premiums_total, risks_premiums_component_ada_best)

                risks_premiums_components[risks_premiums_total_name][risks_premiums_component_name] = 1
                risks_premiums_components_ada_best[risks_premiums_total_name][risks_premiums_component_name] = 1

        self.risks_premiums_components = risks_premiums_components
        self.risks_premiums_components_ada_best = risks_premiums_components_ada_best
        return risks_premiums_components, risks_premiums_components


                            

In [3]:
import pickle
#from Modeler import Modeler

# Ask which dataset to work on. The answer becomes part of every input file
# name, so stray whitespace or a typo would otherwise surface later as a
# confusing "file not found".
mode = input('Dataset is for train or test?').strip()
if mode not in ('train', 'test'):
    raise ValueError(f"mode must be 'train' or 'test', got {mode!r}")
mol = Modeler(mode)

In [4]:
# Per factor group, the factor names whose table contains no truthy value
# (result of Modeler.check_factor_data_nan).
factor_data_nan_dict = mol.check_factor_data_nan()

handling factors datas missing values progress: 100%|██████████| 7/7 [00:00<00:00, 61.21it/s]


In [5]:
# Load the cached one-hot risk table, or build and cache it when absent.
try:
    mol.factors_risks_data_standardized = pd.read_csv('factors_risks_data_standardized.csv')
except FileNotFoundError:
    # Only a missing cache triggers the rebuild; any other error is a real problem.
    factors_codes_missing, factors_codes_excessive, factors_codes_duplicated, factors_risks_data_standardized = mol.standardize_factors_risks_data()
    factors_risks_data_standardized.to_csv('factors_risks_data_standardized.csv', encoding='utf-8')
    # The cached branch yields 'factor_code' as a regular column; give the
    # freshly computed table the same layout so both branches agree.
    mol.factors_risks_data_standardized = factors_risks_data_standardized.reset_index()

In [6]:
# Load the cleaned, value-weighted factor tables from cache, or compute them.
# The cache name follows the dataset the Modeler was built for, so a 'test'
# run never picks up the 'train' cache.
# NOTE: pickle.load executes code from the file; only use caches you created.
bank_factors_datas_path = f'bank_factors_datas_{mol.mode}.pkl'
try:
    with open(bank_factors_datas_path, mode='rb') as file:
        mol.industry_factors_datas = pickle.load(file)
except FileNotFoundError:
    mol.industry_factors_datas = mol.clean_and_average_factors_datas()
    with open(bank_factors_datas_path, mode='wb') as file:
        pickle.dump(obj=mol.industry_factors_datas, file=file, protocol=4)

In [8]:
# Load the value-weighted r_wacc series from cache, or compute it.
# The cache name follows the dataset the Modeler was built for, so a 'test'
# run never picks up the 'train' cache.
# NOTE: pickle.load executes code from the file; only use caches you created.
bank_r_waac_data_path = f'bank_r_waac_data_{mol.mode}.pkl'
try:
    with open(bank_r_waac_data_path, mode='rb') as file:
        mol.industry_r_waac_data = pickle.load(file)
except FileNotFoundError:
    mol.industry_r_waac_data = mol.clean_and_average_r_waac_data()
    with open(bank_r_waac_data_path, mode='wb') as file:
        pickle.dump(obj=mol.industry_r_waac_data, file=file, protocol=4)

In [7]:
# Load the fitted total-premium model from cache, or train and cache it.
# NOTE(review): on a cache hit get_common_and_idiosyncratic_risks_premium() is
# not called, so mol.industry_risk_premium_common / _idiosyncratic are not
# set by this cell - confirm that later steps do not need them in that case.
# NOTE: pickle.load executes code from the file; only use caches you created.
try:
    with open('risks_premiums_totals_ada_best.pkl', mode='rb') as file:
        mol.risks_premiums_totals_ada_best = pickle.load(file)
except FileNotFoundError:
    _, _, mol.risks_premiums_totals_ada_best = mol.get_common_and_idiosyncratic_risks_premium()
    with open('risks_premiums_totals_ada_best.pkl', mode='wb') as file:
        pickle.dump(obj=mol.risks_premiums_totals_ada_best, file=file, protocol=4)

In [15]:
# Load the flattened factor frame from cache, or build and cache it.
try:
    # The first CSV column is the row index written by to_csv below.
    # NOTE(review): the index comes back as plain values; if it held dates,
    # add parse_dates=True - confirm against the original index type.
    mol.factors_dfs = pd.read_csv('factors_dfs.csv', index_col=0)
    # The Modeler methods read this frame from `industry_factors_dfs`.
    mol.industry_factors_dfs = mol.factors_dfs
except FileNotFoundError:
    mol.factors_dfs = mol.transform_factors_datas_from_dict_to_df()
    mol.factors_dfs.to_csv('factors_dfs.csv', encoding='utf-8')

In [None]:
# Reload the hand-labelled factor/risk table with explicit column names.
# NOTE(review): 'facotr_name' looks like a typo for 'factor_name'; a later cell
# writes this frame back to the CSV, so the misspelling ends up in the file.
# NOTE(review): `names=` without `header=0` makes pandas treat the first line
# of the file as data - confirm the CSV has no header row at this point.
factors_risks_data = pd.read_csv('factors_risks_data.csv', names=['facotr_name', 'factor_code', 'risk'])
# Drop the rows whose factor code is listed in `d`.
# NOTE(review): `d` is not defined anywhere in the visible notebook, so this
# cell fails on a fresh kernel - define it (presumably the factor codes to
# remove) before running.
factors_risks_data = factors_risks_data.loc[~factors_risks_data['factor_code'].isin(d)]

In [None]:
# Remove the factors reported by check_factor_data_nan() from three of the
# raw 'train' factor groups.
# NOTE: pickle.load executes code from the file; only use files you created.
factors_data_nan_list = factor_data_nan_dict['factors_data']
fundamentals_data_nan_list = factor_data_nan_dict['fundamentals_data']
macros_data_nan_list = factor_data_nan_dict['macros_data']

filtered_factor_groups = {}
for group_name in ('factors_data', 'fundamentals_data', 'macros_data'):
    # Read-only access is enough; the context manager closes the handle.
    with open(f'{group_name}_train.pkl', 'rb') as file:
        group = pickle.load(file)
    nan_factor_names = set(factor_data_nan_dict[group_name])
    filtered_factor_groups[group_name] = {
        key: value for key, value in group.items() if key not in nan_factor_names
    }

factors_data = filtered_factor_groups['factors_data']
fundamentals_data = filtered_factor_groups['fundamentals_data']
macros_data = filtered_factor_groups['macros_data']

In [None]:
# Overwrite the raw 'train' pickles with the filtered factor groups. Each file
# is closed as soon as it is written so the data is fully flushed to disk.
for file_name, filtered_group in (
    ('factors_data_train.pkl', factors_data),
    ('fundamentals_data_train.pkl', fundamentals_data),
    ('macros_data_train.pkl', macros_data),
):
    with open(file_name, mode='wb') as file:
        pickle.dump(obj=filtered_group, file=file, protocol=4)

In [None]:
#factors_risks_data = factors_risks_data.drop(columns=['index'])
# Renumber the rows from 0 (discarding the old index) and overwrite the CSV
# without writing the index column.
factors_risks_data = factors_risks_data.reset_index(drop=True)
factors_risks_data.to_csv('factors_risks_data.csv', index=False, encoding='utf-8-sig')

In [None]:
# Rebuild the one-hot risk table from the current CSV, keep the three
# consistency lists for inspection, and write the table with its index.
factors_codes_missing, factors_codes_excessive, factors_codes_duplicated, factors_risks_data_standardized= mol.standardize_factors_risks_data()
factors_risks_data_standardized.to_csv('factors_risks_data_standardized.csv', index=True)