In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import xgboost as xgb

In [23]:
data = pd.read_csv('factor_pass9.csv', index_col=0)
data['continuous_count'] = data.groupby((data['evebit'] != data['evebit'].shift(1)).cumsum()).cumcount() + 1
data = data.reindex(columns=data.columns.tolist()[:-2] + ['continuous_count', 'return'])
data.dropna(inplace=True)

insample_data = data.loc[data.loc[:, 'date'] < "2022-01-01", :]
insample_data_sorted = insample_data.sort_values('date')
insample_data_sorted['rank'] = insample_data_sorted['date'].rank()
sum_of_weight = (insample_data_sorted['rank']).sum()
insample_data_sorted['weight'] = insample_data_sorted['rank'] / sum_of_weight
outdsample_data = data.loc[data.loc[:, 'date'] >= "2022-01-01", :]

X = insample_data_sorted.iloc[:, 3:-3].astype(float)
y = insample_data_sorted.iloc[:, -3].astype(float)
other_info_insample = insample_data_sorted.iloc[:, :3]
other_info_outsample_test = outdsample_data.iloc[:, :3]
X_outsample_test = outdsample_data.iloc[:, 3:-1].astype(float)
y_outsample_test = outdsample_data.iloc[:, -1].astype(float)

In [24]:
lightGBM_pre = pd.read_csv('./result/lightGBM.csv', index_col=0)

In [25]:
from sklearn.metrics import mean_squared_error

print(mean_squared_error(lightGBM_pre["preds"], y_outsample_test))
print((np.corrcoef(lightGBM_pre["preds"].values.reshape(-1, 1), y_outsample_test.values.reshape(-1, 1))[0, 1])**2)

0.0009147840246896783
-6.405472712400547


In [21]:
XGBoost_pre = pd.read_csv('./result/XGBoost.csv', index_col=0)
print(mean_squared_error(XGBoost_pre["preds"], y_outsample_test))
print((np.corrcoef(XGBoost_pre["preds"], y_outsample_test)[0, 1])**2)

0.0007947575549582864
0.002069444350199031


In [6]:
def cal_ICIR(data: pd.DataFrame, feild: str) -> tuple[float, float]:
    """
    data is a dataframe with columns: date, return, factor feild
    feild is the factor name
    return IC and IR
    """
    data = data.loc[:, ['date', 'return', feild]]
    data.dropna(inplace=True)
    IC_dataframe = data.groupby('date').apply(lambda x: x.corr(method='spearman')[feild]['return'])
    return IC_dataframe.mean(), IC_dataframe.mean()/IC_dataframe.std()

def test_factor(ICIR: tuple[float, float]) -> str:
    """
    ICIR is a tuple of IC and IR
    return the test result
    """
    if abs(ICIR[0]) > 0.01 and abs(ICIR[1]) > 0.03:
        return 'pass'
    else:
        return 'fail'

In [9]:
lightGBM = joblib.load("./model/lightGBM.pkl")
lightGBM_insample_pre = lightGBM.predict(X)
print(mean_squared_error(lightGBM_insample_pre, y))
print((np.corrcoef(lightGBM_insample_pre, y)[0, 1])**2)
matrix_in = pd.concat([other_info_insample, pd.DataFrame(lightGBM_insample_pre, columns=['preds'], index = X.index), y], axis=1)
ICIR = cal_ICIR(matrix_in, "preds")
print(ICIR)

9.800372066852508e-05
0.9017003345453547
(0.8261400953101508, 4.80196378113545)


In [10]:
XGBoost = joblib.load("./model/XGBoost.pkl")
XGBoost_insample_pre = XGBoost.predict(xgb.DMatrix(X))
print(mean_squared_error(XGBoost_insample_pre, y))
print((np.corrcoef(XGBoost_insample_pre, y)[0, 1])**2)
matrix_in = pd.concat([other_info_insample, pd.DataFrame(XGBoost_insample_pre, columns=['preds'], index = X.index), y], axis=1)
ICIR = cal_ICIR(matrix_in, "preds")
print(ICIR)

0.0004617549888544504
0.6850150090626411
(0.5961660201591119, 1.7324273273604236)
