In [1]:
import sys
sys.path.append('..')
import pandas as pd
import numpy as np
import talib as ta
import statsmodels.api as sm
from src import config

converter = config.ticker_to_en_name
sector_df = pd.read_hdf('../data/cleaned_data.h5').droplevel(0)
shanghai_composite = pd.read_hdf('../data/shanghai_composite.h5', key='shanghai_composite')
factors = pd.read_hdf('../data/ch3factors.h5', key='factors')

# Function to add stars to p-values
def significance_stars(p):
    if p < 0.01:
        return "***"
    elif p < 0.05:
        return "**"
    elif p < 0.1:
        return "*"
    else:
        return ""

factors.index = factors.index.to_period('M')
factors.drop(columns=['rf_mon'], inplace=True)
sector_df_monthly = sector_df.resample('M').last()
sector_df_monthly.index = sector_df_monthly.index.to_period('M')
sector_df_monthly = sector_df_monthly.pct_change().dropna()

# make the length of both df_monthly and factors the same, one is longer at one end, the other is longer at the other end
factors = factors.loc[sector_df_monthly.index[0]:factors.index[-1]]
sector_df_monthly = sector_df_monthly.loc[factors.index[0]:factors.index[-1]]

In [2]:
window = 12

## PLOT the rolling alphas

In [3]:
# Create a MultiIndex DataFrame to store the results
index = pd.MultiIndex.from_product([[],[]], names=['ticker', 'date'])
factor_exposures_rolling = pd.DataFrame(index=index, 
                                   columns=factors.columns.append(pd.Index(['alpha'])))

for i in range(window, len(sector_df_monthly)):
    # Get the data for the current window
    window_factor_returns = factors.iloc[i-window:i]
    window_asset_returns = sector_df_monthly.iloc[i-window:i]

    for ticker in sector_df_monthly.columns:
        # Fit a linear regression model
        X = sm.add_constant(window_factor_returns)
        model = sm.OLS(window_asset_returns[ticker], X)
        results = model.fit()

        # Store the factor exposures and alpha
        factor_exposures_rolling.loc[(ticker, sector_df_monthly.index[i]), factors.columns] = results.params[1:]
        factor_exposures_rolling.loc[(ticker, sector_df_monthly.index[i]), 'alpha'] = results.params[0]
# convert the index to datetime
factor_exposures_rolling.index = factor_exposures_rolling.index.set_levels(factor_exposures_rolling.index.levels[1].to_timestamp(), level=1)
# plot with filling the area between the lines and zero
factor_exposures_rolling_pivot = factor_exposures_rolling[['alpha']].pivot_table(index=factor_exposures_rolling.index.get_level_values(1), 
                                           columns=factor_exposures_rolling.index.get_level_values(0)).droplevel(0, axis=1)
factor_exposures_rolling_pivot.columns = factor_exposures_rolling_pivot.columns.map(converter, na_action='ignore')
# ax = factor_exposures_rolling_pivot.plot(kind='area', 
#                                         stacked=False,
#                                         figsize=(12, 8),
#                                     )
# ax.set_xlabel("")
# ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=5, fancybox=True)

In [4]:
# calculate the mean and stds for the rolling alphas annualized
alpha_mean_std = pd.DataFrame(index=factor_exposures_rolling_pivot.columns, columns=['mean', 'std'])
for sector in factor_exposures_rolling_pivot.columns:
    mean, std = factor_exposures_rolling_pivot[sector].dropna().agg(['mean', 'std'])
    alpha_mean_std.loc[sector] = mean*window, std*np.sqrt(window)
alpha_mean_std = alpha_mean_std * 100
alpha_mean_std.rename(columns={'mean': 'Mean(%)', 'std': 'Std(%)'}, inplace=True)
alpha_mean_std = alpha_mean_std.convert_dtypes(float).round(2)

In [5]:
# check for how long does the alpha stay positive, in general
factor_exposures_rolling_pivot_positive = factor_exposures_rolling_pivot > 0
factor_exposures_rolling_pivot_positive = factor_exposures_rolling_pivot_positive.sum()
factor_exposures_rolling_pivot_positive = factor_exposures_rolling_pivot_positive / 210 * 100
factor_exposures_rolling_pivot_positive = factor_exposures_rolling_pivot_positive.convert_dtypes(float).round(2)

In [6]:
factor_exposures_rolling_pivot_negative = factor_exposures_rolling_pivot < 0
factor_exposures_rolling_pivot_negative = factor_exposures_rolling_pivot_negative.sum()
factor_exposures_rolling_pivot_negative = factor_exposures_rolling_pivot_negative / 210 * 100
factor_exposures_rolling_pivot_negative = factor_exposures_rolling_pivot_negative.convert_dtypes(float).round(2)

In [7]:
neg_pos_comparison = pd.concat([factor_exposures_rolling_pivot_positive, factor_exposures_rolling_pivot_negative], axis=1).rename(columns={0: 'Positive(%)', 1: 'Negative(%)'})

## NEXT we calculate noise using the TNR Ratio

In [8]:
tnr_window = 12

In [9]:
# calculate trend to noise ratio
tnr_df = pd.DataFrame(index=sector_df_monthly.index, columns=sector_df_monthly.columns)
for ticker in sector_df_monthly.columns:
    tnr_df[ticker] = sector_df_monthly[ticker].rolling(tnr_window).apply(lambda x: x.sum() / x.diff().abs().sum())
tnr_df.dropna(inplace=True)
tnr_df.columns = tnr_df.columns.map(converter, na_action='ignore')

In [10]:
tnr_df = tnr_df.loc[factor_exposures_rolling_pivot.index.to_period('M')]
factor_exposures_rolling_pivot.index = factor_exposures_rolling_pivot.index.to_period('M')

In [11]:
# calculate regression between the alphas and the trend to noise ratio with lags
alpha_tnr_reg = pd.DataFrame(index=tnr_df.columns, columns=[('tnr_beta' + str(i), 'p_tnr' + str(i)) for i in range(1, 13)])
for i in range(1, 1+window):
    for sector in tnr_df.columns:
        X = sm.add_constant(tnr_df[sector].shift(i).fillna(0))
        model = sm.OLS(factor_exposures_rolling_pivot[sector], X)
        results = model.fit()
        alpha_tnr_reg.loc[sector][('tnr_beta' + str(i), 'p_tnr' + str(i))] = (results.params[1], results.pvalues[1])

In [12]:
# round each value to 3 decimal places, each value is a tuple
alpha_tnr_reg = alpha_tnr_reg.applymap(lambda x: tuple(map(lambda y: round(y, 3), x)))

# iterate through every value in the dataframe and add stars to the p-values, every value is in tuples so only apply to the second element
alpha_tnr_reg_wstar = alpha_tnr_reg.applymap(lambda x: (x[0], significance_stars(x[1])))
# alpha_tnr_reg_wstar.to_csv('../data/alpha_tnr_reg_wstar.csv')
# calculate how many betas are significant out of every value in the matrix
alpha_tnr_reg_wstar_significant = alpha_tnr_reg_wstar.applymap(lambda x: x[1] == '***')
alpha_tnr_reg_wstar_significant.sum().sum() / alpha_tnr_reg_wstar_significant.size

0.5416666666666666

In [13]:
alpha_tnr_reg_wstar

Unnamed: 0_level_0,"(tnr_beta1, p_tnr1)","(tnr_beta2, p_tnr2)","(tnr_beta3, p_tnr3)","(tnr_beta4, p_tnr4)","(tnr_beta5, p_tnr5)","(tnr_beta6, p_tnr6)","(tnr_beta7, p_tnr7)","(tnr_beta8, p_tnr8)","(tnr_beta9, p_tnr9)","(tnr_beta10, p_tnr10)","(tnr_beta11, p_tnr11)","(tnr_beta12, p_tnr12)"
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Agriculture,"(0.002, )","(0.002, )","(0.003, )","(0.002, )","(0.002, )","(0.002, )","(0.003, )","(0.003, )","(0.005, )","(0.007, *)","(0.008, **)","(0.009, **)"
Chemicals,"(0.003, )","(0.003, **)","(0.004, **)","(0.004, **)","(0.004, **)","(0.004, **)","(0.004, **)","(0.003, *)","(0.004, **)","(0.004, **)","(0.004, **)","(0.004, **)"
Steel,"(0.011, ***)","(0.011, ***)","(0.01, ***)","(0.008, ***)","(0.007, **)","(0.006, *)","(0.004, )","(0.003, )","(0.002, )","(0.001, )","(-0.0, )","(-0.001, )"
Nonferrous Metals,"(0.014, ***)","(0.014, ***)","(0.014, ***)","(0.013, ***)","(0.013, ***)","(0.013, ***)","(0.012, ***)","(0.011, ***)","(0.009, **)","(0.009, **)","(0.009, **)","(0.01, **)"
Electronics,"(0.009, ***)","(0.009, ***)","(0.009, ***)","(0.008, **)","(0.005, )","(0.003, )","(0.001, )","(-0.001, )","(-0.003, )","(-0.005, )","(-0.006, **)","(-0.008, ***)"
Household Appliances,"(0.009, ***)","(0.007, ***)","(0.005, **)","(0.003, )","(0.002, )","(0.0, )","(-0.002, )","(-0.004, )","(-0.006, **)","(-0.007, ***)","(-0.009, ***)","(-0.011, ***)"
Food & Beverage,"(0.006, *)","(0.005, )","(0.003, )","(0.001, )","(0.0, )","(-0.0, )","(-0.001, )","(-0.002, )","(-0.004, )","(-0.006, *)","(-0.007, **)","(-0.008, **)"
Textile & Clothing,"(0.002, )","(0.004, **)","(0.006, ***)","(0.007, ***)","(0.008, ***)","(0.009, ***)","(0.01, ***)","(0.01, ***)","(0.011, ***)","(0.012, ***)","(0.012, ***)","(0.012, ***)"
Light Manufacturing,"(0.005, ***)","(0.005, ***)","(0.005, ***)","(0.005, ***)","(0.004, ***)","(0.004, ***)","(0.004, ***)","(0.004, ***)","(0.003, **)","(0.002, *)","(0.001, )","(-0.0, )"
Healthcare,"(0.006, **)","(0.006, **)","(0.006, **)","(0.008, ***)","(0.009, ***)","(0.009, ***)","(0.009, ***)","(0.009, ***)","(0.007, ***)","(0.006, **)","(0.004, *)","(0.001, )"


## Now get on with backtesting
1st strategy w no noise reduction technique

In [14]:
sector_df_monthly_renamed = sector_df_monthly.copy()
sector_df_monthly_renamed.columns = sector_df_monthly.columns.map(converter, na_action='ignore')

In [15]:
# Create a DataFrame to store the results
portfolio_results_df = pd.DataFrame(columns=['mktrf', 'SMB', 'VMG', 'alpha', 'p_alpha'])

In [76]:
def get_reg_results(strategy_name, holding):
    # create dataframe from the dictionary    
    holding_df = pd.DataFrame.from_dict(holding, orient='index')
    # calculate the return of the portfolio
    portfolio_return = pd.DataFrame(index=holding_df.index, columns=[strategy_name])
    for idx, row in holding_df.iterrows():
        try:
            returns = sector_df_monthly_renamed.loc[idx, row.values].values
        except KeyError:
            returns = 0
        mean_return = np.mean(returns)
        portfolio_return.loc[idx] = mean_return
    portfolio_return[strategy_name] = pd.to_numeric(portfolio_return[strategy_name], errors='coerce')
    # Create a DataFrame to store the results
    portfolio_return = portfolio_return.loc[portfolio_return.index[0]:factors.index[-1]].dropna()

    X = sm.add_constant(factors.loc[portfolio_return.index])
    model = sm.OLS(portfolio_return[strategy_name], X)
    results = model.fit()

    # Store the coefficients
    portfolio_results_df.loc[strategy_name, ['mktrf', 'SMB', 'VMG']] = results.params[1:]
    portfolio_results_df.loc[strategy_name, 'alpha'] = results.params[0]

    # Store the p-values
    # portfolio_results_df.loc[ticker, ['p_mktrf', 'p_SMB', 'p_VMG']] = results.pvalues[1:]
    portfolio_results_df.loc[strategy_name, 'p_alpha'] = results.pvalues[0]

In [77]:
strategy_name = 'alpha_only'
holding = {}
for idx, row in factor_exposures_rolling_pivot.iterrows():
    highest_alpha_stock = row.idxmax()
    holding[idx+1] = highest_alpha_stock
get_reg_results(strategy_name, holding)

## Second try, with tnr

In [29]:
strategy_name = 'alpha_w_tnr'
holding = {}
for idx, row in factor_exposures_rolling_pivot.iterrows():
    positive_alpha_stocks = row[row > 0]
    # rank the positive alpha stocks according to their alpha values
    positive_alpha_stocks_ranked = positive_alpha_stocks.rank()
    tnr_values_ranked = tnr_df.loc[idx, positive_alpha_stocks.index].rank()
    holding[idx+1] = (positive_alpha_stocks_ranked + tnr_values_ranked).idxmax()
get_reg_results(strategy_name, holding)

## Third try, holding 2 top_alpha stocks

In [92]:
strategy_name = 'alpha_only_2_holdings'
holding = {}
for idx, row in factor_exposures_rolling_pivot.iterrows():
    positive_alpha_stocks = row.nlargest(2).index.tolist()
    holding[idx+1] = positive_alpha_stocks
    # tnr_df.loc[idx, positive_alpha_stocks].nlargest(2).index.tolist()
get_reg_results(strategy_name, holding)

In [84]:
portfolio_results_df['p_alpha'].apply(significance_stars)

alpha_only               ***
alpha_w_tnr              ***
alpha_only_2_holdings    ***
Name: p_alpha, dtype: object

### We got alpha which is very good. 
### Fourth try, tnr + alpha top 2

In [100]:
strategy_name = 'alpha_tnr_2_holdings'
holding = {}
for idx, row in factor_exposures_rolling_pivot.iterrows():
    positive_alpha_stocks = row[row > 0]
    # rank the positive alpha stocks according to their alpha values
    positive_alpha_stocks_ranked = positive_alpha_stocks.rank()
    tnr_values_ranked = tnr_df.loc[idx, positive_alpha_stocks.index].rank()
    holding[idx+1] = (positive_alpha_stocks_ranked + tnr_values_ranked).nlargest(2).index.values.tolist()
get_reg_results(strategy_name, holding)

In [102]:
strategy_name = 'alpha_tnr_4_holdings'
holding = {}
for idx, row in factor_exposures_rolling_pivot.iterrows():
    positive_alpha_stocks = row[row > 0]
    # rank the positive alpha stocks according to their alpha values
    positive_alpha_stocks_ranked = positive_alpha_stocks.rank()
    tnr_values_ranked = tnr_df.loc[idx, positive_alpha_stocks.index].rank()
    holding[idx+1] = (positive_alpha_stocks_ranked + tnr_values_ranked).nlargest(4).index.values.tolist()
get_reg_results(strategy_name, holding)

In [104]:
strategy_name = 'alpha_tnr_6_holdings'
holding = {}
for idx, row in factor_exposures_rolling_pivot.iterrows():
    positive_alpha_stocks = row[row > 0]
    # rank the positive alpha stocks according to their alpha values
    positive_alpha_stocks_ranked = positive_alpha_stocks.rank()
    tnr_values_ranked = tnr_df.loc[idx, positive_alpha_stocks.index].rank()
    holding[idx+1] = (positive_alpha_stocks_ranked + tnr_values_ranked).nlargest(6).index.values.tolist()
get_reg_results(strategy_name, holding)

In [105]:
portfolio_results_df

Unnamed: 0,mktrf,SMB,VMG,alpha,p_alpha
alpha_only,1.073732,-0.213216,-0.879737,0.013953,0.008551
alpha_w_tnr,0.946778,-0.181539,-0.777452,0.013852,0.007243
alpha_only_2_holdings,1.042008,-0.079863,-0.543879,0.011623,0.002491
alpha_tnr_2_holdings,1.022554,-0.111506,-0.603296,0.014631,0.000378
alpha_tnr_4_holdings,1.030304,0.042016,-0.337918,0.010381,5.5e-05
alpha_tnr_6_holdings,1.007338,0.136366,-0.295817,0.008293,0.000126
