In [1]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
import warnings
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('data/data.csv').set_index(['equity', 'date'])
df = df.drop(df.groupby(level=0).filter(lambda equity: (equity.isna().sum(axis=1) == len(equity.columns)).any()).index)
df = df.drop(df.groupby(level=0).filter(lambda equity: equity['PX_LAST'].eq(0).any()).index).reset_index()

df_factor_chosen: for every date, selected the best equities by the factor chosen \
column_to_order: name of factor \
number_of_equities: number of equities to select

In [3]:
df_returns = df[['equity','date','PX_LAST']]
df_returns.sort_values(by=['equity','date'], inplace=True)
df_returns = df_returns.pivot(index='date', columns='equity', values='PX_LAST')

In [4]:
df_log_returns = df_returns[df_returns.columns].apply(lambda x: np.log(x / x.shift(1)))
benchmark_rtn = df_log_returns.dropna(how='all').mean(skipna=True, axis='columns')

In [5]:
def select_equities(dataframe, factor, number_of_equities, df_rtn, bench_rtn):
    temp_dict = {}
    dates = list(set(dataframe['date']))
    for date in dates:
        temp_dict[date] =  dataframe.loc[dataframe['date'] == date].sort_values(by=['date',factor], ascending=False)['equity'].head(number_of_equities).values

    df_factor_chosen = pd.DataFrame.from_dict(temp_dict, orient='index').sort_index()

    df_factor_chosen_returns = df_factor_chosen.copy()
    sum_row_returns = 0
    list_sum_row_returns = []
    for index, row in df_factor_chosen.iterrows():
        for equity in row:
            sum_row_returns += np.nansum(df_rtn.loc[index][equity])
        list_sum_row_returns.append(round(sum_row_returns,2))
        sum_row_returns = 0

    df_factor_chosen_returns['returns'] = list_sum_row_returns
    df_factor_chosen_returns = df_factor_chosen_returns.iloc[1:,:]

    df_factor_chosen_returns['alpha'] = df_factor_chosen_returns['returns'] - bench_rtn
    information_ratio = df_factor_chosen_returns['alpha'].mean() / df_factor_chosen_returns['alpha'].std()

    return df_factor_chosen_returns, information_ratio

df_returns: for each equity, the date and the price

df_log_returns: log returns for each equity by doing log( price(t) / price(t+1) )

df_factor_chosen_returns: adding a column with returns for each date looking at the price of every equity selected.

In [6]:
# Calcolo equities e information ratio per ogni factor scelto
s_info_ratio = pd.Series(index=[['PE_RATIO', 'EBITDA_MARGIN', 'PX_TO_BOOK_RATIO', 'NORMALIZED_ACCRUALS_CF_METHOD', 'RSI_14D', 'VOLATILITY_30D', 'CUR_MKT_CAP', 'OPERATING_ROIC']], name='information_ratio')
equities_selected = {}

for factor in s_info_ratio.index:
    df_factor_chosen, information_ratio = select_equities(df, factor[0], 10, df_log_returns, benchmark_rtn)
    s_info_ratio.loc[factor[0]] = information_ratio
    equities_selected[factor[0]] = df_factor_chosen

s_info_ratio

PE_RATIO                         0.160940
EBITDA_MARGIN                    0.158318
PX_TO_BOOK_RATIO                 0.408200
NORMALIZED_ACCRUALS_CF_METHOD    0.118331
RSI_14D                          1.943153
VOLATILITY_30D                   0.246033
CUR_MKT_CAP                      0.133747
OPERATING_ROIC                   0.191919
Name: information_ratio, dtype: float64

In [7]:
# estrazione dei 4 factor con information ratio più alto
factor_to_use = s_info_ratio.sort_values(ascending=False).head(4).index.get_level_values(0).values
factor_to_use

array(['RSI_14D', 'PX_TO_BOOK_RATIO', 'VOLATILITY_30D', 'OPERATING_ROIC'],
      dtype=object)

In [41]:
filter_n_equities = [300, 150, 100, 20]

Metodo 1: Utilizzo zscore come fattore per la strategia

In [43]:
df_filtered = df.copy()
for n_equities, factor in zip(filter_n_equities, factor_to_use):
    eq_filter = select_equities(df, factor, n_equities, df_log_returns, benchmark_rtn)[0]
    df_filtered = pd.concat([df_filtered.loc[(df_filtered['date'] == date) & (df_filtered['equity'].isin(equities.values))
                             ] for date, equities in eq_filter.iterrows()])

In [46]:
filter1 = select_equities(df, 'RSI_14D', filter_n_equities[0], df_log_returns, benchmark_rtn)[0]

In [47]:
filter1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,292,293,294,295,296,297,298,299,returns,alpha
2003-02-28,OLE SM,PIC BB,VSM GR,SIP BB,KUL GR,OEL GR,HKSAV FH,SO FP,RAP1V FH,IND IM,...,BB FP,REP SM,GBF GR,NHH SM,CRN3 GR,NEO FP,SDG FP,PUB FP,-5.35,-5.295722
2003-03-31,OLE SM,VSM GR,OEL GR,DRW3 GR,PIC BB,KUL GR,BIJ GR,HKSAV FH,ZOT SM,UPL SM,...,BAS GR,COF IM,AGL IM,CUN SM,GWI1 GR,CEN FP,ITX SM,SDG FP,-2.37,-2.344251
2003-04-30,OLE SM,VSM GR,OEL GR,BIJ GR,DRW3 GR,FPE GR,COP GR,PIC BB,UPL SM,GENP FP,...,PHH2 GR,POY1V FH,LPK GR,STF FP,CFG SM,AF FP,REC IM,PSM GR,28.67,28.559852
2003-05-30,OLE SM,COP GR,OEL GR,VSM GR,BIJ GR,DRW3 GR,UPL SM,FPE GR,SIS IM,ZOT SM,...,RHK GR,TRG SM,BB FP,IP IM,VPK NA,UMI BB,IPG IM,BEN FP,14.07,14.031772
2003-06-30,OLE SM,OEL GR,BIJ GR,VSM GR,COP GR,KUL GR,DRW3 GR,SIS IM,ZIL2 GR,FPE GR,...,SDG FP,MUK GR,MME GR,RAA GR,EAD FP,PSM GR,SPR GR,SGO FP,13.92,13.872080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2011-11-30,LCA1 GR,EEX GR,VPK NA,BUE GR,GWI1 GR,MCM SM,WET GR,SGL GR,KIN BB,UNA NA,...,FRA GR,ZIL2 GR,SIOE BB,RES BB,EC FP,FUR NA,ENG IM,LPE FP,-4.89,-4.834679
2011-12-30,LCA1 GR,VPK NA,GWI1 GR,UNA NA,ZC FP,PRC FP,AGTA FP,BUE GR,EEX GR,SZU GR,...,LTO IM,ASY FP,TFA GR,UNIT4 NA,EC FP,ILK2S FH,DIE BB,JEN GR,2.26,2.268528
2012-01-31,LCA1 GR,VPK NA,GWI1 GR,AGTA FP,WOM GR,BUE GR,ZC FP,EEX GR,DUE GR,PRC FP,...,EO FP,ML FP,MED GR,BAS1V FH,HEIA NA,ZAR GR,ZOT SM,GND FP,22.88,22.808777
2012-02-29,LCA1 GR,GWI1 GR,VPK NA,DUE GR,WOM GR,ZC FP,AGTA FP,BB FP,WMF GR,ACOMO NA,...,PTER FP,EXE FP,LSS FP,REN NA,ACCEL NA,AKZA NA,DOU GR,H4G GR,20.55,20.503640


In [51]:
df_filtered = pd.concat([df.loc[(df['date'] == date) & (df['equity'].isin(equities.values))] for date, equities in filter1.iterrows()]).set_index(['equity', 'date']).sort_index().reset_index()

In [53]:
select_equities(df_filtered, 'RSI_14D', filter_n_equities[1], df_log_returns, benchmark_rtn)[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,142,143,144,145,146,147,148,149,returns,alpha
2003-03-31,OLE SM,VSM GR,OEL GR,DRW3 GR,PIC BB,KUL GR,BIJ GR,HKSAV FH,ZOT SM,UPL SM,...,GEM IM,COM FP,LSS FP,ANZ GR,NK FP,ECONB BB,PINF IM,NRO FP,0.44,0.465749
2003-04-30,OLE SM,VSM GR,OEL GR,BIJ GR,DRW3 GR,FPE GR,COP GR,PIC BB,UPL SM,GENP FP,...,KESBV FH,KRN GR,SO IM,LEM1S FH,RCH GR,KWS GR,EXE FP,ADS GR,12.53,12.419852
2003-05-30,OLE SM,COP GR,OEL GR,VSM GR,BIJ GR,DRW3 GR,UPL SM,FPE GR,SIS IM,ZOT SM,...,KESBV FH,RMR1V FH,VRAP FP,TESB BB,BOI FP,RAA GR,WMF GR,RBT FP,8.29,8.251772
2003-06-30,OLE SM,OEL GR,BIJ GR,VSM GR,COP GR,KUL GR,DRW3 GR,SIS IM,ZIL2 GR,FPE GR,...,NEO FP,ACOMO NA,VAN BB,SO IM,UNR1V FH,FAE SM,IMA IM,THEP FP,7.65,7.602080
2003-07-31,OLE SM,OEL GR,FPE GR,KUL GR,VSM GR,BIJ GR,SIS IM,DRW3 GR,COBH BB,ZOT SM,...,EVS BB,ES FP,IDR SM,CDI FP,ZC FP,ALPHY FP,VK FP,SAMS FP,9.91,9.863816
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2011-11-30,LCA1 GR,EEX GR,VPK NA,BUE GR,GWI1 GR,MCM SM,WET GR,SGL GR,KIN BB,UNA NA,...,VIB3 GR,CDI FP,BAS GR,MCL IM,SIM GR,PVA SM,BUR FP,ABE1 GR,0.84,0.895321
2011-12-30,LCA1 GR,VPK NA,GWI1 GR,UNA NA,ZC FP,PRC FP,AGTA FP,BUE GR,EEX GR,SZU GR,...,AEI GR,SSH GR,HUH1V FH,SBT FP,OR FP,BC8 GR,UMI BB,WRT1V FH,3.16,3.168528
2012-01-31,LCA1 GR,VPK NA,GWI1 GR,AGTA FP,WOM GR,BUE GR,ZC FP,EEX GR,DUE GR,PRC FP,...,MELE BB,BEI GR,ASU1V FH,MXH GR,TNFN FP,COK GR,FII FP,BRE IM,11.14,11.068777
2012-02-29,LCA1 GR,GWI1 GR,VPK NA,DUE GR,WOM GR,ZC FP,AGTA FP,BB FP,WMF GR,ACOMO NA,...,EC FP,ZIL2 GR,ABE1 GR,HEIA NA,JXR FP,MXH GR,CON GR,BVB GR,11.03,10.983640


nota per luca: puoi risolvere il punto 5 usando come peso per ogni fattore nello zscore come il valore della correlazione MEDIA del fattore con gli altri fattori