In [2]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
import warnings

# NOTE(review): this installs a global "ignore" filter, so any warning raised by
# the cells below (deprecations, chained-assignment warnings, ...) is not shown.
warnings.filterwarnings("ignore")

In [3]:
# Load the panel and key it by (equity, date) so equities can be grouped on index level 0.
df = pd.read_csv('data/data.csv').set_index(['equity', 'date'])

# Remove every equity that has at least one row in which all columns are missing.
rows_of_blank_equities = df.groupby(level=0).filter(
    lambda equity: equity.isna().all(axis=1).any()
)
df = df.drop(rows_of_blank_equities.index)

# Remove every equity whose PX_LAST equals 0 on any date, then turn
# equity/date back into ordinary columns.
rows_of_zero_price_equities = df.groupby(level=0).filter(
    lambda equity: (equity['PX_LAST'] == 0).any()
)
df = df.drop(rows_of_zero_price_equities.index).reset_index()

df_factor_chosen: for every date, the best equities selected according to the chosen factor \
factor: name of the factor column used for the ranking \
number_of_equities: number of equities to select on each date

In [4]:
# Price table: one row per date, one column per equity, cells hold PX_LAST.
df_returns = (
    df[['equity', 'date', 'PX_LAST']]
    .sort_values(by=['equity', 'date'])
    .pivot(index='date', columns='equity', values='PX_LAST')
)

In [5]:
# Log return per equity: log(price on this row / price on the previous row).
# The first row has no previous price, so it is NaN for every equity.
df_log_returns = np.log(df_returns / df_returns.shift(1))

# Benchmark: per-date mean of the log returns across all equity columns,
# ignoring NaNs, after removing dates where every equity is NaN.
benchmark_rtn = df_log_returns.dropna(how='all').mean(axis='columns', skipna=True)

In [6]:
def select_equities(dataframe, factor, number_of_equities, df_rtn, bench_rtn):
    """Select the top equities per date for one factor and score the selection.

    For each distinct ``date`` in ``dataframe`` the rows are ranked by
    ``factor`` in descending order and the first ``number_of_equities``
    values of the ``equity`` column are kept. For every date the returns of
    the selected equities (looked up in ``df_rtn``) are summed, NaNs counting
    as 0, and rounded to 2 decimals. ``alpha`` is that figure minus
    ``bench_rtn`` (aligned on the date index), and the information ratio is
    ``alpha.mean() / alpha.std()``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Long-format frame with ``equity`` and ``date`` columns plus the
        ``factor`` column.
    factor : str
        Name of the column used for the ranking.
    number_of_equities : int
        How many equities to keep per date.
    df_rtn : pandas.DataFrame
        Returns with dates on the index and one column per equity.
    bench_rtn : pandas.Series
        Benchmark return indexed by date.

    Returns
    -------
    tuple
        ``(df_factor_chosen_returns, information_ratio)``: the per-date
        selection (positional columns ``0..k-1``) with ``returns`` and
        ``alpha`` columns, without the earliest date, and the information
        ratio of ``alpha``.

    Raises
    ------
    KeyError
        If a selected date is missing from ``df_rtn``'s index or a selected
        equity is missing from its columns.

    Notes
    -----
    NOTE(review): ``returns`` is a *sum* over the selected equities, while the
    benchmark passed in by this notebook is a cross-sectional *mean*; confirm
    that the scale mismatch in ``alpha`` is intended.
    NOTE(review): equities are ranked on the factor value of the same date
    whose return is measured; confirm this is not a look-ahead.
    """
    # Rank once per date group instead of re-filtering the whole frame for
    # every date. A stable sort keeps tied rows in their original order.
    picks_by_date = {}
    for date, day_rows in dataframe.groupby('date', sort=False):
        ranked = day_rows.sort_values(by=factor, ascending=False, kind='mergesort')
        picks_by_date[date] = ranked['equity'].head(number_of_equities).values

    df_factor_chosen = pd.DataFrame.from_dict(picks_by_date, orient='index').sort_index()

    returns_per_date = []
    for date, picks in df_factor_chosen.iterrows():
        day_rtn = df_rtn.loc[date]
        # from_dict pads a row with nulls when a date has fewer than
        # number_of_equities names; skip the padding instead of looking it
        # up as a ticker (which raised KeyError).
        total = sum(np.nansum(day_rtn[equity]) for equity in picks.dropna())
        returns_per_date.append(round(total, 2))

    df_factor_chosen_returns = df_factor_chosen.copy()
    df_factor_chosen_returns['returns'] = returns_per_date

    # Drop the earliest date (presumably because it has no return to measure;
    # confirm). The explicit copy makes the alpha assignment below write to an
    # independent frame rather than to a slice.
    df_factor_chosen_returns = df_factor_chosen_returns.iloc[1:, :].copy()

    df_factor_chosen_returns['alpha'] = df_factor_chosen_returns['returns'] - bench_rtn
    alpha = df_factor_chosen_returns['alpha']
    information_ratio = alpha.mean() / alpha.std()

    return df_factor_chosen_returns, information_ratio

df_returns: price table with one row per date and one column per equity (values are PX_LAST)

df_log_returns: log return of each equity, computed as log( price(t) / price(t-1) )

df_factor_chosen_returns: df_factor_chosen plus a `returns` column holding, for each date, the sum of the log returns of the equities selected on that date.

In [7]:
# Compute the selected equities and the information ratio for each candidate factor
candidate_factors = [
    'PE_RATIO',
    'EBITDA_MARGIN',
    'PX_TO_BOOK_RATIO',
    'NORMALIZED_ACCRUALS_CF_METHOD',
    'RSI_14D',
    'VOLATILITY_30D',
    'CUR_MKT_CAP',
    'OPERATING_ROIC',
]
# The list is wrapped in another list on purpose: the index entries are then
# 1-tuples, which is why the loop reads the factor name with factor[0].
s_info_ratio = pd.Series(index=[candidate_factors], name='information_ratio')
equities_selected = {}

for factor in s_info_ratio.index:
    factor_name = factor[0]
    df_factor_chosen, information_ratio = select_equities(
        df, factor_name, 10, df_log_returns, benchmark_rtn
    )
    equities_selected[factor_name] = df_factor_chosen
    s_info_ratio.loc[factor_name] = information_ratio

s_info_ratio

PE_RATIO                         0.160940
EBITDA_MARGIN                    0.158318
PX_TO_BOOK_RATIO                 0.408200
NORMALIZED_ACCRUALS_CF_METHOD    0.118331
RSI_14D                          1.943153
VOLATILITY_30D                   0.246033
CUR_MKT_CAP                      0.133747
OPERATING_ROIC                   0.191919
Name: information_ratio, dtype: float64

In [8]:
# Keep the 4 factors with the highest information ratio
ranked_info_ratio = s_info_ratio.sort_values(ascending=False)
factor_to_use = ranked_info_ratio.head(4).index.get_level_values(0).values

In [9]:
# Show the names of the factors retained for the composite score.
factor_to_use

array(['RSI_14D', 'PX_TO_BOOK_RATIO', 'VOLATILITY_30D', 'OPERATING_ROIC'],
      dtype=object)

In [17]:

# Add the aggregated z-score column for the chosen factors.
# Each factor is standardised within its own equity with scipy's zscore, then
# the standardised factors are averaged row by row (NaN factors are skipped).
# transform() returns a frame aligned to df's index whether zscore hands back
# a pandas object or a plain array, so the values land on the right rows
# without relying on df's row order or on a positional `.values` assignment.
chosen_factors = list(factor_to_use)
df['zscore'] = (
    df.groupby('equity')[chosen_factors]
    .transform(zscore)
    .mean(axis=1)
)

In [18]:
# Re-run the selection using the composite z-score as the ranking factor.
zscore_equities, zscore_info_ratio = select_equities(
    dataframe=df,
    factor='zscore',
    number_of_equities=10,
    df_rtn=df_log_returns,
    bench_rtn=benchmark_rtn,
)

In [19]:
# Show the information ratio obtained when ranking by the composite z-score.
zscore_info_ratio

0.8978267783676801