In [1]:
import numpy as np
import pandas as pd

### Parámetros de configuración

In [2]:
# Location of the input parquet file with the daily data.
PARQUET_FILE_PATH = "resources/assignment_parquet.parquet"
# Earliest date (inclusive) for which rebalancing dates are kept.
START_DATE = '2015-01-01'

### Lectura del parquet

In [3]:
# Load the raw input data into memory; every later cell works from this frame.
parquet = pd.read_parquet(path=PARQUET_FILE_PATH)

### Calendario de rebalanceo

Generamos las fechas en las que se hará el rebalanceo: el último día de negociación de cada mes a partir de `START_DATE`.

In [4]:
# Unique trading dates in chronological order, with the calendar year and
# month of each date added as two extra columns.
trading_dates = (
    parquet[['date']]
    .drop_duplicates()
    .sort_values('date')
    .assign(
        year=lambda dates: dates['date'].dt.year,
        month=lambda dates: dates['date'].dt.month,
    )
)

# Show both ends of the calendar, separated by a marker line.
print(trading_dates.head(5), '---', trading_dates.tail(5), sep='\n')

        date  year  month
0 2013-12-02  2013     12
1 2013-12-03  2013     12
2 2013-12-04  2013     12
3 2013-12-05  2013     12
4 2013-12-06  2013     12
---
           date  year  month
3054 2026-01-26  2026      1
3055 2026-01-27  2026      1
3056 2026-01-28  2026      1
3057 2026-01-29  2026      1
3058 2026-01-30  2026      1


In [5]:
# The last trading day of a month is the latest date inside its (year, month)
# group; only month-ends on or after START_DATE are kept as rebalancing dates.
rebalancing_dates = (
    trading_dates
    .groupby(['year', 'month'])['date']
    .max()
    .reset_index()
    .loc[lambda month_ends: month_ends['date'] >= START_DATE]
    .reset_index(drop=True)
)

rebalancing_dates.head(10)

Unnamed: 0,year,month,date
0,2015,1,2015-01-30
1,2015,2,2015-02-27
2,2015,3,2015-03-31
3,2015,4,2015-04-30
4,2015,5,2015-05-29
5,2015,6,2015-06-30
6,2015,7,2015-07-31
7,2015,8,2015-08-31
8,2015,9,2015-09-30
9,2015,10,2015-10-30


## Selección de activos

In [6]:
# Tag every row with its monthly period so later steps can work at month level.
parquet = parquet.assign(year_month=parquet['date'].dt.to_period('M'))
parquet.head()

Unnamed: 0,date,symbol,sector,industry,subsector,in_sp500,open,close,year_month
0,2013-12-02,A,Health Care,Life Sciences Tools & Services,Life Sciences Tools & Services,1,34.522495,34.354626,2013-12
1,2013-12-03,A,Health Care,Life Sciences Tools & Services,Life Sciences Tools & Services,1,34.167389,34.173847,2013-12
2,2013-12-04,A,Health Care,Life Sciences Tools & Services,Life Sciences Tools & Services,1,34.115738,34.593513,2013-12
3,2013-12-05,A,Health Care,Life Sciences Tools & Services,Life Sciences Tools & Services,1,34.445015,34.541862,2013-12
4,2013-12-06,A,Health Care,Life Sciences Tools & Services,Life Sciences Tools & Services,1,34.709728,35.400566,2013-12


In [7]:
# Monthly membership: the maximum of the daily in_sp500 value per symbol and
# month (presumably a 0/1 flag, so 1 means "in the index at some point that
# month" -- confirm against the data dictionary).
in_sp500_by_month = (
    parquet
    .groupby(['symbol', 'year_month'], as_index=False)['in_sp500']
    .max()
)
in_sp500_by_month.head()

Unnamed: 0,symbol,year_month,in_sp500
0,A,2013-12,1
1,A,2014-01,1
2,A,2014-02,1
3,A,2014-03,1
4,A,2014-04,1


In [8]:
def build_universe(membership, rebal_dates, lookback_months=13):
    """Return the eligible symbols for every rebalancing date.

    A symbol is eligible on a rebalancing date when it has a monthly
    membership value of 1 in each of the ``lookback_months`` calendar months
    that precede the month of that date. The month of the rebalancing date
    itself is not part of the window.

    Parameters
    ----------
    membership : pd.DataFrame
        Frame with the columns ``symbol``, ``year_month`` (monthly period)
        and ``in_sp500``. It is expected to hold one row per symbol and
        month, as produced by the monthly membership aggregation.
    rebal_dates : iterable of pd.Timestamp
        Rebalancing dates to evaluate.
    lookback_months : int, default 13
        Length of the look-back window in months; also the number of member
        months a symbol needs in order to be eligible.

    Returns
    -------
    pd.DataFrame
        One row per (rebalancing date, eligible symbol) with the columns
        ``rebal_date`` and ``symbol``. The columns are present even when no
        symbol is eligible.
    """
    # Loop-invariant: only member months can ever count towards eligibility.
    member_months = membership.loc[membership['in_sp500'] == 1, ['symbol', 'year_month']]

    rows = []
    for rebal_date in rebal_dates:
        # Window covers the `lookback_months` months before the current one.
        current_period = rebal_date.to_period('M')
        start_period = current_period - lookback_months
        end_period = current_period - 1

        in_window = (
            (member_months['year_month'] >= start_period) &
            (member_months['year_month'] <= end_period)
        )

        # Number of member months per symbol inside the window.
        months_by_symbol = member_months.loc[in_window].groupby('symbol').size()

        # Eligible: symbols that were members in every month of the window.
        eligible = months_by_symbol[months_by_symbol == lookback_months].index
        rows.extend({'rebal_date': rebal_date, 'symbol': symbol} for symbol in eligible)

    # Explicit columns keep the frame usable even when `rows` is empty.
    return pd.DataFrame(rows, columns=['rebal_date', 'symbol'])


universe = build_universe(in_sp500_by_month, rebalancing_dates['date'])

print("Activos elegibles por fecha de rebalanceo:")
print(universe.groupby('rebal_date')['symbol'].count())

Activos elegibles por fecha de rebalanceo:
rebal_date
2015-01-30    487
2015-02-27    487
2015-03-31    485
2015-04-30    486
2015-05-29    482
             ... 
2025-09-30    488
2025-10-31    490
2025-11-28    487
2025-12-31    487
2026-01-30    488
Name: symbol, Length: 133, dtype: int64


### Retornos

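Generamos un dataframe con los retornos logarítmicos mensuales, $r_t = \ln(P_t / P_{t-1})$, calculados sobre el precio de cierre del último día de negociación de cada mes.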


In [13]:
# Month-end trading dates over the whole history (this list is not filtered by
# START_DATE), so that the month before each rebalancing date is available
# when computing returns.
all_month_ends = trading_dates.groupby(['year', 'month'])['date'].max().reset_index()

# Closing price of every symbol on each month-end date, ordered so that
# consecutive rows of a symbol are consecutive observations in time.
is_month_end = parquet['date'].isin(all_month_ends['date'])
monthly_closes = (
    parquet.loc[is_month_end, ['date', 'symbol', 'close']]
    .sort_values(['symbol', 'date'])
    .copy()
)

# Previous observation of the same symbol (NaN / NaT on its first row).
closes_by_symbol = monthly_closes.groupby('symbol')
previous_close = closes_by_symbol['close'].shift(1)
previous_date = closes_by_symbol['date'].shift(1)

# Calendar months between an observation and the previous one of the symbol.
months_elapsed = (
    (monthly_closes['date'].dt.year - previous_date.dt.year) * 12
    + (monthly_closes['date'].dt.month - previous_date.dt.month)
)

# Monthly log return. If a symbol has no row on some month-end date, the
# previous row is more than one month old and the ratio would be a multi-month
# return; those rows are left as NaN instead of being reported as monthly.
monthly_closes['log_return'] = (
    np.log(monthly_closes['close'] / previous_close)
    .where(months_elapsed == 1)
)

print(f"Shape: {monthly_closes.shape}")
monthly_closes.head(10)

Shape: (103686, 4)


Unnamed: 0,date,symbol,close,log_return
20,2013-12-31,A,37.009304,
41,2014-01-31,A,37.63055,0.016647
60,2014-02-28,A,36.841049,-0.021204
81,2014-03-31,A,36.18745,-0.0179
102,2014-04-30,A,35.055504,-0.03178
123,2014-05-30,A,36.936718,0.052273
144,2014-06-30,A,37.346329,0.011029
166,2014-07-31,A,36.468586,-0.023783
187,2014-08-29,A,37.164276,0.018897
208,2014-09-30,A,37.133404,-0.000831


### Salvado
Guardamos los datos para poder acceder a ellos en notebooks posteriores sin tener que rehacer los cálculos

In [12]:
# Persist the three results without their index so that later notebooks can
# load them directly:
#   1. month-end closes with their monthly log returns
#   2. eligible universe per rebalancing date
#   3. rebalancing dates
outputs = {
    "resources/monthly_closes.parquet": monthly_closes,
    "resources/universe.parquet": universe,
    "resources/rebalancing_dates.parquet": rebalancing_dates,
}
for output_path, frame in outputs.items():
    frame.to_parquet(output_path, index=False)