In [75]:
import numpy as np
import pandas as pd 
pd.set_option('display.max_columns', None)
from typing import Dict

In [76]:
# Load the raw dataset from a parquet file; the path is relative to the
# directory the notebook is run from.
filepath = '../data/US_latest_data_20220428.parquet' 
df = pd.read_parquet(filepath, engine = 'pyarrow')

In [77]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned frame restricted to the columns used by the rest of the notebook.

    Steps:
      * keep DateYYYYMM, ws_id, Price, Div, mom, mktcap and median_volume_usd;
      * rename the first five to date, asset, price, div and factor;
      * parse `date` from its YYYYMM form into a timestamp;
      * multiply `mktcap` and `median_volume_usd` by 1,000,000
        (presumably the raw values are in millions - TODO confirm);
      * replace missing `price` and `div` values with 0.

    The frame passed in is not modified.
    """
    source_columns = ["DateYYYYMM", "ws_id", "Price", "Div", "mom", "mktcap", "median_volume_usd"]
    new_names = {
        "DateYYYYMM": "date",
        "ws_id": "asset",
        "Price": "price",
        "Div": "div",
        "mom": "factor",
    }
    scale = 1000000

    selected = df[source_columns].rename(columns=new_names)

    # assign() overwrites the existing columns in place, so column order is kept.
    converted = selected.assign(
        date=pd.to_datetime(selected["date"], format="%Y%m"),
        mktcap=selected["mktcap"] * scale,
        median_volume_usd=selected["median_volume_usd"] * scale,
    )

    # Only price and div are filled; gaps in every other column are left as they are.
    return converted.fillna({"price": 0, "div": 0})

In [78]:
# Replace the raw frame with its cleaned version.
# Not idempotent: clean_data selects the raw column names (e.g. "DateYYYYMM"), which no
# longer exist once this cell has run, so re-running it requires reloading `df` first.
df = clean_data(df)

In [79]:
# variables
# `ratio` and `size` carry the same names and values as the defaults of
# filter_eligible_investments: the fraction of the universe kept after the factor
# ranking, and the number of largest assets by market cap forming that universe.
ratio = 0.25
size = 1000  
# Initial valuation and cash used to seed the portfolio history.
starting_assets = 10**8

In [80]:
# testing
# Scratch check of the selection logic on a single month: the month one calendar month
# after the earliest date in the data. It mirrors filter_eligible_investments step by
# step so the intermediate frames can be inspected.
df_time = df[df.date == df.date.min() + pd.DateOffset(months=1)]
df_by_mktcap = df_time.sort_values(by = 'mktcap', ascending=False).reset_index(drop=True)[:size]
# Derive the cut-off from the configured variables instead of hard-coding 250, so this
# check stays in sync when `ratio` or `size` changes.
df_by_factor = df_by_mktcap.sort_values(by = 'factor', ascending=False).reset_index(drop=True)[:int(ratio * size)]

In [81]:
def filter_eligible_investments(df: pd.DataFrame, ratio: float = 0.25, size: int = 1000) -> pd.DataFrame:
    """
    Return the assets that are eligible for investment.

    The `size` assets with the largest `mktcap` form the universe; of those, the
    int(ratio * size) assets with the highest `factor` are returned, ordered by
    descending factor and re-indexed from 0.
    """
    # Size of the final selection; int() truncates any fractional part.
    keep_count = int(ratio * size)

    # This rule describes the *target* portfolio. For the *actual* portfolio, assets that
    # are already owned are allowed as well: an asset outside the top selection can only
    # be held if it is already owned, it cannot be newly bought.

    largest_by_mktcap = (
        df.sort_values(by='mktcap', ascending=False)
        .reset_index(drop=True)
        .iloc[:size]
    )
    ranked_by_factor = (
        largest_by_mktcap.sort_values(by='factor', ascending=False)
        .reset_index(drop=True)
    )
    return ranked_by_factor.iloc[:keep_count]
    

In [131]:
def update_assets(df: pd.DataFrame, valuation: float) -> Dict:
    """
    Return the holdings bought when `valuation` is spread over the assets in `df`.

    Each asset receives a share of `valuation` proportional to its `mktcap` relative to
    the total `mktcap` in `df`; that amount is converted into a whole number of shares
    (rounded down) at the asset's `price`.

    Parameters
    ----------
    df : frame with the columns `asset`, `mktcap` and `price`. It is not modified.
    valuation : total amount available to invest.

    Returns
    -------
    dict mapping asset -> (quantity, price).
    Assets that cannot be sized (price that is missing, zero or negative; missing
    market cap; zero total market cap) get quantity 0 instead of inf/NaN, so that
    quantity * price stays a finite number for the cash calculation.
    """
    prices = df['price']

    # Work on local Series instead of adding columns to the caller's frame.
    weights = df['mktcap'] / df['mktcap'].sum()

    # Blank out unusable prices so the division yields NaN rather than +/-inf.
    usable_prices = prices.where(prices > 0)
    quantities = np.floor(weights * valuation / usable_prices)

    # Anything that could not be sized is simply not bought.
    quantities = quantities.where(np.isfinite(quantities), 0.0)

    return dict(zip(df['asset'], zip(quantities, prices)))

In [141]:
# Portfolio history keyed by month. Each entry holds:
#   'valuation': cash + shares * price + dividends
#   'cash':      amount left uninvested after buying whole shares
#   'div_paid':  dividends received in that month
#   'asset':     {asset: (quantity, price)} as returned by update_assets
# TODO: decide whether 'div_paid' must be stored at all, given that it is only added to
#       the valuation; dropping it would reduce memory use.
# TODO: 'index', 'trade_cost', 'turnover' and 'cash_flow' are not tracked yet
#       (the purpose of 'index' is still an open question).
target_portfolio = {
    df['date'].min(): {
        'valuation': starting_assets,
        'cash': starting_assets,
        'div_paid': 0,
        'asset': {},
    },
}

# Remember the last simulated month explicitly rather than assuming it is exactly one
# calendar month before the current one; a gap in the data would otherwise raise KeyError.
previous_date = df['date'].min()

# NOTE: only the nine months following the first one are simulated ([1:10]).
for date in pd.to_datetime(df['date'].sort_values().unique())[1:10]:
    df_current = df[df['date'] == date]
    previous_entry = target_portfolio[previous_date]
    assets = previous_entry['asset']

    # Build the per-month lookups once instead of filtering the frame (and rebuilding
    # the asset list) for every holding. If an asset has several rows in one month the
    # last row is used.
    div_by_asset = dict(zip(df_current['asset'], df_current['div']))
    price_by_asset = dict(zip(df_current['asset'], df_current['price']))

    # Holdings without a row in the current month are skipped: they pay no dividend and
    # contribute no value.
    # TODO: decide whether the value of an asset that is no longer traded should be
    #       written off (current behaviour) or credited as cash.
    held = [
        (asset_name, quantity_price[0])
        for asset_name, quantity_price in assets.items()
        if asset_name in price_by_asset
    ]

    # update dividends paid
    div_paid = sum(quantity * div_by_asset[asset_name] for asset_name, quantity in held)

    # update valuation
    current_asset_value = sum(price_by_asset[asset_name] * quantity for asset_name, quantity in held)
    valuation = current_asset_value + div_paid + previous_entry['cash']

    # subset to eligible assets, honouring the configured `ratio` and `size`
    df_current = filter_eligible_investments(df_current, ratio=ratio, size=size)

    # rebalance into the preferred assets
    new_assets = update_assets(df_current, valuation)

    # whatever could not be spent on whole shares stays as cash
    invested = sum(quantity * price for quantity, price in new_assets.values())

    target_portfolio[date] = {
        'div_paid': div_paid,
        'valuation': valuation,
        'asset': new_assets,
        'cash': valuation - invested,
    }
    previous_date = date

    

In [148]:
# Display the portfolio entry stored under the timestamp parsed from "197206" (June 1972).
target_portfolio[pd.to_datetime("197206", format = "%Y%m")]

{'div_paid': 96543.1176270224,
 'valuation': 108755492.24590212,
 'asset': {"b'29145'": (25462.0, 1.34728047538017),
  "b'41419'": (280172.0, 2.01486167165388),
  "b'51086'": (597662.0, 1.34050293016485),
  "b'48063'": (71762.0, 1.02369568116771),
  "b'27107'": (51972.0, 1.25742656387193),
  "b'47220'": (23464.0, 1.36718829034366),
  "b'46922'": (223004.0, 1.44088737144685),
  "b'51449'": (54205.0, 1.0586122790985),
  "b'39773'": (54673.0, 1.21922055979941),
  "b'48880'": (87489.0, 1.56330159127962),
  "b'18091'": (107256.0, 2.7368410522017),
  "b'52097'": (67647.0, 0.944444),
  "b'43318'": (197336.0, 1.34328315576567),
  "b'50825'": (104282.0, 1.50906289941404),
  "b'47781'": (67630.0, 1.43722545667581),
  "b'47714'": (27515.0, 1.47698925748577),
  "b'21434'": (160399.0, 1.06992952802935),
  "b'49832'": (47594.0, 1.23474187231008),
  "b'26833'": (36736.0, 1.58772042528614),
  "b'42892'": (694824.0, 1.34848570940416),
  "b'50905'": (44652.0, 1.25233677312564),
  "b'51051'": (37701.0, 1