In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from fake_useragent import UserAgent

In [2]:
def get_wayback_snapshots(url, timeout=30):
    """List the Wayback Machine capture timestamps available for ``url``.

    Queries the Wayback CDX search endpoint, asking only for the
    ``timestamp`` field in JSON form.

    Parameters
    ----------
    url : str
        The page whose captures should be listed.
    timeout : float, optional
        Seconds to wait for the CDX server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    list of str
        The first field of every row after the first one in the JSON
        response (the first row is skipped).

    Raises
    ------
    requests.HTTPError
        If the CDX server answers with an error status.
    """
    # Pass the query through `params` so the target URL is percent-encoded;
    # interpolating it into the query string breaks as soon as `url` itself
    # contains characters such as `&`, `?` or `#`.
    response = requests.get(
        'https://web.archive.org/cdx/search/cdx',
        params={'url': url, 'output': 'json', 'fl': 'timestamp'},
        timeout=timeout,
    )
    response.raise_for_status()

    data = response.json()
    # Skip the first row of the payload; every remaining row holds a single
    # requested field, the timestamp.
    return [item[0] for item in data[1:]]

In [3]:
# Fetch every capture timestamp for the S&P 500 page, parse each one, and keep
# (datetime, date) pairs for captures made in 2017 or later.
timestamps = get_wayback_snapshots('https://www.slickcharts.com/sp500')
timestamps = [
    (moment, moment.date())
    for moment in map(lambda raw: datetime.strptime(raw, "%Y%m%d%H%M%S"), timestamps)
    if moment.year >= 2017
]

#### Removing multiple snapshots captured on the same day

In [5]:
# Map each calendar date to a single capture datetime; when several captures
# share a date, the one listed last overwrites the earlier ones.
aux = {day: moment for moment, day in timestamps}

In [6]:
# Convert the surviving captures back to 14-digit timestamp strings, in
# ascending chronological order.
timestamps = [moment.strftime("%Y%m%d%H%M%S") for moment in sorted(aux.values())]

In [7]:
# Display the deduplicated capture timestamps (sorted, formatted as YYYYMMDDHHMMSS strings).
timestamps

['20170327190927',
 '20170516200708',
 '20170529105230',
 '20170616192824',
 '20170703231238',
 '20170727115547',
 '20170810182251',
 '20170824162203',
 '20171005014857',
 '20171027233029',
 '20171028145733',
 '20171029012519',
 '20180102152401',
 '20180113053623',
 '20180212075956',
 '20180328201631',
 '20180404153948',
 '20180413112314',
 '20180418172200',
 '20180425172358',
 '20180502212521',
 '20180509132553',
 '20180517022818',
 '20180524161030',
 '20180531122627',
 '20180617192846',
 '20180620151249',
 '20180627110157',
 '20180703193826',
 '20180704135645',
 '20180725074127',
 '20180727171258',
 '20180729222141',
 '20180801131346',
 '20180804191850',
 '20180808125551',
 '20180815142426',
 '20180822154601',
 '20180826021353',
 '20180829141903',
 '20180905152426',
 '20180908180244',
 '20180912122645',
 '20180919094915',
 '20180927082645',
 '20181004114805',
 '20181009030139',
 '20181010104324',
 '20181017111658',
 '20181018221627',
 '20181019112135',
 '20181024113700',
 '2018102819

In [11]:
def spx_comp_asof(asof, timeout=30):
    """Scrape tickers and weights from one archived copy of the S&P 500 page.

    Downloads the Wayback Machine copy of
    ``https://www.slickcharts.com/sp500`` addressed by ``asof`` and reads the
    first ``<table class="table">`` found in it.

    Parameters
    ----------
    asof : str
        Capture timestamp in ``YYYYMMDDHHMMSS`` format.
    timeout : float, optional
        Seconds to wait for the archive server before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns ``ticker`` (the page's ``Symbol`` column), ``weight`` (the
        page's ``Weight`` column converted to float and divided by 100) and
        ``dt`` (the date part of ``asof``).

    Raises
    ------
    ValueError
        If ``asof`` does not match the timestamp format, if the page holds no
        matching table, or if a weight cell cannot be converted to float.
    requests.HTTPError
        If the archive answers with an error status.
    KeyError
        If the table has no ``Symbol`` or ``Weight`` column.
    """
    # Parse the timestamp up front so a malformed `asof` fails before any
    # network round trip is made.
    snapshot_date = datetime.strptime(asof, "%Y%m%d%H%M%S").date()

    url = f'https://web.archive.org/web/{asof}/https://www.slickcharts.com/sp500'
    response = requests.get(url, timeout=timeout)
    # Without this check an error page would be parsed as if it were the
    # snapshot and fail later with an unrelated-looking error.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'class': 'table'})
    if table is None:
        raise ValueError(f'No constituents table found in snapshot {asof}')

    header = [th.text.strip() for th in table.find_all('th')]
    # The first <tr> is skipped; the remaining rows supply the cell text.
    rows = [
        [td.text.strip() for td in row.find_all('td')]
        for row in table.find_all('tr')[1:]
    ]

    # Build the result in one chain of copies so no column is ever assigned
    # onto a slice of another frame (which triggers SettingWithCopyWarning).
    return (
        pd.DataFrame(rows, columns=header)[['Symbol', 'Weight']]
        .rename(columns={'Symbol': 'ticker', 'Weight': 'weight'})
        .assign(
            weight=lambda frame: frame['weight'].astype(float) / 100,
            dt=snapshot_date,
        )
    )

In [13]:
def historical_comp(timestamps):
    """Collect the index composition for every snapshot in ``timestamps``.

    Each timestamp is passed to ``spx_comp_asof``. A snapshot that raises is
    reported on stdout and skipped, so one bad capture does not abort the
    whole download.

    Parameters
    ----------
    timestamps : iterable of str
        Capture timestamps accepted by ``spx_comp_asof``.

    Returns
    -------
    pandas.DataFrame
        All successfully scraped snapshots stacked into one frame with a
        fresh 0..n-1 index. When nothing could be collected (empty input or
        every snapshot failed) an empty frame with the columns ``ticker``,
        ``weight`` and ``dt`` is returned.
    """
    dfs = []
    for ts in timestamps:
        print(f'Getting {ts}')
        try:
            dfs.append(spx_comp_asof(ts))
        except Exception as e:
            # Deliberate best-effort: log the failure and move on.
            print(f'Error getting {ts}: {e}')

    if not dfs:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame with the expected columns instead.
        return pd.DataFrame(columns=['ticker', 'weight', 'dt'])

    return pd.concat(dfs, ignore_index=True)

In [14]:
# Download every snapshot (slow: one HTTP request per timestamp).
df = historical_comp(timestamps)
# Drop rows with an empty ticker, then reshape to one row per date and one
# column per ticker; a ticker missing from a snapshot gets weight 0.
df = (
    df.loc[df['ticker'] != '']
    .reset_index(drop=True)
    .pivot(index='dt', columns='ticker', values='weight')
    .fillna(0)
)

Getting 20170327190927
Getting 20170516200708
Getting 20170529105230
Getting 20170616192824
Getting 20170703231238
Getting 20170727115547
Getting 20170810182251
Getting 20170824162203
Getting 20171005014857
Getting 20171027233029
Getting 20171028145733
Getting 20171029012519
Getting 20180102152401
Getting 20180113053623
Getting 20180212075956
Getting 20180328201631
Getting 20180404153948
Getting 20180413112314
Getting 20180418172200
Getting 20180425172358
Getting 20180502212521
Getting 20180509132553
Getting 20180517022818
Getting 20180524161030
Getting 20180531122627
Getting 20180617192846
Getting 20180620151249
Getting 20180627110157
Getting 20180703193826
Getting 20180704135645
Getting 20180725074127
Getting 20180727171258
Getting 20180729222141
Getting 20180801131346
Getting 20180804191850
Getting 20180808125551
Getting 20180815142426
Getting 20180822154601
Getting 20180826021353
Getting 20180829141903
Getting 20180905152426
Getting 20180908180244
Getting 20180912122645
Getting 201

In [15]:
# Display the pivoted weights table: one row per snapshot date (`dt`), one column per ticker.
df

ticker,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACGL,ACN,...,XLNX,XOM,XRAY,XRX,XYL,YUM,ZBH,ZBRA,ZION,ZTS
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-06-17,0.000896,0.000781,0.000405,0.040851,0.006637,0.000650,0.000784,0.004624,0.000000,0.004231,...,0.000757,0.014631,0.000436,0.000247,0.000540,0.001183,0.000984,0.000000,0.000469,0.001816
2018-06-20,0.000865,0.000738,0.000437,0.038937,0.006337,0.000646,0.000788,0.004661,0.000000,0.004478,...,0.000741,0.014573,0.000440,0.000246,0.000522,0.001117,0.000965,0.000000,0.000476,0.001810
2018-06-27,0.000847,0.000717,0.000441,0.038838,0.006117,0.000657,0.000759,0.004620,0.000000,0.004354,...,0.000720,0.014640,0.000445,0.000232,0.000511,0.001102,0.000971,0.000000,0.000473,0.001786
2018-07-03,0.000848,0.000691,0.000431,0.039764,0.006100,0.000609,0.000749,0.004621,0.000000,0.004535,...,0.000720,0.014958,0.000436,0.000219,0.000525,0.001091,0.000977,0.000000,0.000457,0.001800
2018-07-04,0.000849,0.000683,0.000434,0.039266,0.006163,0.000619,0.000747,0.004629,0.000000,0.004560,...,0.000712,0.015121,0.000443,0.000221,0.000524,0.001093,0.000983,0.000000,0.000454,0.001818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-09,0.000963,0.000276,0.000110,0.073877,0.006746,0.000784,0.000000,0.004953,0.000735,0.005474,...,0.000000,0.012490,0.000237,0.000000,0.000713,0.001041,0.000779,0.000400,0.000132,0.002127
2023-07-07,0.000949,0.000315,0.000114,0.076962,0.006579,0.000843,0.000000,0.005066,0.000740,0.005278,...,0.000000,0.011293,0.000225,0.000000,0.000707,0.001020,0.000802,0.000407,0.000113,0.002125
2023-07-19,0.000927,0.000319,0.000110,0.075263,0.006339,0.000817,0.000000,0.004894,0.000784,0.005351,...,0.000000,0.010721,0.000229,0.000000,0.000693,0.000988,0.000774,0.000423,0.000128,0.002089
2023-07-20,0.000952,0.000317,0.000114,0.075616,0.006375,0.000813,0.000000,0.004898,0.000776,0.005278,...,0.000000,0.010770,0.000229,0.000000,0.000695,0.001001,0.000772,0.000429,0.000135,0.002077


In [16]:
# Persist the pivoted weights table to a Parquet file under the relative `data/` folder.
# NOTE(review): the `data/` directory presumably has to exist before this runs — confirm.
df.to_parquet('data/spx_comp.parquet')