Imports:

In [1]:
import pandas as pd
import yfinance as yf
import matplotlib as mpl
import requests
import numpy as np
from io import StringIO
import time
import math

General Utility Functions:

In [2]:
def CleanTickers(Tickers: list): # Cleans Tickers to ensure formatting of Class B stocks align with yfinance tickers. 
    for i, Ticker in enumerate(Tickers):
        Tickers[i] = Ticker.replace(".", "-")
    return Tickers

def NormaliseData(Data: pd.DataFrame, DataType: str, NormaliseMagnitude: bool):
    if DataType != "Open" and DataType != "Close":
        print("Invalid DataType")
        return

    NewDataList = {}
    for i in Data[DataType]:
        NewDataList[i] = Data[DataType][i].reset_index(drop = True) #Indexes with numbers for convenience, dates are irrelevant as long as periods are consistent. 
        if NormaliseMagnitude == True:
            NewDataList[i] /= NewDataList[i].loc[0] # normalising magnitudes

    # print(pd.DataFrame(NewDataList)
    return pd.DataFrame(NewDataList)

    

Data Downloading (1 month for most recent components, 15m to ensure sufficient frequency):

In [3]:
#SNP500 CloseData
SNPData = NormaliseData(yf.download("^GSPC", period = "1mo", interval = "15m"), "Open", True)

  SNPData = NormaliseData(yf.download("^GSPC", period = "1mo", interval = "15m"), "Open", True)
[*********************100%***********************]  1 of 1 completed


In [29]:
#Company Data (Most Recent SNP Components), chose the maintained wikipedia page as the source. 
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
html = requests.get(url, headers=headers).text
df_list = pd.read_html(StringIO(html))
df = df_list[0]
tickers = CleanTickers(df['Symbol'].tolist())
# print(tickers)

CompanyDataTemp = NormaliseData(yf.download(tickers, period = "1mo", interval = "15m"), "Open", True)

  CompanyDataTemp = NormaliseData(yf.download(tickers, period = "1mo", interval = "15m"), "Open", True)
[*********************100%***********************]  503 of 503 completed


In [30]:
print(CompanyData)
print(SNPData)

            A      AAPL      ABBV      ABNB       ABT      ACGL       ACN  \
0    1.000000  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000   
1    1.000882  1.001390  0.996075  0.998336  0.997873  0.998988  1.001542   
2    1.001002  1.000874  0.997280  1.001786  0.996112  1.000506  0.999155   
3    0.998737  0.998610  1.002304  0.996104  0.993471  1.003259  0.999134   
4    0.999278  0.998769  1.001895  0.994197  0.993765  1.008766  1.002154   
..        ...       ...       ...       ...       ...       ...       ...   
568  1.186723  1.035032  1.035589  1.040747  0.939701  0.981569  1.048450   
569  1.186483  1.036422  1.031633  1.035958  0.933319  0.981232  1.046127   
570  1.189850  1.038845  1.033815  1.039732  0.933209  0.978085  1.045894   
571  1.192496  1.040037  1.033270  1.040747  0.933612  0.978310  1.046211   
572  1.193458  1.044128  1.037089  1.041802  0.934419  0.978422  1.045873   

         ADBE       ADI       ADM  ...        WY      WYNN       XEL  \
0  

Here I try to use an Inverse Matrix to directly calculate the weights of each equity in the index. Given that the (normalised) index at every time period is a weighted average of its components' prices, if we get n periods of this data we arrive at a simulateous equation. If n = 503, we can construct a square matrix with an inverse (provided the rows are linearly independent). 

In theory this method gives an exact solution but upon completing it, it became clear that it is susceptible to small deviations in data, and given the replacement of NaN values in data becomes a very poor method.


In [51]:
def FindPrevValidValue(Data, n, i, j): # For finding nearest valid values to a NaN point
        if i + n in Data.index:
            if math.isnan(Data.loc[i + n,j]) == True:
                return FindPrevValidValue(Data, n, i + n, j)
            else:
                return Data.loc[i + n,j]
        else:
            return FindPrevValidValue(Data, -n, i, j)

def Generate503Matrix(Data: pd.DataFrame):
    Matrix503 = []
    NaNs = 0
    
    for i in Data.head(503).index: # Loop through every DataFrame one row at a time.
        row = []
        for j in Data.head(503):
            if math.isnan(Data.loc[i,j]) == True: # Check for Nan Value
                NaNs += 1
                row.append((FindPrevValidValue(Data, -1, i, j) + FindPrevValidValue(Data, -1, i, j))/2) #average of nearest 2 (1 if at edge) points.
            else:
                row.append(Data.loc[i,j])
        Matrix503.append(row)
        # print("Row ", i, " done", end="\r")
    # print("NaNs Modified: ", NaNs)
    return np.array(Matrix503)
            
def CleanDataNaN(Data: pd.DataFrame):
    NewData = Data
    for i in Data.index:
        for j in Data:
            if math.isnan(Data.loc[i,j]) == True:
                NewData[j][i] = ((FindPrevValidValue(Data, -1, i, j) + FindPrevValidValue(Data, -1, i, j))/2) #average of nearest 2 (1 if at edge) points.
    return NewData
                

In [52]:
ReturnMatrix = Generate503Matrix(CompanyData)
ReturnMatrixInverse = np.linalg.inv(ReturnMatrix)
#print(ReturnMatrixInverse)

SNPVector = Generate503Matrix(SNPData)
#print(CleanSNPData)
#print(Generate503Matrix(SNPData))

Weights = np.matmul(ReturnMatrixInverse, SNPVector)
print(Weights)

[[-3.88764526e-01]
 [-2.07307231e+00]
 [-9.87452884e-01]
 [ 3.19178885e-01]
 [ 2.07837146e+00]
 [-5.70442194e-01]
 [-6.97210278e-02]
 [ 6.87228532e-01]
 [-5.10434955e-01]
 [-4.35406534e-01]
 [ 3.21168217e+00]
 [-1.82160180e+00]
 [ 3.57680306e+00]
 [-2.48913158e+00]
 [-2.84561528e-01]
 [-1.24521663e-01]
 [-1.31116016e+00]
 [ 2.77741383e-01]
 [ 1.30246847e-01]
 [ 1.34905138e+00]
 [ 1.94229179e-01]
 [-1.26100289e+00]
 [-3.33406757e+00]
 [-1.84097615e+00]
 [-1.19705415e-01]
 [-3.31057939e-01]
 [-1.15615488e-01]
 [ 2.35949125e-01]
 [ 8.85537399e-01]
 [-4.77823870e-01]
 [-1.61706888e+00]
 [ 5.02952085e-01]
 [ 5.15703618e-01]
 [-9.07659719e-01]
 [-2.10546276e-01]
 [ 3.86676081e-01]
 [ 3.93596413e-01]
 [ 1.89205193e+00]
 [-2.49183356e-01]
 [-1.71709932e-01]
 [ 1.99874108e-01]
 [ 1.10831864e+00]
 [-5.20031702e-01]
 [ 2.03495446e+00]
 [-6.40723830e-01]
 [ 1.56699247e-01]
 [ 9.91882448e-01]
 [ 6.26961111e-02]
 [ 2.13817969e-01]
 [-1.02065485e+00]
 [ 1.37241926e+00]
 [-1.11760198e+00]
 [-5.9425301

Clearly, the results are completely inaccurate. If for no other reason, then because many of the weights are negative. We do, however, get weights that sum to 1 (see below).

The poor quality of results from this method is likely due to the small amount of data used to extract such a large amount of weights, the sensitivity of results to small changes in the data, and any distortion created by the way we replace NaN data. Do note that the reason we replace NaN values rather than disregarding the row is because almost every time frame has at least one NaN value in its respective row, so we ignore a huge amount of data by skipping rows with NaN values. Let's try an approach that utilises more data. Let's write some more general functions.

The example before used a 503x503 matrix with the first 503 rows of our company data. We will try to average out the result from many* possible combinations of 503 rows in our data, to use all of it. We'll see if this improves the results.

*Not all, since even with 572 rows (1mo of data at 15m freq.) there are an enormous (~8e^90) amount of combinations.

In [36]:
print(Weights.sum())

1.000000000016076


Warning! The cell below takes a while (~1min) to run

In [63]:
def WeightsFromMockMatrix(ComponentDataFrame: pd.DataFrame, IndexDataFrame: pd.DataFrame):
    ReturnMatrix = Generate503Matrix(ComponentDataFrame)
    ReturnMatrixInverse = np.linalg.inv(ReturnMatrix)
    IndexVector = Generate503Matrix(IndexDataFrame)

    return np.matmul(ReturnMatrixInverse, IndexVector)

def WeightsFromData(ComponentDataFrame: pd.DataFrame, IndexDataFrame: pd.DataFrame):
    length = len(ComponentDataFrame.index)
    if length != len(ComponentDataFrame.index):
        ComponentDataFrame =  ComponentDataFrame.head(min(len(ComponentDataFrame.index), len(ComponentDataFrame.index)))
        IndexDataFrame =  IndexDataFrame.head(min(len(ComponentDataFrame.index), len(ComponentDataFrame.index)))
        length = min(len(ComponentDataFrame.index), len(ComponentDataFrame.index))
    
    WeightList = []
    AverageWeight = []

    for i in range(length - 503 + 1):
        print(i, " loop completed", end="\r")
        WeightList.append(WeightsFromMockMatrix(ComponentDataFrame.tail(length - i).head(503), IndexDataFrame.tail(length - i).head(503)))

    for i in range(503):
        RowSum = 0
        for j in WeightList:
            RowSum += j[i][0]
        AverageWeight.append(RowSum/len(WeightList))
    return AverageWeight

Weights = WeightsFromData(CompanyData, SNPData)
print(np.array(Weights))

[-3.78064616e+00 -1.30955801e+01 -5.62293805e+00 -1.02510471e+01
  2.18173380e+00  2.36789261e+01  7.98464189e+00  5.90075724e+00
  1.23997484e+01 -8.70843082e+00 -1.29812741e+01  1.96067144e+01
  2.61158264e+01 -6.91644886e-01  6.14716007e+00 -1.84852541e+01
 -3.24989920e+00  1.16868176e+01  2.19946799e+01  8.11026133e+00
  3.50048481e-01 -1.69633837e+01 -1.64658338e+00  1.80312900e+01
 -7.69773680e+00 -1.25737229e+01 -4.79115184e+00 -1.17948045e+01
 -2.83516393e+00  6.25200510e+00 -8.50766274e+00 -4.22927737e-01
  5.26892398e+00 -2.62957979e+01 -3.17205894e+00  3.43730215e+00
  1.03298183e+01  1.78073455e+00  9.14244998e+00 -2.00889923e+00
  5.17206385e+00  3.49873624e+00 -2.20095379e+01 -3.72794797e+00
 -5.39790439e+00  3.46346652e+00 -1.13723799e+00 -4.14879717e+00
  6.46361934e+00 -2.44386417e+01  7.63102080e+00  2.07367027e+01
 -3.65823160e+00 -4.43653046e+00 -8.04670589e+00 -1.39961412e+01
  7.31883068e+00  7.25842424e+00  1.31661858e+00  4.92821860e+00
 -9.23616381e+00  1.97023

Evidently, the results are no better. In fact, they are arguably worse. I was hoping that by aggregating over more of the data, the values would converge to their true weights, but this is not the case. This rules out a lack of data use as the reason for the poor results. It seems instead that small discrepancies between the company data and snp data combined with the way we treat NaN entries distorts the data and means that this (clearly very sensitive) method becomes quite useless. We need a more robust way of dealing with errors. 

An alternative and probably more computationally efficient approach is by running an n-dimension (n=503) regression. By doing so, we accept a level of discrepancy between weights + company data and the snp 500 data, but minimise this discrepancy. In doing so, we may get much better results.