In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
import bs4 as bs
import requests
plt.style.use('fivethirtyeight')

In [None]:
# You may need to install yfinance first (an alternative to the pandas_datareader library)
# %pip install yfinance

## Get tickers for S&P 500 constituents

In [None]:
# Scrape the S&P 500 tickers
def save_sp500_tickers():
    """Scrape ticker symbols from Wikipedia's "List of S&P 500 companies" page.

    Fetches the page, locates the first table whose class attribute is
    ``wikitable sortable`` and reads the text of the first ``<td>`` cell of
    every row after the header row.

    Returns:
        list[str]: The symbols with surrounding whitespace removed. Symbols
        containing a '.' are skipped, exactly as before.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the expected table is not present in the page.
    """
    resp = requests.get(
        'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
        timeout=30,  # do not let a stalled connection hang the notebook
    )
    # Fail loudly on an error page instead of trying to parse it.
    resp.raise_for_status()

    soup = bs.BeautifulSoup(resp.text, 'html')
    table = soup.find('table', {'class': 'wikitable sortable'})
    if table is None:
        raise ValueError("could not find the 'wikitable sortable' table in the page")

    tickers = []
    for row in table.find_all('tr')[1:]:  # [1:] skips the header row
        cells = row.find_all('td')
        if not cells:
            # Rows without data cells carry no ticker.
            continue
        ticker = cells[0].text.strip()
        if '.' not in ticker:
            tickers.append(ticker)

    return tickers

# Scrape the constituents once; later cells write this list to disk and pass it to yf.download
tickers = save_sp500_tickers()

In [None]:
# Show the scraped ticker list as the cell output
tickers

In [None]:
# Save to file - Option A: explicit loop, one ticker symbol per line
with open('stock_symbol_list_SP500_A.txt', 'w') as filehandle:
    for listitem in tickers:
        # Format the current symbol; formatting `tickers` here would write the
        # repr of the whole list once per iteration.
        filehandle.write('%s\n' % listitem)

In [None]:
# Save to file - Option B: build every line first, then hand them to writelines in one call
with open('stock_symbol_list_SP500_B.txt', 'w') as filehandle:
    filehandle.writelines([f"{symbol}\n" for symbol in tickers])

## Get data from Yahoo Finance

In [None]:
# Download daily data for every scraped ticker and keep the adjusted close.
# auto_adjust=False is passed explicitly: when yfinance auto-adjusts prices
# (the default in newer releases) the result has no 'Adj Close' column and
# the lookup below raises KeyError.
prices = yf.download(tickers, start='2018-01-01', auto_adjust=False)['Adj Close']

In [None]:
# Render floats with two decimals, then preview the first three rows of prices
pd.options.display.precision = 2
prices.head(3)

In [None]:
# Daily log returns: first difference of each stock's log price
ret = np.log(prices).diff()

In [None]:
# One line per stock, legend switched off; the figure is also written to disk
ret.plot(
    title='Daily Returns of the Stocks in the S&P500',
    figsize=(10, 6),
    grid=True,
    legend=False,
)
plt.tight_layout()
plt.savefig('SP500returns.png')

In [None]:
# Exponentiate the running sum of log returns to get each stock's growth factor
np.exp(ret.cumsum()).plot(
    title='Cumulative Returns of the Stocks in the S&P500',
    figsize=(10, 6),
    grid=True,
    legend=False,
)
plt.tight_layout()
plt.savefig('SP500Creturns.png')

## Perform PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a one-component PCA on the return matrix, with missing returns replaced by 0
pca = PCA(n_components=1).fit(ret.fillna(0))

In [None]:
# Loadings of the first principal component, labelled with the return columns
pc1 = pd.Series(pca.components_[0], index=ret.columns)

pc1.plot(
    title='First Principal Component of the S&P500',
    figsize=(10, 6),
    grid=True,
    xticks=[],  # no x tick labels
)
plt.tight_layout()
#plt.savefig('PC1.png')

In [None]:
# Normalise the absolute loadings so the portfolio weights sum to one
weights = pc1.abs() / sum(pc1.abs())
# Weighted sum of the stocks' daily log returns, one value per date
myret = ret.mul(weights, axis='columns').sum(axis=1)
np.exp(myret.cumsum()).plot();

## Get SP500 index data and compare performance

In [None]:
# Download SPY over the same period as the constituents and keep the adjusted close.
# auto_adjust=False is passed explicitly: when yfinance auto-adjusts prices
# (the default in newer releases) the result has no 'Adj Close' column and
# the lookup below raises KeyError.
SP500index = yf.download(['SPY'], start='2018-01-01', auto_adjust=False)['Adj Close']

In [None]:
# Put the PCA-weighted portfolio returns and the SPY log returns side by side.
# `axis` is passed by keyword: pandas 2.x accepts only `objs` positionally in
# pd.concat, so the former `pd.concat([...], 1)` raises TypeError there.
ret_df = pd.concat([myret, SP500index.apply(np.log).diff(1)], axis=1)
ret_df.columns = ["PCA Portfolio", "S&P500"]

# Drop dates missing in either series, then plot the cumulative growth of
# each column in its own subplot.
ret_df.dropna().cumsum().apply(np.exp).plot(subplots=True, 
                                            figsize=(10,6), 
                                            grid=True, 
                                            linewidth=3);
plt.tight_layout()
#plt.savefig('ComparePerformance.png')

In [None]:
# Two stacked bar charts: the ten smallest and the ten largest PC1 loadings
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(10, 6))
for panel, loadings, colour, heading in zip(
    ax,
    (pc1.nsmallest(10), pc1.nlargest(10)),
    ('green', 'blue'),
    ('Stocks with Most Negative PCA Weights',
     'Stocks with Least Negative PCA Weights'),
):
    loadings.plot.bar(ax=panel, color=colour, grid=True, title=heading)
plt.tight_layout()

In [None]:
# LONG ONLY PORTFOLIO
# Equally weighted portfolio of the 10 stocks with the largest PC1 loadings
myret = ret[pc1.nlargest(10).index].mean(1)

# Draw both curves on one explicitly created Axes. Relying on the implicit
# "current axes" only overlays the curves when SP500index is a Series; a
# DataFrame opens a second figure and the two-entry legend no longer matches.
fig, ax = plt.subplots(figsize=(10, 6))
myret.cumsum().apply(np.exp).plot(ax=ax,
                                  grid=True,
                                  linewidth=3,
                                  title='PCA Portfolio vs. S&P500')

SP500index['2018':].apply(np.log).diff(1).cumsum().apply(np.exp).plot(ax=ax, grid=True, linewidth=3)
ax.legend(['PCA Selection', 'S&P500'])

plt.tight_layout()

In [None]:
# Position signs: ten short legs (-1) followed by ten long legs (+1)
ws = [sign for sign in (-1, 1) for _leg in range(10)]
ws

In [None]:
#LONG-SHORT PORTFOLIO
# Sign -1 for the 10 stocks with the smallest PC1 loadings, +1 for the 10
# with the largest; the portfolio return is the mean of the signed returns.
ws = [-1,]*10+[1,]*10
myret = (ret[list(pc1.nsmallest(10).index)+list(pc1.nlargest(10).index)]*ws).mean(1)

# Draw both curves on one explicitly created Axes. Relying on the implicit
# "current axes" only overlays the curves when SP500index is a Series; a
# DataFrame opens a second figure and the two-entry legend no longer matches.
fig, ax = plt.subplots(figsize=(10, 6))
myret.cumsum().apply(np.exp).plot(ax=ax,
                                  grid=True,
                                  linewidth=3,
                                  title='PCA Portfolio vs. S&P500')

SP500index['2018':].apply(np.log).diff(1).cumsum().apply(np.exp).plot(ax=ax, grid=True, linewidth=3)
ax.legend(['PCA Selection (Long-Short)', 'S&P500'])

plt.tight_layout()

## Plot all weights

In [None]:
# Express every PC1-based weight as a percentage and chart all of them as bars
weights_df = pd.DataFrame({'Stock Weights (%)': weights * 100}, index=ret.columns)
weights_df.plot.bar(figsize=(18, 10), fontsize=8, rot=90, title='PCA portfolio weights');