In [9]:
import bs4 as bs
import datetime as dt
import os
import pandas as pd
import pandas_datareader.data as web
import pickle
import requests
import statsmodels.api as sm
import numpy as np
from sklearn.decomposition import PCA

#Use beautifulSoup to get the S&P 500 tickers and store it in pickle
def save_sp500_tickers():
    """Scrape the S&P 500 ticker symbols from Wikipedia and cache them.

    The symbols are taken from the first cell of every data row of the first
    ``wikitable sortable`` table on the page.  The resulting list is written
    to ``sp500tickers.pickle`` in the current working directory.

    Returns:
        list[str]: Ticker symbols with surrounding whitespace removed.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If the expected table is not present in the page.
    """
    # A timeout keeps a stalled connection from hanging the whole run.
    resp = requests.get(
        'http://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
        timeout=30,
    )
    resp.raise_for_status()
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    if table is None:
        raise ValueError('S&P 500 constituents table not found on the page')
    tickers = []
    # The first row is the table header, so it is skipped.
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if not cells:
            # Rows without data cells carry no ticker.
            continue
        # Cell text ends with a newline; strip it so consumers get clean symbols.
        tickers.append(cells[0].text.strip())
    with open("sp500tickers.pickle", "wb") as f:
        pickle.dump(tickers, f)
    return tickers

def get_data_from_yahoo(reload_sp500=False):
    """Download daily Yahoo price history for every cached S&P 500 ticker.

    One CSV per ticker is written under the ``stock_dfs`` prefix; tickers
    whose CSV already exists are reported and not downloaded again.

    Args:
        reload_sp500: When true, refresh the ticker list through
            ``save_sp500_tickers`` instead of reading ``sp500tickers.pickle``.
    """
    # Tickers that are deliberately never downloaded.
    excluded = frozenset((
        'CARR', 'KR', 'LHX', 'LKQ', 'NOC', 'OTIS',
        'UDR', 'UNH', 'VZ', 'VIAC', 'BF',
    ))
    if reload_sp500:
        tickers = save_sp500_tickers()
    else:
        with open("sp500tickers.pickle", "rb") as f:
            tickers = pickle.load(f)
    os.makedirs('stock_dfs', exist_ok=True)
    # Date range of the price history to request.
    start = dt.datetime(2012, 1, 1)
    end = dt.datetime(2020, 1, 1)
    for ticker in tickers:
        # Keep only the part before the first '.' (e.g. 'BRK.B' -> 'BRK') and
        # drop the newline that the scraped symbols may carry.
        ticker = ticker.split(".")[0].rstrip("\n")
        if ticker in excluded:
            continue
        # NOTE: the backslash separator is kept exactly as data_Cleaning reads
        # it; a raw string avoids the invalid '\{' escape sequence.
        csv_path = r'stock_dfs\{}.csv'.format(ticker)
        if os.path.exists(csv_path):
            print('Already have {}'.format(ticker))
            continue
        df = web.DataReader(ticker, start=start, end=end, data_source='yahoo')
        df.reset_index(inplace=True)
        df.set_index("Date", inplace=True)
        df.to_csv(csv_path)

#Get Adj Close data for each stock and merge the data to one dataframe
def data_Cleaning():
    """Merge the adjusted close prices of all downloaded tickers.

    Reads the ticker list from ``sp500tickers.pickle`` and, for every ticker
    that is not excluded, loads its CSV and keeps only the ``Adj Close``
    column, renamed to the ticker symbol.

    Returns:
        pandas.DataFrame: One column per ticker, indexed by ``Date``.
    """
    # Tickers that are deliberately left out of the data set.
    excluded = frozenset((
        'CARR', 'KR', 'LHX', 'LKQ', 'NOC', 'OTIS',
        'UDR', 'UNH', 'VZ', 'VIAC', 'BF',
    ))
    with open("sp500tickers.pickle", "rb") as f:
        tickers = pickle.load(f)
    dataAll = []
    for ticker in tickers:
        if '.' in ticker:
            # Keep only the part before the first '.', and report it.
            ticker = ticker.split(".")[0]
            print(ticker)
        ticker = ticker.rstrip("\n")
        if ticker in excluded:
            continue
        # NOTE: the backslash separator is kept exactly as get_data_from_yahoo
        # writes it; a raw string avoids the invalid '\{' escape sequence.
        data = pd.read_csv(r'stock_dfs\{}.csv'.format(ticker))
        if ticker == 'B':
            # Diagnostic preview kept from the original implementation.
            print(data.head())
        data = data.set_index('Date')
        data = data[['Adj Close']].dropna()
        data = data.rename({"Adj Close": ticker}, axis='columns')
        dataAll.append(data)
    return pd.concat(dataAll, axis=1)

#Get stock returns from given stock prices
def getReturn(price):
    """Compute simple period-over-period returns from a price series.

    The return of a row is ``(price_t - price_{t-1}) / price_{t-1}``, i.e. the
    change relative to the *previous* price.

    Args:
        price: pandas Series or DataFrame of prices ordered in time.

    Returns:
        Same type as ``price`` without its first row (which has no previous
        price); undefined (NaN) returns are replaced by 0.
    """
    previous = price.shift(1)
    ret = (price - previous) / previous
    # Drop the first row by position so empty input and repeated index
    # labels are handled correctly.
    ret = ret.iloc[1:]
    return ret.fillna(value=0)

#Calculate z-scores given a range of returns
def getZscores(return_in_range, num_factor):
    """Return the most recent residual z-score of every stock in the window.

    The returns are standardised column-wise, projected onto principal
    components, and each stock is regressed on the leading components plus an
    intercept.  The regression residuals are standardised and the last row is
    returned.

    Args:
        return_in_range: DataFrame of returns, one column per stock.
        num_factor: The first ``num_factor - 1`` principal-component scores
            are used as regressors.

    Returns:
        pandas.Series: Standardised residual of the last row, per stock.
    """
    # Infinite returns are treated as missing observations.
    cleaned = return_in_range.replace([np.inf, -np.inf], np.nan)
    # Keep only the columns with at least len(window) - 30 non-missing values.
    min_observations = len(cleaned) - 30
    cleaned = cleaned.dropna(axis=1, thresh=min_observations)

    # Standardise every column: subtract its mean, divide by its std.
    column_mean = cleaned.mean()
    column_std = cleaned.std()
    standardised = (cleaned - column_mean) / column_std
    standardised = pd.DataFrame(standardised).fillna(0)

    # Fit PCA with all components and project the data onto them.
    pca = PCA().fit(standardised)
    loadings = pd.DataFrame(pca.components_)
    scores = np.dot(standardised, loadings.T)
    # Leading component scores plus a column of ones for the intercept.
    factors = sm.add_constant(scores[:, :(num_factor - 1)])

    # One ordinary-least-squares fit per stock; keep only the residuals.
    residuals = {}
    for ticker in standardised.columns:
        residuals[ticker] = sm.OLS(standardised[ticker], factors).fit().resid
    resids = pd.DataFrame(residuals)

    # Standardise the residuals and return the row of the latest day.
    scaled = (resids - resids.mean()) / resids.std()
    return scaled.iloc[-1]

#Use a sliding window to build the array of z-scores
def zscoresArr(stopTime, dataAll, num_factor):
    """Compute z-scores over a window that slides through the price table.

    For every start position ``t`` in ``range(stopTime, len(dataAll) -
    stopTime - 1)`` the rows ``t - 1`` up to (not including) ``t + stopTime``
    are converted to returns and passed to ``getZscores``.

    Args:
        stopTime: Window length parameter (number of rows).
        dataAll: DataFrame of prices, one column per stock.
        num_factor: Forwarded unchanged to ``getZscores``.

    Returns:
        pandas.DataFrame: One column of z-scores per window, joined side by
        side in window order.
    """
    window_starts = range(stopTime, len(dataAll) - stopTime - 1)
    columns = [
        getZscores(
            getReturn(dataAll[t - 1:t + stopTime]), num_factor
        ).to_frame()
        for t in window_starts
    ]
    return pd.concat(columns, axis=1)

def _normalise_weights(raw):
    """Scale *raw* so its entries sum to 1; return all zeros if the sum is 0."""
    total = raw.sum()
    if total == 0:
        # No signal fired: trade nothing instead of producing 0/0 = NaN.
        return pd.Series(0.0, index=raw.index)
    return raw / total


def backtesting(zscores, price, signals):
    """Simulate trading on z-score signals and track cash and holdings.

    For every column label ``i`` of ``zscores`` (used to look up the row
    ``price.loc[i]``):

    * stocks with a z-score above ``ss`` are sold, weighted by z-score so the
      sold weights sum to 1;
    * stocks with a z-score below ``sb`` are bought, weighted the same way;
    * positions in stocks whose z-score lies within ``[cb, cs]`` are closed
      at the current price.

    Args:
        zscores: DataFrame with one column per trading day and one row per
            stock.
        price: DataFrame of prices; rows are looked up by the column labels
            of ``zscores`` and columns are the stocks.
        signals: ``[ss, sb, cs, cb]`` - start-sell, start-buy, close-sell and
            close-buy thresholds.

    Returns:
        tuple: ``(money, stockValue, totalValue)``, three Series indexed like
        ``price``.  ``money`` is the running cash balance, ``stockValue`` the
        value of the open positions and ``totalValue`` their sum.  Rows with
        no z-score column stay 0.
    """
    ss, sb, cs, cb = signals
    # Open position (number of units) per stock; starts flat.
    position = pd.Series(0.0, index=price.columns)
    money = pd.Series(0.0, index=price.index)
    stockValue = pd.Series(0.0, index=price.index)
    totalValue = pd.Series(0.0, index=price.index)
    # Cash is carried forward from day to day so totalValue is a real P&L.
    cash = 0.0
    for i in zscores:
        print(i)
        # Align the scores with the price columns; stocks without a score
        # become NaN and therefore trigger no signal.
        z = zscores[i].reindex(price.columns)
        px = price.loc[i]

        # Open short positions: selling increases cash.
        sell = _normalise_weights(z.where(z > ss, 0.0))
        cash += (sell * px).sum()
        position = position - sell

        # Open long positions: buying decreases cash.
        buy = _normalise_weights(z.where(z < sb, 0.0))
        cash -= (buy * px).sum()
        position = position + buy

        # Close every position whose z-score has reverted into [cb, cs].
        in_band = z.between(cb, cs)
        cash += (position.where(in_band, 0.0) * px).sum()
        position = position.where(~in_band, 0.0)

        money.loc[i] = cash
        stockValue.loc[i] = (position * px).sum()
        totalValue.loc[i] = money.loc[i] + stockValue.loc[i]
    return money, stockValue, totalValue

def find_Sharpe_Ratio(pnl, r):
    """Compute the annualised Sharpe ratio of a value series.

    Args:
        pnl: Series of portfolio values ordered in time.
        r: Reference rate subtracted from the annualised mean return
            (presumably an annual rate - confirm with the caller).

    Returns:
        ``(annualised mean return - r) / annualised standard deviation``,
        using 252 periods per year.
    """
    periods_per_year = 252
    ret = getReturn(pnl)
    mean = ret.mean() * periods_per_year
    std = ret.std() * np.sqrt(periods_per_year)
    return (mean - r) / std

def find_Maximum_Drawdown(pnl):
    """Compute the maximum drawdown of a value series.

    The returns from ``getReturn`` are compounded into a growth curve; the
    drawdown of each point is its relative distance below the running
    maximum of that curve.

    Args:
        pnl: Series of portfolio values ordered in time.

    Returns:
        The most negative drawdown (0 when the curve never falls).
    """
    ret = getReturn(pnl)
    growth = ret.add(1).cumprod()
    drawdown = growth.div(growth.cummax()).sub(1)
    return drawdown.min()

def find_Cumulative_Return(pnl):
    """Return the relative change from the first to the last entry of pnl."""
    first_value = pnl.iloc[0]
    last_value = pnl.iloc[-1]
    return (last_value - first_value) / first_value

def main():
    """Run the full pipeline: load prices, score, backtest, print metrics."""
    stopTime = 252  # length parameter of the sliding window
    num_factor = 10
    ss = 1.25  # start-to-sell signal
    sb = -1.25  # start-to-buy signal
    cs = 0.25  # close-sell signal
    cb = -0.25  # close-buy signal
    r = 0.01  # market rate
    signals = [ss, sb, cs, cb]

    get_data_from_yahoo()
    # Missing prices are replaced by 0 once and shared by both consumers.
    prices = data_Cleaning().fillna(0)
    zscores = zscoresArr(stopTime, prices, num_factor)
    money, stockValue, totalValue = backtesting(zscores, prices, signals)
    # Performance metrics of the backtest
    sharpe_ratio = find_Sharpe_Ratio(totalValue, r)
    maximum_drawdowns = find_Maximum_Drawdown(totalValue)
    cumulative_return = find_Cumulative_Return(totalValue)
    print('Cumulative return:', cumulative_return, 'Sharpe_Ratio:', sharpe_ratio, 'Maximum_drawdowns: ',maximum_drawdowns)
    
# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()    
    
    
#Reference:
#Reading the yahoo data https://pythonprogramming.net/sp500-company-price-data-python-programming-for-finance/
#PCA strategy https://www.quantconnect.com/tutorials/strategy-library/mean-reversion-statistical-arbitrage-strategy-in-stocks
#Calculation results https://github.com/YuxiLiuAsana/Statistical-Arbitrage-Avellaneda-/blob/master/main.py

Already have MMM
Already have ABT
Already have ABBV
Already have ABMD
Already have ACN
Already have ATVI
Already have ADBE
Already have AMD
Already have AAP
Already have AES
Already have AFL
Already have A
Already have APD
Already have AKAM
Already have ALK
Already have ALB
Already have ARE
Already have ALXN
Already have ALGN
Already have ALLE
Already have LNT
Already have ALL
Already have GOOGL
Already have GOOG
Already have MO
Already have AMZN
Already have AMCR
Already have AEE
Already have AAL
Already have AEP
Already have AXP
Already have AIG
Already have AMT
Already have AWK
Already have AMP
Already have ABC
Already have AME
Already have AMGN
Already have APH
Already have ADI
Already have ANSS
Already have ANTM
Already have AON
Already have AOS
Already have APA
Already have AIV
Already have AAPL
Already have AMAT
Already have APTV
Already have ADM
Already have ANET
Already have AJG
Already have AIZ
Already have T
Already have ATO
Already have ADSK
Already have ADP
Already have AZ

BRK
BF
2014-01-03
2014-01-06
2014-01-07
2014-01-08
2014-01-09
2014-01-10
2014-01-13
2014-01-14
2014-01-15
2014-01-16
2014-01-17
2014-01-21
2014-01-22
2014-01-23
2014-01-24
2014-01-27
2014-01-28
2014-01-29
2014-01-30
2014-01-31
2014-02-03
2014-02-04
2014-02-05
2014-02-06
2014-02-07
2014-02-10
2014-02-11
2014-02-12
2014-02-13
2014-02-14
2014-02-18
2014-02-19
2014-02-20
2014-02-21
2014-02-24
2014-02-25
2014-02-26
2014-02-27
2014-02-28
2014-03-03
2014-03-04
2014-03-05
2014-03-06
2014-03-07
2014-03-10
2014-03-11
2014-03-12
2014-03-13
2014-03-14
2014-03-17
2014-03-18
2014-03-19
2014-03-20
2014-03-21
2014-03-24
2014-03-25
2014-03-26
2014-03-27
2014-03-28
2014-03-31
2014-04-01
2014-04-02
2014-04-03
2014-04-04
2014-04-07
2014-04-08
2014-04-09
2014-04-10
2014-04-11
2014-04-14
2014-04-15
2014-04-16
2014-04-17
2014-04-21
2014-04-22
2014-04-23
2014-04-24
2014-04-25
2014-04-28
2014-04-29
2014-04-30
2014-05-01
2014-05-02
2014-05-05
2014-05-06
2014-05-07
2014-05-08
2014-05-09
2014-05-12
2014-05-13
201