# Implementing a Statistical Arbitrage Strategy

In [1]:
import pandas as pd
from datetime import datetime, timedelta
import sklearn
import numpy as np
from Utility import Utility

Read in and clean the dataset

In [2]:
# Load the token price table and the largest-cap universe table.
# For each file: read the CSV, parse the 'startTime' column into datetimes,
# and make that column the index. Chaining returns a new frame at each step,
# so nothing is mutated in place.
tokens_price = (
    pd.read_csv('coin_all_prices_full.csv')
    .assign(startTime=lambda frame: pd.to_datetime(frame['startTime']))
    .set_index('startTime')
)
tokens_largest_cap = (
    pd.read_csv('coin_universe_150K_40.csv')
    .assign(startTime=lambda frame: pd.to_datetime(frame['startTime']))
    .set_index('startTime')
)


Pick a start time and a window length

In [3]:
# Window size handed to Utility.select_window.
# NOTE(review): presumably a number of hourly rows — confirm in Utility.
M = 240
# Timezone-aware start of the window (UTC offset +00:00), chosen in early 2021.
start_time = datetime.strptime('2021-03-08 05:00:00+00:00', '%Y-%m-%d %H:%M:%S%z')

# Cut the same window out of both tables: prices first, then the universe.
tokens_price_window, tokens_largest_cap_window = (
    Utility.select_window(table, M, start_time)
    for table in (tokens_price, tokens_largest_cap)
)

Locate common tokens

In [4]:
# Ask the project helper for the token symbols to use within the selected window.
# NOTE(review): presumably these are the tokens that stay in the largest-cap
# universe across the whole M-period window — confirm against Utility.find_tokens.
common_tokens = Utility.find_tokens(tokens_largest_cap_window, M)
# Bare last expression so the notebook displays the resulting list of symbols.
common_tokens

['BTC',
 'SUSHI',
 'BADGER',
 'ALPHA',
 'TOMO',
 'ASD',
 'BNB',
 '1INCH',
 'RSR',
 'RAY',
 'MATIC',
 'PERP',
 'CHZ',
 'YFI',
 'SXP',
 'CEL',
 'AAVE',
 'DOGE',
 'XRP',
 'UNI',
 'LINK',
 'LTC',
 'BCH',
 'SRM',
 'TRX',
 'SOL',
 'FTT',
 'BAND',
 'MKR',
 'SNX',
 'LINA',
 'BNT',
 'GRT',
 'FTM']

In [5]:
# Keep only the price columns of the common tokens, then carry each token's
# last observed price forward down the rows to fill gaps.
# `.ffill()` replaces `fillna(method='ffill', inplace=True)`: the `method=`
# argument is deprecated in recent pandas, and filling in place on a frame
# that was just produced by column selection can raise SettingWithCopyWarning.
# Chaining returns a fresh frame, so the cell is safe to re-run.
tokens_price_window = tokens_price_window[common_tokens].ffill()
# Preview the first rows of the cleaned window.
tokens_price_window.head()

Unnamed: 0_level_0,BTC,SUSHI,BADGER,ALPHA,TOMO,ASD,BNB,1INCH,RSR,RAY,...,TRX,SOL,FTT,BAND,MKR,SNX,LINA,BNT,GRT,FTM
startTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-03-08 05:00:00+00:00,50460,17.4,49.205,1.63435,2.47415,0.74042,237.069,3.9312,0.073,8.4512,...,0.051125,13.4825,30.511,13.292,2204.5,21.527,0.07441,8.389,1.85045,0.4404
2021-03-08 06:00:00+00:00,50797,17.5227,50.025,1.65935,2.4901,0.73912,239.902,4.051,0.07163,8.7161,...,0.051537,13.53,31.082,13.615,2229.5,21.756,0.07472,8.534,1.874,0.4489
2021-03-08 07:00:00+00:00,50306,17.2101,49.955,1.61905,2.48835,0.73927,236.233,3.9524,0.06933,8.2373,...,0.051182,13.425,30.701,13.52,2191.5,21.4,0.073545,8.384,1.84175,0.4407
2021-03-08 08:00:00+00:00,49777,17.0,48.215,1.57635,2.37995,0.74007,232.502,3.9316,0.068825,8.2099,...,0.050715,13.32,30.348,13.331,2155.5,21.1535,0.07333,8.239,1.839,0.43125
2021-03-08 09:00:00+00:00,49893,16.9939,48.0,1.58,2.4275,0.73531,231.794,3.93,0.070155,8.128,...,0.0508,13.32,30.229,13.662,2123.0,20.92,0.073195,8.192,1.8224,0.4328


In [6]:
# Period-over-period percentage change of every token's price, keeping only
# the rows where all tokens have a return (the first row never does, since it
# has no previous price to compare against).
hourly_returns = tokens_price_window.pct_change().dropna()

### Compute factor returns of the two risk factors at time t

In [7]:
# Run the project's PCA helper on the return matrix, keeping two components.
# It hands back three objects, unpacked here by position.
(
    principal_components,
    explained_variance,
    eigenportfolios,
) = Utility.principal_component_analysis(hourly_returns, n_components=2)

# Turn the token returns into factor returns using the eigenportfolios.
factors_return = Utility.calculate_factor_returns(hourly_returns, eigenportfolios)

# Display the resulting factor-return table (columns PC1 and PC2).
factors_return

Unnamed: 0_level_0,PC1,PC2
startTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-03-08 06:00:00+00:00,-0.067624,-0.018629
2021-03-08 07:00:00+00:00,0.099356,-0.008709
2021-03-08 08:00:00+00:00,0.073174,0.013674
2021-03-08 09:00:00+00:00,-0.014663,0.047207
2021-03-08 10:00:00+00:00,0.026750,-0.001947
...,...,...
2021-03-18 00:00:00+00:00,-0.027077,0.018055
2021-03-18 01:00:00+00:00,0.004868,-0.020612
2021-03-18 02:00:00+00:00,-0.030411,0.015212
2021-03-18 03:00:00+00:00,-0.000274,0.028827
