In [1]:
# Statistical Arbitrage - Pair Trading Strategy
import pandas as pd
import numpy as np
from binance.client import Client # pip install python-binance
from binance.websockets import BinanceSocketManager
from twisted.internet import reactor
import math
import os.path
import time
from datetime import timedelta, datetime
from dateutil import parser
from tqdm import tqdm_notebook #(Optional, used for progress-bars)
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import coint
from sklearn.model_selection import train_test_split

In [2]:
# Exchange REST client. Credentials are read from environment variables so they
# never appear in the notebook; os.getenv returns None when a variable is unset.
# NOTE(review): the original comment said "binance.us", but Client is created
# with its default arguments here - confirm which exchange endpoint is intended.
binance_client = Client(os.getenv('binance_api'), os.getenv('binance_secret'))

# Length in minutes of every kline interval the notebook lists as valid.
# get_all_binance divides by these values to estimate how many candles are missing,
# so an interval absent from this table raises KeyError there.
binsizes = {
    "1m": 1, "3m": 3, "5m": 5, "15m": 15, "30m": 30,
    "1h": 60, "2h": 120, "4h": 240, "6h": 360, "8h": 480, "12h": 720,
    "1d": 1440, "3d": 4320, "1w": 10080,
    "1M": 43200,  # calendar months vary in length; 30 days is an approximation
}
batch_size = 750

In [None]:
def minutes_of_new_data(symbol, kline_size, data, source):
    """Return the (old, new) timestamps that bound the data still to be downloaded.

    Parameters
    ----------
    symbol : str
        Trading pair, forwarded to ``binance_client.get_klines``.
    kline_size : str
        Kline interval, forwarded to ``binance_client.get_klines``.
    data : sized container with a ``"timestamp"`` column
        Previously stored candles. When non-empty, the last ``"timestamp"``
        value is parsed with ``dateutil`` and used as the start point.
    source : str
        Exchange identifier. Only ``"binance"`` is supported.

    Returns
    -------
    tuple
        ``(old, new)``: ``old`` is the last stored timestamp, or 1 Jan 2017 when
        nothing is stored yet. ``new`` is the first field of the last kline
        returned by ``get_klines``, interpreted as epoch milliseconds.

    Raises
    ------
    ValueError
        If ``source`` is not ``"binance"``. Previously this path fell through to
        the ``return`` with ``new`` unassigned and failed with UnboundLocalError.
    """
    if source != "binance":
        raise ValueError("Unsupported source %r; only 'binance' is supported." % (source,))

    if len(data) > 0:
        old = parser.parse(data["timestamp"].iloc[-1])
    else:
        # Nothing stored yet: start from a fixed early date so everything is fetched.
        old = datetime.strptime('1 Jan 2017', '%d %b %Y')

    latest_kline = binance_client.get_klines(symbol=symbol, interval=kline_size)[-1]
    new = pd.to_datetime(latest_kline[0], unit='ms')
    return old, new

def get_all_binance(symbol, kline_size, save = False):
    """Download candles for ``symbol`` and merge them with any previously saved file.

    The per-symbol file is ``'<symbol>-<kline_size>-data.csv'`` in the current
    working directory. If it exists, only the period between its last timestamp
    and the newest kline is requested; otherwise everything since 1 Jan 2017 is.

    Parameters
    ----------
    symbol : str
        Trading pair to download.
    kline_size : str
        Kline interval; must be a key of ``binsizes``.
    save : bool, default False
        When True, write the merged frame back to the per-symbol CSV.

    Returns
    -------
    pandas.DataFrame
        Candles indexed by ``timestamp`` with columns ``open``, ``high``,
        ``low``, ``close`` and ``symbol``.
    """
    filename = '%s-%s-data.csv' % (symbol, kline_size)
    if os.path.isfile(filename):
        data_df = pd.read_csv(filename)
    else:
        data_df = pd.DataFrame()

    oldest_point, newest_point = minutes_of_new_data(symbol, kline_size, data_df, source = "binance")
    delta_min = (newest_point - oldest_point).total_seconds()/60
    available_data = math.ceil(delta_min/binsizes[kline_size])
    if oldest_point == datetime.strptime('1 Jan 2017', '%d %b %Y'):
        print('Downloading all available %s data for %s. Be patient..!' % (kline_size, symbol))
    else:
        print('Downloading %d minutes of new data available for %s, i.e. %d instances of %s data.' % (delta_min, symbol, available_data, kline_size))

    klines = binance_client.get_historical_klines(symbol, kline_size, oldest_point.strftime("%d %b %Y %H:%M:%S"), newest_point.strftime("%d %b %Y %H:%M:%S"))
    # Only the first five fields of each kline are kept.
    data = pd.DataFrame([line[:5] for line in klines], columns = ['timestamp', 'open', 'high', 'low', 'close'])
    data['timestamp'] = pd.to_datetime(data['timestamp'], unit='ms')
    data['symbol'] = symbol

    if len(data_df) > 0:
        # Timestamps read back from CSV are strings; convert them so the merged
        # index has a single dtype instead of a str/Timestamp mix.
        data_df['timestamp'] = pd.to_datetime(data_df['timestamp'])
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        data_df = pd.concat([data_df, data], ignore_index=True)
        # The request starts at the last stored timestamp, so that candle comes
        # back again; keep the freshly downloaded copy.
        data_df = data_df.drop_duplicates(subset='timestamp', keep='last')
    else:
        data_df = data

    data_df = data_df.set_index('timestamp')
    if save:
        data_df.to_csv(filename)
    print('All caught up..!')
    return data_df

In [None]:
# valid intervals - 1m, 3m, 5m, 15m, 30m, 1h, 2h, 4h, 6h, 8h, 12h, 1d, 3d, 1w, 1M
# Each symbol's daily candles are written by get_all_binance to
# '<symbol>-1d-data.csv' (the filename it builds has no directory component).
tickers = binance_client.get_all_tickers()
binance_symbols = [ticker['symbol'] for ticker in tickers]
for symbol in binance_symbols:
    get_all_binance(symbol, '1d', save = True)

In [None]:
combined_filename = 'combined_data.csv'

# Every CSV in the working directory except the combined output itself.
# Without this exclusion a second run would read the previous combined file
# back in and duplicate every row. Sorted so the row order is reproducible.
globbed_files = sorted(
    path for path in glob.glob("*.csv")
    if os.path.basename(path) != combined_filename
)
if not globbed_files:
    raise FileNotFoundError("No per-symbol CSV files found; run the download cell first.")

# pd.concat takes a list of dataframes as an argument.
# Each frame's 'symbol' column is overwritten with its file name, which the
# loading cell later splits on "-" to recover the symbol.
data = [
    pd.read_csv(csv_path).assign(symbol=os.path.basename(csv_path))
    for csv_path in globbed_files
]

# ignore_index: rows are simply stacked, no alignment on the per-file row index
combined_data = pd.concat(data, ignore_index=True)
combined_data.to_csv(combined_filename, index=False, encoding='utf-8-sig') # export to csv

In [3]:
# Load the combined candles and keep only the close price of BTC-related pairs.
df = pd.read_csv('combined_data.csv')
# The 'symbol' column holds file names such as '<symbol>-<interval>-data.csv';
# the part before the first "-" is the trading pair.
df['symbol'] = df['symbol'].map(lambda file_name: file_name.split("-")[0])
df = df.set_index('timestamp')
is_btc_pair = df['symbol'].str.contains("BTC")
df = df.loc[is_btc_pair, ['symbol', 'close']]
df

FileNotFoundError: [Errno 2] No such file or directory: 'combined_data.csv'

In [None]:
# Restrict the sample to the window (start_date, end_date]:
# the start bound is exclusive, the end bound inclusive.
start_date, end_date = '2020-06-01', '2021-02-01'
after_start = df.index > start_date
up_to_end = df.index <= end_date
mask = after_start & up_to_end
df = df[mask]
df

In [None]:
# Keep only symbols that have a close price for every timestamp in the window.
# Rows without a close are dropped first so they do not count towards coverage.
df = df[df['close'].notna()]

# A full history means one row per distinct timestamp. The previous
# `len(df.index)` counted rows across all symbols, so with two or more symbols
# every symbol fell below the limit and the whole frame was discarded.
limit = df.index.nunique()
counts = df['symbol'].value_counts()
incomplete_symbols = counts[counts < limit].index
df = df[~df['symbol'].isin(incomplete_symbols)]
df

In [None]:
# Reshape from long to wide: one row per timestamp, one close-price column per symbol.
df = df.pivot_table(values='close', index='timestamp', columns='symbol')
df.columns.name = None  # drop the 'symbol' label from the column axis
# Keep only the timestamps for which ETHBTC has a price.
df = df.dropna(subset=['ETHBTC'])
df.head()

In [None]:
# Pearson correlation of period-over-period percentage changes, to get a
# first idea of how the assets move together ('spearman' is an alternative method).
returns_corr = df.pct_change().corr(method='pearson')
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(returns_corr, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Assets Correlation Matrix')

In [None]:
 # Pearson correlation (the default method) computed on the price columns themselves.
 corr_df = df.corr()
 # Preview the first five rows; reset_index moves the symbol labels out of the
 # index into a regular column and numbers the rows 0-4.
 corr_df.iloc[:5].reset_index()

In [None]:
 # The correlation matrix is symmetric, so only the lower triangle is drawn:
 # the mask is 1 on and above the diagonal (hidden cells) and 0 below it.
 mask = np.triu(np.ones(corr_df.shape))
 # generate plot
 sns.heatmap(corr_df, mask=mask, cmap='RdYlGn', vmin=-1.0, vmax=1.0, linewidths=2.5)
 plt.yticks(rotation=0)
 plt.xticks(rotation=90)
 plt.show()