# Preprocessing

We need to preprocess the obtained data so that it can be used for our purposes.

In [1]:
import numpy as np
import pandas as pd
from datetime import date
from binance.client import Client

def get_data(asset_list=None):
    """ Obtains one year of daily historic data for a list of assets using Binance's API

    Parameters
    ----------
    asset_list : list, optional
        List of asset symbols (e.g. ``"BTCUSDT"``) to retrieve information
        from. ``None`` or an empty list yields an empty dataframe.

    Returns
    -------
    Dataframe
        A dataframe containing the kline (OHLCV) data from the provided
        assets, stacked in ``asset_list`` order. The ``"Asset"`` column
        identifies the symbol of each row, and every asset's rows keep their
        own 0-based index (the index is therefore not unique across assets).

    Raises
    ------
    KeyError
        If any requested symbol is not among the tickers returned by Binance.
    """
    kline_columns = ["Open time","Open","High","Low","Close","Volume", "Closing time","Quote asset vol", "Num traders", "Taker buy base asset vol", "Taker buy quote asset vol","To be ignored"]
    all_columns = ["Asset"] + kline_columns

    # Never share a mutable default between calls
    asset_list = [] if asset_list is None else list(asset_list)

    client = Client()
    listed_symbols = {tick["symbol"] for tick in client.get_all_tickers()}

    # Fail fast, before downloading anything, with a message naming the culprits
    missing = [asset for asset in asset_list if asset not in listed_symbols]
    if missing:
        raise KeyError(f"Symbols not listed by Binance: {missing}")

    # Time frame: from one year ago until today, daily candles
    today = date.today()
    try:
        year_ago = today.replace(year=today.year - 1)
    except ValueError:
        # Today is Feb 29 and the previous year has no such day
        year_ago = today.replace(year=today.year - 1, day=28)
    start = year_ago.strftime("%Y.%m.%d")
    end = today.strftime("%Y.%m.%d")
    timeframe = "1d"

    # Download each distinct asset once
    klines = {}
    for asset in asset_list:
        if asset not in klines:
            klines[asset] = client.get_historical_klines(asset, timeframe, start, end)

    # Build one frame per asset and concatenate them in a single step
    # (DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every iteration)
    frames = []
    for asset in asset_list:
        frame = pd.DataFrame(klines[asset], columns=kline_columns)
        frame.insert(0, "Asset", asset)
        frames.append(frame)

    if not frames:
        # pd.concat rejects an empty list; keep returning the empty schema
        return pd.DataFrame(columns=all_columns)

    return pd.concat(frames)

In [2]:
# Symbols to download, in the order they will appear in the dataframe
asset_list = [
    'BNBUSDT',
    'BTCUSDT',
    'ETHUSDT',
    'SOLUSDT',
    'ADAUSDT',
    'XRPUSDT',
    'DOTUSDT',
    'DOGEUSDT',
]
df = get_data(asset_list=asset_list)

In [3]:
# Inspect the combined frame (the rendered output elides the middle rows)
df

Unnamed: 0,Asset,Open time,Open,High,Low,Close,Volume,Closing time,Quote asset vol,Num traders,Taker buy base asset vol,Taker buy quote asset vol,To be ignored
0,BNBUSDT,1609977600000,42.24400000,44.90000000,41.36600000,43.57280000,5026746.66900000,1610063999999,216717611.76971210,523288,2535285.66800000,109380395.08584490,0
1,BNBUSDT,1610064000000,43.57280000,43.72200000,40.23130000,42.35600000,3548923.78800000,1610150399999,149951576.85134530,411558,1804018.63200000,76246982.43869240,0
2,BNBUSDT,1610150400000,42.34500000,44.05520000,41.50000000,43.84790000,2720363.63600000,1610236799999,116290473.53861750,294683,1458819.25300000,62429021.61184670,0
3,BNBUSDT,1610236800000,43.84790000,45.16200000,40.00000000,42.40310000,4277406.29000000,1610323199999,185165251.13206460,431771,2147084.16200000,93038871.70609660,0
4,BNBUSDT,1610323200000,42.40330000,42.50940000,35.03740000,38.16740000,6332801.05500000,1610409599999,243017347.96947130,664128,3174405.76900000,121735208.64152900,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,DOGEUSDT,1641168000000,0.17450000,0.17450000,0.16740000,0.17030000,460985351.00000000,1641254399999,78769279.94100000,139249,236643465.00000000,40454695.51230000,0
362,DOGEUSDT,1641254400000,0.17030000,0.17260000,0.16650000,0.16840000,496439208.00000000,1641340799999,84445791.17980000,127545,258358537.00000000,43963449.95720000,0
363,DOGEUSDT,1641340800000,0.16850000,0.17100000,0.14720000,0.15900000,1084632258.00000000,1641427199999,175586840.15730000,307998,519468771.00000000,84347970.51660000,0
364,DOGEUSDT,1641427200000,0.15910000,0.16210000,0.15390000,0.15990000,705767171.00000000,1641513599999,111252225.62060000,163053,331392118.00000000,52235681.79080000,0


We would like to estimate the expected return on investment over one interval (1 day ahead). To do so, we compute the mean ($\mu$) of the daily returns of each asset series, taking the price at closing time as the reference. We would also like to understand how the different assets move together, so we compute the covariance between their daily returns as well ($\sigma$).

In [4]:
import numpy as np

# One column of closing prices per asset, indexed by the candle's open time,
# so that returns of different assets are aligned by date rather than by
# row position.
closes = (
    df.pivot(index="Open time", columns="Asset", values="Close")
    .astype("float")
    .sort_index()
)[asset_list]

# Simple day-over-day returns; the sign indicates the direction of the move.
# Dates for which any asset lacks a return (the first day, or gaps in a
# series) are dropped so that every series has the same length.
aligned_returns = (closes / closes.shift(1) - 1).dropna()

mu = {}
return_list = []
for asset in asset_list:
    asset_returns = aligned_returns[asset].to_numpy()
    mu[asset] = asset_returns.mean()
    return_list.append(asset_returns)

# Each row of the stacked array is one asset, which is what np.cov expects
sigma = np.cov(np.vstack(return_list))

In [5]:
# Mean daily return of each asset, keyed by symbol
mu

{'BNBUSDT': 0.009208198464586821,
 'BTCUSDT': 0.0010059578934137013,
 'ETHUSDT': 0.004116403595199305,
 'SOLUSDT': 0.014576471802647015,
 'ADAUSDT': 0.006005492663508319,
 'XRPUSDT': 0.005268521112111553,
 'DOTUSDT': 0.005562786527524439,
 'DOGEUSDT': 0.019137689341795463}

In [6]:
# Covariance matrix of the daily returns; rows and columns follow asset_list order
sigma

array([[0.00610019, 0.00199826, 0.0026167 , 0.00321881, 0.00271861,
        0.00300094, 0.00331115, 0.00220872],
       [0.00199826, 0.00174714, 0.00180545, 0.00148052, 0.00163246,
        0.00184257, 0.00220861, 0.00268543],
       [0.0026167 , 0.00180545, 0.00301738, 0.00235438, 0.00223208,
        0.00244429, 0.00311793, 0.00299922],
       [0.00321881, 0.00148052, 0.00235438, 0.00692011, 0.00232865,
        0.0023155 , 0.0027975 , 0.00224842],
       [0.00271861, 0.00163246, 0.00223208, 0.00232865, 0.00445704,
        0.00266393, 0.00322211, 0.0033776 ],
       [0.00300094, 0.00184257, 0.00244429, 0.0023155 , 0.00266393,
        0.00629244, 0.00320017, 0.00266651],
       [0.00331115, 0.00220861, 0.00311793, 0.0027975 , 0.00322211,
        0.00320017, 0.00595016, 0.00349446],
       [0.00220872, 0.00268543, 0.00299922, 0.00224842, 0.0033776 ,
        0.00266651, 0.00349446, 0.0538281 ]])

These are the expected daily returns from investing in each cryptocurrency on its own ($\mu$), together with the covariance of their returns ($\sigma$), which captures how they interact when investing in more than one of them.