In [1]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import StandardScaler
import pickle
from datetime import datetime

In [2]:
# Load the raw training data and preview the first rows.
train_csv_path = 'data/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [3]:
# Index the frame by timestamp. The guard keeps this cell safe to re-run:
# once 'timestamp' has been moved into the index it is no longer a column,
# and an unguarded set_index('timestamp') would raise KeyError.
if 'timestamp' in df.columns:
    df = df.set_index('timestamp')
df.head()

Unnamed: 0_level_0,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [4]:
# Load the per-asset metadata and display it ordered by asset id.
asset_details_path = 'data/asset_details.csv'
asset_details = pd.read_csv(asset_details_path)
asset_details.sort_values(by='Asset_ID')

Unnamed: 0,Asset_ID,Weight,Asset_Name
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
0,2,2.397895,Bitcoin Cash
10,3,4.406719,Cardano
13,4,3.555348,Dogecoin
3,5,1.386294,EOS.IO
5,6,5.894403,Ethereum
4,7,2.079442,Ethereum Classic
11,8,1.098612,IOTA
6,9,2.397895,Litecoin


In [5]:
# we reindex onto a regular 60-second grid to fill the missing timestamps (new rows are forward-filled)

In [6]:
# Keep only the rows belonging to asset 0.
is_binance = df['Asset_ID'] == 0
binance = df.loc[is_binance]
print(f"number of rows before reindexing : {binance.shape[0]}")

# Build a grid from the first to the last timestamp in steps of 60 and
# reindex onto it; timestamps absent from the data take the values of the
# previous available row (method='pad').
first_ts = binance.index[0]
last_ts = binance.index[-1]
full_grid = range(first_ts, last_ts + 60, 60)
binance = binance.reindex(full_grid, method='pad')
print(f"number of rows after reindexing : {binance.shape[0]}")

number of rows before reindexing : 1942619
number of rows after reindexing : 1956960


In [20]:
# we compute the log returns

In [7]:
def log_returns(serie, periods=1):
    """Return the difference of the natural log of `serie` over `periods` rows.

    Parameters
    ----------
    serie : pandas Series (or any object np.log returns with a `.diff` method)
        Values to take the logarithm of.
    periods : int, default 1
        Number of rows between the two values being differenced.

    Returns
    -------
    Same type as `np.log(serie)`; rows without a counterpart `periods` rows
    away are NaN.
    """
    logged = np.log(serie)
    return logged.diff(periods=periods)

In [8]:
# Source column behind each family of log-difference features, listed in the
# order the features are registered for every lag.
feature_sources = {
    "log_returns": "Close",
    "log_volumes": "Volume",
    "log_counts": "Count",
}

log_feat = []

for lag in [1, 10, 100]:
    for prefix, source_col in feature_sources.items():
        # Log-difference of the source column over `lag` rows; rows without
        # enough history come out as NaN and are set to 0.
        feat_name = f"{prefix}_t_minus_{lag}"
        log_feat.append(feat_name)
        binance[feat_name] = log_returns(binance[source_col], periods=lag).fillna(0)

In [9]:
# Proxy used to fill missing targets: the asset's own 15-minute log return,
# log(Close[t+16] / Close[t+1]), stored on row t. With periods=1 the shifted
# series only covered the single minute t+15 -> t+16.
# NOTE(review): assumes Target follows the G-Research Crypto Forecasting
# definition (15-minute return from t+1 to t+16) — confirm.
# The last 16 rows have no future prices; their proxy is set to 0.
binance['log_returns'] = log_returns(binance['Close'], periods=15).shift(-16).fillna(0)

In [21]:
# we replace the missing targets with log returns

In [19]:
# Fill missing targets with the log-return proxy. The result is assigned back
# to the column: calling fillna(inplace=True) on a column selection is
# deprecated chained assignment in pandas 2.x and leaves the DataFrame
# unchanged once Copy-on-Write is enabled.
binance['Target'] = binance['Target'].fillna(binance['log_returns'])

In [12]:
# Open minus close for each row.
binance['_Spread'] = binance['Open'].sub(binance['Close'])
# High minus low for each row.
binance['_Max_spread'] = binance['High'].sub(binance['Low'])

In [13]:
# Model inputs are the log-difference features; the label is the Target column.
feats = log_feat
X = binance.loc[:, feats]
y = binance['Target']

In [14]:
# Fit the scaler on X and transform it, then wrap the resulting array back
# into a DataFrame that keeps X's index and column labels.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

In [15]:
# create the features over last 15 intervals
# for feat in X_scaled.columns:
#     for i in range(1,16):
#         X_scaled[f"{feat} t-{i}"] = X_scaled.shift(i)[feat]

In [16]:
# Replace any NaN left in the scaled features with 0.
X_scaled = X_scaled.fillna(value=0)

In [17]:
# Interpret the integer index as seconds since the Unix epoch and convert it
# to datetimes.
datetime_index = pd.to_datetime(X_scaled.index, unit='s')
X_scaled.index = datetime_index

In [18]:
# Preview the first five rows of the scaled features.
X_scaled.head(n=5)

Unnamed: 0_level_0,log_returns_t_minus_1,log_volumes_t_minus_1,log_counts_t_minus_1,log_returns_t_minus_10,log_volumes_t_minus_10,log_counts_t_minus_10,log_returns_t_minus_100,log_volumes_t_minus_100,log_counts_t_minus_100
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-01 00:01:00,-0.001062,-9.741894e-07,-3e-06,-0.003509,-4e-06,-1.9e-05,-0.011799,-4e-05,-0.000177
2018-01-01 00:02:00,-1.00781,-0.0699236,0.457159,-0.003509,-4e-06,-1.9e-05,-0.011799,-4e-05,-0.000177
2018-01-01 00:03:00,-1.935252,2.302382,2.528186,-0.003509,-4e-06,-1.9e-05,-0.011799,-4e-05,-0.000177
2018-01-01 00:04:00,1.048278,-1.878305,-1.586419,-0.003509,-4e-06,-1.9e-05,-0.011799,-4e-05,-0.000177
2018-01-01 00:05:00,-2.932456,-0.004716536,-1.398939,-0.003509,-4e-06,-1.9e-05,-0.011799,-4e-05,-0.000177


In [25]:
# Persist the scaled features and the targets. The context managers close
# (and therefore flush) each file as soon as its dump finishes, instead of
# leaving the handles open until garbage collection.
with open("X_scaled.p", "wb") as features_file:
    pickle.dump(X_scaled, features_file)
with open("y.p", "wb") as target_file:
    pickle.dump(y, target_file)