In [4]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import StandardScaler
import pickle
from datetime import datetime

In [5]:
# Read the raw training table from disk and preview its first rows.
train_csv = 'data/train.csv'
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [6]:
# Use the 'timestamp' column as the row index.
# Rebinding instead of `inplace=True`, together with the guard, keeps this cell
# safe to re-run: once 'timestamp' has become the index it is no longer a column,
# so an unconditional set_index would raise KeyError on a second execution.
if df.index.name != 'timestamp':
    df = df.set_index('timestamp')
df.head()

Unnamed: 0_level_0,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [7]:
# Load the per-asset metadata table and display it ordered by asset id.
# sort_values returns a sorted copy for display; `asset_details` itself keeps file order.
asset_details = pd.read_csv('data/asset_details.csv')
asset_details.sort_values(by='Asset_ID')

Unnamed: 0,Asset_ID,Weight,Asset_Name
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
0,2,2.397895,Bitcoin Cash
10,3,4.406719,Cardano
13,4,3.555348,Dogecoin
3,5,1.386294,EOS.IO
5,6,5.894403,Ethereum
4,7,2.079442,Ethereum Classic
11,8,1.098612,IOTA
6,9,2.397895,Litecoin


In [8]:
# Reindex onto a regular 60-second grid so that every missing timestamp gets a row (forward-filled from the previous one)

In [9]:
# Keep only the rows whose Asset_ID is 0.
binance = df.loc[df['Asset_ID'] == 0]
print(f"number of rows before reindexing : {binance.shape[0]}")

# Build a gap-free grid of index values, one every 60 from the first to the last
# observed value (inclusive), and align the frame to it. Rows that did not exist
# are created by copying the preceding row forward (method='pad').
first_ts = binance.index[0]
last_ts = binance.index[-1]
full_grid = range(first_ts, last_ts + 60, 60)
binance = binance.reindex(full_grid, method='pad')
print(f"number of rows after reindexing : {binance.shape[0]}")

number of rows before reindexing : 1942619
number of rows after reindexing : 1956960


In [10]:
def log_returns(serie, periods=1):
    """Return the change in the natural log of `serie` over `periods` rows.

    Each element is ``log(serie[t]) - log(serie[t - periods])``. For a positive
    `periods`, the first `periods` elements have no earlier value to compare
    against and come out as NaN.

    Parameters
    ----------
    serie
        Values to transform; ``np.log(serie)`` must expose ``.diff`` (the
        notebook passes pandas Series).
    periods : int, default 1
        Row offset forwarded to ``diff``.

    Returns
    -------
    The result of ``np.log(serie).diff(periods)``.
    """
    logged = np.log(serie)
    return logged.diff(periods=periods)

In [11]:
log_feat = []

# (feature-name stem, source column) pairs, in the order the columns are created.
feature_sources = [
    ("log_returns", "Close"),
    ("log_volumes", "Volume"),
    ("log_counts", "Count"),
]

for lag in [1, 10, 100]:
    for stem, source_col in feature_sources:
        feat_name = f"{stem}_t_minus_{lag}"
        log_feat.append(feat_name)
        # Log-difference of the source column over `lag` rows; NaNs (including the
        # first `lag` rows, which have no predecessor) are replaced by 0.
        binance[feat_name] = log_returns(binance[source_col], periods=lag).fillna(0)

In [12]:
# One-row log difference of Close, then moved 16 rows earlier: the value stored at
# row t is log(Close[t+16]) - log(Close[t+15]). The last 16 rows have nothing to
# pull from and are set to 0.
# NOTE(review): this column is used below as a fallback for missing Target values;
# if Target is meant to be a multi-period forward return, `periods` probably has to
# match that horizon — confirm against the Target definition.
one_step_returns = log_returns(binance['Close'], periods=1)
binance['log_returns'] = one_step_returns.shift(-16).fillna(0)

In [13]:
# Where Target is missing, fall back to the 'log_returns' value of the same row.
# The result is assigned back to the column: calling fillna(..., inplace=True) on
# `binance['Target']` is chained assignment — it acts on an intermediate Series, is
# deprecated in pandas 2.x and leaves `binance` unchanged under copy-on-write.
binance['Target'] = binance['Target'].fillna(binance['log_returns'])
binance['Target']

timestamp
1514764860   -0.014399
1514764920   -0.015875
1514764980   -0.015410
1514765040   -0.012524
1514765100   -0.005940
                ...   
1632182160    0.000000
1632182220    0.000000
1632182280    0.000000
1632182340    0.000000
1632182400    0.000000
Name: Target, Length: 1956960, dtype: float64

In [14]:
# Preview the first 10 rows, including the newly added log-difference columns.
binance.head(10)

Unnamed: 0_level_0,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,log_returns_t_minus_1,log_volumes_t_minus_1,log_counts_t_minus_1,log_returns_t_minus_10,log_volumes_t_minus_10,log_counts_t_minus_10,log_returns_t_minus_100,log_volumes_t_minus_100,log_counts_t_minus_100,log_returns
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.000965
1514764920,0,7.0,8.53,8.53,8.5145,8.5145,71.39,8.520215,-0.015875,-0.001819,-0.093411,0.336472,0.0,0.0,0.0,0.0,0.0,0.0,-0.004993
1514764980,0,45.0,8.5065,8.5299,8.4848,8.4848,1546.82,8.501394,-0.01541,-0.003494,3.075799,1.860752,0.0,0.0,0.0,0.0,0.0,0.0,0.002368
1514765040,0,14.0,8.5009,8.5066,8.4744,8.5009,125.8,8.47981,-0.012524,0.001896,-2.509263,-1.167605,0.0,0.0,0.0,0.0,0.0,0.0,-0.002368
1514765100,0,5.0,8.5007,8.5007,8.456,8.456,125.01,8.458435,-0.00594,-0.005296,-0.0063,-1.029619,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1514765160,0,89.0,8.456,8.456,8.3999,8.4,3765.25,8.403468,-0.005455,-0.006645,3.405176,2.879198,0.0,0.0,0.0,0.0,0.0,0.0,-1.2e-05
1514765220,0,20.0,8.4,8.4,8.38,8.3958,827.17,8.390564,-0.000644,-0.0005,-1.515559,-1.492904,0.0,0.0,0.0,0.0,0.0,0.0,0.00299
1514765280,0,25.0,8.38,8.4544,8.38,8.3804,370.04,8.390332,-0.00358,-0.001836,-0.804399,0.223144,0.0,0.0,0.0,0.0,0.0,0.0,-0.00299
1514765340,0,16.0,8.3999,8.4519,8.38,8.38,1428.91,8.400632,-0.005421,-4.8e-05,1.351056,-0.446287,0.0,0.0,0.0,0.0,0.0,0.0,-0.000275
1514765400,0,1.0,8.3932,8.3932,8.3932,8.3932,12.0,8.3932,-0.005123,0.001574,-4.779761,-2.772589,0.0,0.0,0.0,0.0,0.0,0.0,0.000287


In [15]:
# Two per-row price differences: open minus close, and high minus low.
# As the notebook stands these columns are not listed in `log_feat`, so they do not
# end up in the feature matrix built from that list.
binance['_Spread'] = binance['Open'].sub(binance['Close'])
binance['_Max_spread'] = binance['High'].sub(binance['Low'])

In [16]:
# The model inputs are exactly the log-difference columns collected in `log_feat`;
# `feats` is a second name for that same list object, not a copy.
feats = log_feat
X = binance.loc[:, feats]
# The label is the Target column.
y = binance['Target']

In [17]:
# Standardize every feature column, then wrap the resulting array back into a
# DataFrame carrying X's index and column labels.
# NOTE(review): the scaler is fitted on all rows of X; if a train/validation split
# happens later, its statistics will include the held-out rows — confirm this is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

In [18]:
# (disabled) create lagged copies of every feature over the last 15 intervals
# for feat in X_scaled.columns:
#     for i in range(1,16):
#         X_scaled[f"{feat} t-{i}"] = X_scaled.shift(i)[feat]

In [19]:
# Replace any NaN left in the scaled features with 0.
X_scaled = X_scaled.fillna(0)

In [23]:
# Interpret the integer index values as seconds and turn them into datetimes.
# NOTE(review): only X_scaled is converted; `y` keeps the integer timestamps, so the
# two objects no longer share index labels — confirm that downstream code aligns
# them by position rather than by index.
X_scaled.index = pd.to_datetime(X_scaled.index, unit='s')

In [24]:
# Preview the scaled feature matrix with its datetime index.
X_scaled.head()

Unnamed: 0_level_0,log_returns_t_minus_1,log_volumes_t_minus_1,log_counts_t_minus_1,log_returns_t_minus_10,log_volumes_t_minus_10,log_counts_t_minus_10,log_returns_t_minus_100,log_volumes_t_minus_100,log_counts_t_minus_100
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-01 00:01:00,-0.001062,-9.741894e-07,-3e-06,-0.003509,-4e-06,-1.9e-05,-0.011799,-4e-05,-0.000177
2018-01-01 00:02:00,-1.00781,-0.0699236,0.457159,-0.003509,-4e-06,-1.9e-05,-0.011799,-4e-05,-0.000177
2018-01-01 00:03:00,-1.935252,2.302382,2.528186,-0.003509,-4e-06,-1.9e-05,-0.011799,-4e-05,-0.000177
2018-01-01 00:04:00,1.048278,-1.878305,-1.586419,-0.003509,-4e-06,-1.9e-05,-0.011799,-4e-05,-0.000177
2018-01-01 00:05:00,-2.932456,-0.004716536,-1.398939,-0.003509,-4e-06,-1.9e-05,-0.011799,-4e-05,-0.000177


In [25]:
# Serialize the scaled features and the target to pickle files in the working directory.
# Context managers make sure each file is flushed and closed as soon as it has been
# written; the bare open(...) calls never closed their handles explicitly.
with open("X_scaled.p", "wb") as features_file:
    pickle.dump(X_scaled, features_file)
with open("y.p", "wb") as target_file:
    pickle.dump(y, target_file)