In [39]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import StandardScaler
import pickle

In [16]:
# Load the raw training table and preview its first rows.
train_csv_path = 'data/train.csv'
df = pd.read_csv(train_csv_path)
df.head(n=5)

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [17]:
# Index the frame by timestamp so later cells can reindex on it.
# The guard keeps this cell idempotent: with `inplace=True` a second run raised
# KeyError because 'timestamp' had already been moved out of the columns.
if df.index.name != 'timestamp':
    df = df.set_index('timestamp')
df.head()

Unnamed: 0_level_0,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [18]:
# Load the per-asset metadata and show it ordered by asset id.
asset_details_path = 'data/asset_details.csv'
asset_details = pd.read_csv(asset_details_path)
asset_details.sort_values(by='Asset_ID')

Unnamed: 0,Asset_ID,Weight,Asset_Name
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
0,2,2.397895,Bitcoin Cash
10,3,4.406719,Cardano
13,4,3.555348,Dogecoin
3,5,1.386294,EOS.IO
5,6,5.894403,Ethereum
4,7,2.079442,Ethereum Classic
11,8,1.098612,IOTA
6,9,2.397895,Litecoin


In [19]:
# Reindex onto a regular timestamp grid (step of 60) so that every missing timestamp gets a forward-filled row.

In [20]:
# Keep only the rows for Asset_ID 0 (listed as Binance Coin in asset_details).
binance = df.loc[df['Asset_ID'] == 0]
print(f"number of rows before reindexing : {binance.shape[0]}")

# Build a grid from the first to the last timestamp (inclusive) with a step of 60,
# then forward-fill ('pad') a row for every grid point that is absent from the data.
first_ts, last_ts = binance.index[0], binance.index[-1]
timestamp_grid = range(first_ts, last_ts + 60, 60)
binance = binance.reindex(timestamp_grid, method='pad')
print(f"number of rows after reindexing : {binance.shape[0]}")

number of rows before reindexing : 1942619
number of rows after reindexing : 1956960


In [21]:
def log_returns(serie, periods=15):
    """Return the difference of the natural log of `serie` over `periods` rows.

    Parameters
    ----------
    serie : pandas.Series
        Values to take the logarithm of (e.g. a price column).
    periods : int, default 15
        Row lag passed to ``Series.diff``; the default keeps the original
        hard-coded horizon of 15 rows.

    Returns
    -------
    pandas.Series
        ``log(serie[i]) - log(serie[i - periods])`` for each row ``i``; the
        first ``periods`` rows are NaN because they have no earlier value.
    """
    return np.log(serie).diff(periods=periods)

In [22]:
# Forward-looking log return of Close: the 15-row diff is shifted back by 16 rows,
# so row t holds log(Close[t+16]) - log(Close[t+1]). Remaining NaNs (such as the
# trailing rows that have no future data) are replaced with 0.
shifted_returns = log_returns(binance['Close']).shift(-16)
binance['log_returns'] = shifted_returns.fillna(0)
binance.loc[:, ['Target', 'log_returns']]

Unnamed: 0_level_0,Target,log_returns
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
1514764860,-0.014399,-0.014504
1514764920,-0.015875,-0.016003
1514764980,-0.015410,-0.015530
1514765040,-0.012524,-0.012603
1514765100,-0.005940,-0.005958
...,...,...
1632182160,,0.000000
1632182220,,0.000000
1632182280,,0.000000
1632182340,,0.000000


In [23]:
# Fill the missing Target values with the locally computed log returns.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `binance['Target']`: that chained in-place call acts on
# a temporary selection, which triggers a FutureWarning in recent pandas and leaves
# `binance` unchanged once Copy-on-Write is enabled.
binance['Target'] = binance['Target'].fillna(binance['log_returns'])
binance['Target']

timestamp
1514764860   -0.014399
1514764920   -0.015875
1514764980   -0.015410
1514765040   -0.012524
1514765100   -0.005940
                ...   
1632182160    0.000000
1632182220    0.000000
1632182280    0.000000
1632182340    0.000000
1632182400    0.000000
Name: Target, Length: 1956960, dtype: float64

In [24]:
# Preview the first five rows, now including the filled Target and log_returns.
binance.head(n=5)

Unnamed: 0_level_0,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,log_returns
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399,-0.014504
1514764920,0,7.0,8.53,8.53,8.5145,8.5145,71.39,8.520215,-0.015875,-0.016003
1514764980,0,45.0,8.5065,8.5299,8.4848,8.4848,1546.82,8.501394,-0.01541,-0.01553
1514765040,0,14.0,8.5009,8.5066,8.4744,8.5009,125.8,8.47981,-0.012524,-0.012603
1514765100,0,5.0,8.5007,8.5007,8.456,8.456,125.01,8.458435,-0.00594,-0.005958


In [28]:
# Per-row price features: open-minus-close and high-minus-low.
binance['_Spread'] = binance['Open'].sub(binance['Close'])
binance['_Max_spread'] = binance['High'].sub(binance['Low'])

In [41]:
# Feature matrix (four columns) and the target vector.
feature_columns = ['Count', 'Volume', '_Spread', '_Max_spread']
X = binance[feature_columns]
y = binance['Target']

In [42]:
# Standardize the features; the scaler is fitted on all of X.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
# Wrap the returned array in a DataFrame so the timestamp index and column names survive.
X_scaled = pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

In [46]:
# Preview the first five rows of the standardized features.
X_scaled.head(n=5)

Unnamed: 0_level_0,Count,Volume,_Spread,_Max_spread
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1514764860,-0.413922,-0.6128,-0.000519,-0.333829
1514764920,-0.409329,-0.615214,0.050954,-0.308071
1514764980,-0.322069,-0.105676,0.071543,-0.258881
1514765040,-0.393255,-0.596424,-0.000519,-0.280319
1514765100,-0.413922,-0.596697,0.147923,-0.259546


In [47]:
# Persist the scaled feature frame to disk.
# The `with` block closes (and therefore flushes) the file handle even if
# pickling fails; the bare `open(...)` passed to `pickle.dump` was never closed.
with open("X_scaled.pickle", "wb") as pickle_file:
    pickle.dump(X_scaled, pickle_file)