In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import pickle
import numpy as np

# Choose a coin

In [3]:
coin = 'BLK'

#### reading the data 

In [4]:
### all you have to do is the choose whatever csv you want to train the model on in this case it's BLK.csv
data = pd.read_csv('hist_data/{}.csv'.format(coin), parse_dates=True)

#### converting the column names to lower case

In [5]:
data.columns = data.columns.str.lower()

#### converting timestamp and created to datetime

In [6]:
data['timestamp'] = pd.to_datetime(data.timestamp)
data['created'] = pd.to_datetime(data.created)

In [7]:
### setting time stamp as the index  
data.set_index('timestamp', inplace=True)

In [8]:
### sampling one point every 15 min (so it's 15 min time frame)
data = data.resample('15T').pad().copy()

In [9]:
### creating new feature high - low
data['hl_diff'] = data.high - data.low

In [10]:
### shifting the ask one cell down to calculate the response var
data['next'] = data.ask.shift(1)

In [11]:
data['change'] = (data.next - data.ask)

In [12]:
def up_down(row):
    """returns if the next movement or up or down"""
    if row > 0:
        return 'up'
    elif row < 0:
        return 'down'
    else:
        return np.nan    

In [13]:
## creating the binary responce variable (up, down)
data['up_down'] = data.change.apply(up_down)

### SMA

In [14]:
### creating the SMAs
data['sma_5'] = data['ask'].rolling(5).mean()
data['sma_10'] = data['ask'].rolling(10).mean()
data['sma_20'] = data['ask'].rolling(20).mean()

In [15]:
### droping missing data 
data = data.dropna()

In [16]:
### creating Percentage change for the moving averages 
data['pc_ch_5'] = data.sma_5.pct_change()
data['pc_ch_10'] = data.sma_10.pct_change()
data['pc_ch_20'] = data.sma_20.pct_change()

In [17]:
## the distance between the price and the moving average  
data['sma5_ask_diff'] = (data.sma_5  - data.ask)
data['sma10_ask_diff'] = (data.sma_10  - data.ask)
data['sma20_ask_diff'] = (data.sma_20  - data.ask)

In [18]:
data['spread'] = data.high - data.low

#### cleaning

In [19]:
to_drop = ['prevday','opensellorders','last','created','bid','openbuyorders','marketname','low','high','basevolume']
data.drop(to_drop,axis=1,inplace=True)
data.dropna(inplace=True)
data['up_down'] = data.up_down.map({'up':1,'down':0})

In [20]:
data.dropna(inplace=True)
y = data.up_down.values

In [21]:
data.drop('up_down',axis=1,inplace=True)
X = data.values

#### ML

In [22]:
from sklearn.ensemble import RandomForestClassifier
model3 = RandomForestClassifier(n_estimators=1000)
model3.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

### SCORING 

In [23]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
model3.fit(X_train,y_train)
pred = model3.predict(X_test)
accuracy_score(pred,y_test)

0.84771573604060912

### storing the model 

In [24]:
with open('models/{}.pickle'.format(coin),'wb') as mod:
    pickle.dump(model3, mod)