In [1]:
import torch
import numpy as np
import pandas as pd
from os import listdir
import random
# NOTE(review): seed() with no argument seeds from OS randomness (or the current
# time), so this call does not make runs reproducible; pass a fixed integer if
# reproducibility is the goal.
random.seed()   
import sklearn.mixture as mix
import talib
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns


# Directory holding the HDF5 data files, and the extension used to select them.
mypath = '/media/sweerts/Data/trading/cash_bot/data/'
extension = 'h5'
# os.listdir returns entries in arbitrary order; sort the names so positional
# lookups such as onlyfiles[0] and onlyfiles[10] pick the same file on every run.
onlyfiles = sorted(f for f in listdir(mypath) if f.endswith('.' + extension))

# Prepare training data set

In [2]:
# Build the training frame: read every HDF5 file, add a 140-period RSI computed
# from its 'close' column, and drop the rows that contain NaN.
frames = []
for file in onlyfiles:
    data = pd.read_hdf(mypath + file, mode='r')
    data['RSI'] = talib.RSI(data['close'], 140).astype(float)
    data = data.dropna()
    frames.append(data)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling
# it inside a loop re-copies the accumulated frame on every iteration.
# Concatenate once instead; like append, this keeps each file's original index.
masterdata = pd.concat(frames)
    


# Calculate optimal number of components (hidden states)

In [None]:
# Model-selection sweep: fit one Gaussian mixture per candidate component
# count and score every fit with BIC and AIC on the same data.
X = np.reshape(masterdata['RSI'].values, (-1, 1))   # one feature -> column vector
n_estimators = np.arange(1, 100)                    # candidate counts 1..99
clfs = [
    mix.GaussianMixture(
        n_components=k,
        covariance_type="full",
        n_init=10,
        random_state=0,
    ).fit(X)
    for k in n_estimators
]
bics = [fitted.bic(X) for fitted in clfs]
aics = [fitted.aic(X) for fitted in clfs]

# Draw both criteria on the same axes against the component count.
plt.plot(n_estimators, bics, label='BIC')
plt.plot(n_estimators, aics, label='AIC')
plt.legend()


In [None]:
# Dump the raw BIC scores, one entry per component count tried in the sweep.
print(bics)

# Train the model with the optimal number of hidden states

In [8]:
# Final fit: a 20-component, full-covariance Gaussian mixture on the RSI column.
X = np.reshape(masterdata['RSI'].values, (-1, 1))
model = mix.GaussianMixture(
    n_components=20,
    covariance_type="full",
    n_init=10,
    random_state=0,
)
model.fit(X)
hidden_states = model.predict(X)
# Store each row's predicted component index on the frame. The column is named
# 'HMM_RSI', but the labels come from the Gaussian mixture fitted above.
masterdata['HMM_RSI'] = hidden_states

In [9]:
# Report the fitted mixture's converged_ flag (whether fit() reached convergence).
print(model.converged_)

True


In [10]:
# Persist the fitted mixture model so it can be reloaded without refitting.
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; use the standalone joblib package and fall back to the vendored copy
# only on older installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save the model to a file...
joblib.dump(model, 'GMM.pkl')
# ...and load it back later. NOTE: joblib files are pickle-based, so only load
# files from a trusted source.
model = joblib.load('GMM.pkl')

# Test model

In [27]:
# Reload the persisted model, run it on one file, and confirm that repeated
# predictions on the same input give the same labels.
# sklearn.externals.joblib was removed in scikit-learn 0.23; prefer the
# standalone package and fall back to the vendored copy on older installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(review): onlyfiles[10] is also one of the files the model was fitted on
# (the training cell reads every entry of onlyfiles), so this is a consistency
# check rather than a held-out evaluation.
masterdata = pd.read_hdf(mypath + onlyfiles[10], mode='r')
masterdata['RSI'] = talib.RSI(masterdata['close'], 140).astype(float)
masterdata = masterdata.dropna()
X = masterdata['RSI'].values.reshape(-1, 1)
model = joblib.load('GMM.pkl')  # pickle-based: only load trusted files
hidden_states1 = model.predict(X)
hidden_states2 = model.predict(X)
hidden_states3 = model.predict(X)
# Equal label sums across the three runs are a quick sign that predict() is
# deterministic. Array .sum() avoids the element-by-element builtin sum().
print(hidden_states1.sum(), hidden_states2.sum(), hidden_states3.sum())

86094 86094 86094


In [28]:
# (a == b) - 1 is 0 where the two runs agree and -1 where they differ, so a
# total of 0 means the two prediction arrays are identical.
print(((hidden_states1 == hidden_states2) - 1).sum())

0
