In [1]:
import numpy as np
from hmmlearn import hmm
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.metrics.pairwise import pairwise_distances_argmin
import warnings

In [2]:
def expand(a, b):
    """Widen the interval (a, b) by 5% of its length on each side.

    Returns the pair ``(a - margin, b + margin)`` where
    ``margin = (b - a) * 0.05``.
    """
    margin = (b - a) * 0.05
    lower = a - margin
    upper = b + margin
    return lower, upper

In [3]:
warnings.filterwarnings("ignore")   # hmmlearn(0.2.0) < sklearn(0.18)

In [4]:
# Load the data.
# File columns: 0 date, 1 open, 2 high, 3 low, 4 close, 5 volume, 6 turnover.
# Columns 4, 5, 6, 2, 3 are read (0-based) in that order, the file is
# tab-delimited and its first two rows are skipped.
x = np.loadtxt('SH600000.txt', delimiter='\t', skiprows=2, usecols=(4, 5, 6, 2, 3))
x = x[:-1, :]  # the last day is left out

close_price = x[:, 0]               # closing price (full length)
diff_price = np.diff(close_price)   # day-over-day change of the closing price

# np.diff yields one value fewer than its input, so the remaining features
# start at the second day to stay aligned with diff_price.
volumn = x[1:, 1]                       # traded volume of the day
amount = x[1:, 2]                       # traded amount of the day
amplitude_price = x[1:, 3] - x[1:, 4]   # daily high minus daily low

# The whole data set is one sequence; every observation has four features.
sample = np.column_stack((volumn, amount, amplitude_price, diff_price))
print("样本数目:%d, 每个样本的特征数目:%d" % sample.shape)
sample

样本数目:1509, 每个样本的特征数目:4


array([[  1.15147943e+08,   2.43689088e+09,   4.70000000e-01,
          9.00000000e-02],
       [  9.67825750e+07,   2.03417408e+09,   2.40000000e-01,
         -2.40000000e-01],
       [  8.52360720e+07,   1.76180096e+09,   4.40000000e-01,
         -2.80000000e-01],
       ..., 
       [  1.75381840e+07,   3.13324800e+08,   2.90000000e-01,
          2.10000000e-01],
       [  3.56315260e+07,   6.50177344e+08,   4.80000000e-01,
          3.40000000e-01],
       [  1.83124240e+07,   3.33790688e+08,   2.20000000e-01,
         -1.00000000e-01]])

In [5]:
# Build the model.
# n is the number of hidden states of the HMM.
n = 5
model = hmm.GaussianHMM(n_components=n, random_state=28).fit(sample)

# Per-observation probability of each hidden state.
y = model.predict_proba(sample)
print(y)

# Predicted hidden state of every observation.
print(model.predict(sample))

[[  9.95659799e-001   1.16500509e-170   5.13179027e-062   1.02641753e-148
    4.34020115e-003]
 [  9.48398662e-001   5.08361628e-018   8.70983903e-009   1.29224878e-021
    5.16013293e-002]
 [  9.85748919e-001   4.49183069e-017   6.69011172e-010   3.53339779e-020
    1.42510807e-002]
 ..., 
 [  1.97151317e-001   8.02842998e-001   3.60438311e-018   1.30219717e-033
    5.68494398e-006]
 [  9.99980721e-001   1.70552563e-005   2.76321371e-015   1.42522588e-027
    2.22392552e-006]
 [  1.54824528e-002   9.84514753e-001   8.03209211e-012   1.44326758e-019
    2.79400112e-006]]
[0 0 0 ..., 1 0 1]


In [6]:
# Predict on a small part of the data: the first two observations,
# kept two-dimensional (rows x features).
test_sample = sample[:2, :]
print("部分数据的样本值:\n{}".format(test_sample))

# decode() output printed above in the notebook is a (score, states) pair;
# predict() prints only the state sequence.
print(model.decode(test_sample))
print(model.predict(test_sample))

部分数据的样本值:
[[  1.15147943e+08   2.43689088e+09   4.70000000e-01   9.00000000e-02]
 [  9.67825750e+07   2.03417408e+09   2.40000000e-01  -2.40000000e-01]]
(-87.85917503446157, array([0, 0]))
[0 0]


In [7]:
# Plot the inputs and the per-state probabilities.
# Grid: 3 columns. Panels 1-3 hold the price change, the traded volume and an
# overlay of all components; they are followed by one panel per component,
# i.e. n + 3 panels in total.
row_num = int(np.ceil((n+3)/3.0))
t = np.arange(len(diff_price))
# The titles below are Chinese, so select a font that can render them
# (SimHei must be installed) and keep the minus sign as a plain ASCII hyphen.
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
# figsize is (width, height): the width is fixed by the 3 columns, the height
# must grow with the number of rows. (The previous (row_num * 3, 9) only
# matched this by coincidence when row_num == 3.)
plt.figure(figsize=(9, row_num * 3), facecolor='w')
plt.subplot(row_num, 3, 1)
plt.plot(t, diff_price, 'r-')
plt.grid(True)
plt.title(u'涨跌幅')
plt.subplot(row_num, 3, 2)
plt.plot(t, volumn, 'g-')
plt.grid(True)
plt.title(u'交易量')

# One colour per component, sampled from the terrain colormap.
clrs = plt.cm.terrain(np.linspace(0, 0.8, n))
plt.subplot(row_num, 3, 3)
for i, clr in enumerate(clrs):
    # Overlay every component in a single panel.
    plt.plot(t, y[:, i], '-', color=clr, alpha=0.7)
plt.title(u'所有组分')
plt.grid(True)

# Then draw each component in its own panel (panels 4 .. n + 3).
for i, clr in enumerate(clrs):
    plt.subplot(row_num, 3, i+4)
    plt.plot(t, y[:, i], '-', color=clr)
    plt.title(u'组分%d' % (i+1))
    plt.grid(True)
plt.suptitle(u'SH600000股票：GaussianHMM分解隐变量', fontsize=18)
plt.tight_layout()
# Leave room above the panels for the figure-level title.
plt.subplots_adjust(top=0.9)
plt.savefig('hmm.png')

In [8]:
# Persist the model.
# Option 1: save the fitted model object itself.
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone joblib package and only fall back to the
# vendored copy on old scikit-learn installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(model, './hmm_{}.m'.format(n))

['./hmm_5.m']

In [9]:
# Persist the model.
# Option 2: save only the predicted hidden states.
import pickle
states = model.predict(sample)
# Use a context manager so the file is flushed and closed deterministically;
# passing a bare open(...) to pickle.dump leaves the handle open.
with open('./hmm_states_{}.pkl'.format(n), 'wb') as states_file:
    pickle.dump(states, states_file)