In [1]:
from learn import em_learn, svd_learn_new
from data import *
import numpy as np


UndefVarError: UndefVarError: `from` not defined

In [33]:
def svd_learn(sample, n, L=None, verbose=None, stats={}):
    Os = np.moveaxis(sample.all_trail_probs(), 1, 0)

    svds = [np.linalg.svd(Os[j], full_matrices=True) for j in range(n)]

    if verbose:
        for i, (_, s, _) in enumerate(svds):
            print(f"{i}: {s[:L+1]} ...")

    Ps_ = np.zeros((n, L, n))
    Qs_ = np.zeros((n, L, n))
    for j, (u, s, vh) in enumerate(svds):
        Ps_[j, 0 : min(n, L), :] = u[:, 0:L].T
        Qs_[j, 0 : min(n, L), :] = (np.diag(s) @ (vh))[0:L, :]

    A = np.zeros((2 * n * L, n**2))
    for j in range(n):
        A[L * j : L * (j + 1), n * j : n * (j + 1)] = Ps_[j]
        A[L * (n + j) : L * (n + j + 1), j + n * (np.arange(n))] = -Qs_[j]

    _, s, vh = np.linalg.svd(A.T, full_matrices=True)
    small = list(s < 1e-5)
    if True in small:
        fst = small.index(True)
        if verbose:
            print(2 * L * n - fst, L, s[[fst - 1, fst]])
    B = vh[-L:]
    Bre = np.moveaxis(B.reshape((L, L, 2 * n), order="F"), -1, 0)
    Ys_ = Bre[0:n]
    Zs_ = Bre[n : 2 * n]

    Xs = [
        np.linalg.pinv(Zs_[j] @ Ys_[j].T) @ (Zs_[j + 1] @ Ys_[j + 1].T)
        for j in range(n - 1)
    ]
    X = np.sum(Xs, axis=0)
    _, R_ = np.linalg.eig(X)
    d, _, _, _ = np.linalg.lstsq(
        (R_.T @ Ys_[0] @ Ps_[0]).T, Os[0] @ np.ones(n), rcond=None
    )

    R = np.diag(d) @ R_.T
    Ys = R @ Ys_
    Ps = np.array([Y @ P_ for Y, P_ in zip(Ys, Ps_)])
    Ss = np.array([R @ Z_ @ Y_.T @ R.T for Z_, Y_ in zip(Zs_, Ys_)])

    S_ = np.zeros((L, n))
    Ms_ = np.zeros((L, n, n))
    for l in range(L):
        for i in range(n):
            S_[l, i] = Ss[i, l, l]
            for j in range(n):
                Ms_[l, i, j] = Ps[j, l, i] / S_[l, i]

    S_ = np.abs(S_)
    Ms_ = np.abs(Ms_)
    learned_mixture = Mixture(S_, Ms_)
    learned_mixture.normalize()
    return learned_mixture

learners = {
    "CA-SVD": svd_learn_new,
    "CA-SVD'": lambda d, n, L: svd_learn_new(d, n, L, sample_dist=0.01),
    "GKV-SVD": svd_learn,
    "EM2": lambda d, n, L: em_learn(d, n, L, max_iter=2),
    "EM5": lambda d, n, L: em_learn(d, n, L, max_iter=5),
    "EM20": lambda d, n, L: em_learn(d, n, L, max_iter=20),
    "EM50": lambda d, n, L: em_learn(d, n, L, max_iter=50),
    "EM100": lambda d, n, L: em_learn(d, n, L, max_iter=100),
    "EM-converge": em_learn,
    "CA-SVD-EM2": lambda d, n, L: svd_learn_new(d, n, L, em_refine_max_iter=2),
    "CA-SVD-EM5": lambda d, n, L: svd_learn_new(d, n, L, em_refine_max_iter=5),
    "CA-SVD-EM20": lambda d, n, L: svd_learn_new(d, n, L, em_refine_max_iter=20),
    "CA-SVD-EM100": lambda d, n, L: svd_learn_new(d, n, L, em_refine_max_iter=100),
}

def count_3_from_seq(seq, n):
    """
    seq: discretized sequence
    n: number of categories
    """
    all_trail_probs = np.zeros((n, n, n)) 
    for i in range(len(seq) // 3):
        x = seq[3*i:3*(i+1)]
        all_trail_probs[tuple(x)] += 1
       #num_visited[x] += 1
    return Distribution.from_all_trail_probs(all_trail_probs / np.sum(all_trail_probs))
    
def learn_mix_from_seq(seq,learner, n, L):
    """
    seq: discretized time series: an 1-d array
    learner: 
    """
    trail_empirical_distribution = count_3_from_seq(seq, n)
    if np.isnan(trail_empirical_distribution.all_trail_probs()).any() or np.isinf(trail_empirical_distribution.all_trail_probs()).any():
        print("Inf or NAN values")
        print(trail_empirical_distribution.all_trail_probs())
        
    return  learners[learner](trail_empirical_distribution, n, L)

def likelihood(mixture, trails, counts=None, log=False):
    if counts is None: counts = transitions(mixture.n, trails)
    logS = np.log(mixture.S + 1e-10)
    logTs = np.log(mixture.Ms + 1e-10)

    logl = logS[:, trails[:,0]]
    logl += np.sum(logTs[:, :, :, None] * np.moveaxis(counts, 0, 2)[None, :, :, :], axis=(1,2))
    if log: return logl
    probs = np.exp(logl - np.max(logl, axis=0))
    probs /= np.sum(probs, axis=0)[None, :]
    return probs

def transitions(n, trails):
    n_samples = trails.shape[0]
    c = np.zeros([n_samples, n, n], dtype=int)
    for t, trail in enumerate(trails):
        i = trail[0]
        for j in trail[1:]:
            c[t, i, j] += 1
            i = j
    return c

In [1]:
n_samples = 1
d = int(1e5)
for k_samples in (n_samples // d) * [d] + [n_samples % d]:
    print(k_samples)

UndefVarError: UndefVarError: `int` not defined

In [119]:
n_states = 60 
L_chains = 5
current_state = 3
mix = Mixture.random(n_states, L_chains)
distribution = Distribution.from_mixture(mix,3)
sample_distribution = distribution.sample()

In [124]:
num_categories = n_states
window = len(xs)//10
L = 5
correct_count = 0
error = []
neg_ll = []
predict = []
for i in range(10):
    subseq = xs[i:i+window]
    learned_mix = learn_mix_from_seq(subseq,'GKV-SVD', num_categories, L)
    chain_prob = likelihood(learned_mix, np.atleast_2d(subseq[-2:]))
    #most_likely_index = np.argmax(chain_prob)
    #multi_dim_index = np.unravel_index(most_likely_index, learned_mix.S.shape)
    # Based on likelihood probability to find the most likely chain.
    most_likely_chain = np.argmax(chain_prob)
    prob_next_step = learned_mix.Ms[most_likely_chain, subseq[window - 1], :]
    neg_log_likelihood = -np.log(prob_next_step[xs[i + window]]) + np.log(np.max(prob_next_step))
    sorted_indices = np.argsort(prob_next_step)
    predict.append(np.argmax(prob_next_step))
    rank = np.where(sorted_indices == xs[i + window])[0][0]

    neg_ll.append(neg_log_likelihood)
    error.append(59 - rank)


  Ms_[l, i, j] = Ps[j, l, i] / S_[l, i]
  Ms_[l, i, j] = Ps[j, l, i] / S_[l, i]
  neg_log_likelihood = -np.log(prob_next_step[xs[i + window]]) + np.log(np.max(prob_next_step))
  self.Ms = self.Ms / np.sum(self.Ms, axis=2)[:, :, np.newaxis]


In [129]:
predict = np.array(predict)
print(abs(predict - xs[window :window + 10]).mean())
print((predict == xs[window:window + 10]).astype(int))

15.3
[0 0 0 0 0 0 0 0 0 0]


In [84]:
error = np.array(error)
error.mean()

30.37

In [None]:
print(predict == xs)

In [40]:
import statsmodels.api as sm
import pandas as pd


In [65]:
#endog = pd.read_csv('energydata_complete.csv')['RH_5']
endog = pd.DataFrame(xs, columns=['self_generate'])
# We could also fit a more complicated model with seasonal components.
# As an example, here is an SARIMA(1,1,1) x (0,1,1,4):


 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.55987D+00    |proj g|=  8.46152D-02

At iterate    5    f=  4.45998D+00    |proj g|=  2.12469D-02

At iterate   10    f=  4.27001D+00    |proj g|=  5.26035D-03
  ys=-4.587E-07  -gs= 2.161E-07 BFGS update SKIPPED

At iterate   15    f=  4.26999D+00    |proj g|=  2.68122D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     15     35      1     1     0   2.681D-04   4.270D+00
  F =   4.2699921815012853     

CONVERGENCE: REL_

In [120]:
# num_categories = 60
# n = num_categories

# all_trail_probs = np.zeros((n, n, n))
# num_visited = np.zeros(num_categories)

# df = pd.read_csv('energydata_complete.csv')
# # consider one column first
# #df = df['RH_5']
# #xs = pd.cut(df, bins=num_categories, labels=False)
# res = pd.qcut(df['RH_5'],n, labels=False, retbins=True, precision=3, duplicates='raise')
# # do equal-depth p
# xs = np.array(list(res[0]))
# bins = res[1]


predict_sarima = []
for i in range(10):
    subseq = xs[i:i+window]
    df = pd.DataFrame(subseq)

    mod_sarimax = sm.tsa.SARIMAX(subseq, order=(1,1,1),
                                seasonal_order=(0,1,1,4))
    res_sarimax = mod_sarimax.fit()

    #res = mod_sarimax.filter(res_sarimax.params)

    # Show the summary of results
    pred = res_sarimax.get_prediction(window,window).predicted_mean
    
    predict_sarima.append(pred)
   


 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.56286D+00    |proj g|=  8.58913D-02

At iterate    5    f=  4.43606D+00    |proj g|=  2.08559D-02

At iterate   10    f=  4.28351D+00    |proj g|=  2.96234D-03

At iterate   15    f=  4.28297D+00    |proj g|=  1.52917D-02

At iterate   20    f=  4.28221D+00    |proj g|=  7.10157D-03

At iterate   25    f=  4.28218D+00    |proj g|=  6.86052D-05
  ys=-3.538E-04  -gs= 4.701E-05 BFGS update SKIPPED



 Bad direction in the line search;
   refresh the lbfgs memory and restart the iteration.
 This problem is unconstrained.



At iterate   30    f=  4.28214D+00    |proj g|=  4.39419D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     32     71      2     1     0   1.776D-05   4.282D+00
  F =   4.2821441249372052     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.56332D+00    |proj g|=  8.60140D-02

At iterate    5    f=  4.43855D+00    |proj g|=  2.08828D-02

At iterate   10    f=  4.28397D+00    |proj g|=  2.36307D-03

At iter


   evaluations in the last line search.  Termination
   may possibly be caused by a bad search direction.
 This problem is unconstrained.



At iterate   10    f=  4.28401D+00    |proj g|=  3.12338D-04

At iterate   15    f=  4.28328D+00    |proj g|=  4.95509D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     19     30      1     0     0   8.418D-04   4.283D+00
  F =   4.2828503318437905     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.56276D+00    |proj g|=  8.56395D-02

At iterate    5    f=  4.43904D+00    |proj g|=  2.08914D-02


 This problem is unconstrained.



At iterate   10    f=  4.28388D+00    |proj g|=  3.34709D-04

At iterate   15    f=  4.28303D+00    |proj g|=  4.59223D-03

At iterate   20    f=  4.28264D+00    |proj g|=  1.91511D-04

At iterate   25    f=  4.28261D+00    |proj g|=  4.24915D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     29     47      1     0     0   5.397D-05   4.283D+00
  F =   4.2826128539143138     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iter

 This problem is unconstrained.



At iterate   10    f=  4.28377D+00    |proj g|=  5.41849D-04

At iterate   15    f=  4.28275D+00    |proj g|=  3.24899D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     16     24      1     0     0   3.077D-04   4.283D+00
  F =   4.2827471480105075     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.56248D+00    |proj g|=  8.64320D-02

At iterate    5    f=  4.43951D+00    |proj g|=  2.08991D-02


 This problem is unconstrained.



At iterate   10    f=  4.28375D+00    |proj g|=  1.21768D-03

At iterate   15    f=  4.28263D+00    |proj g|=  2.25231D-04
  ys=-3.041E-05  -gs= 3.078E-05 BFGS update SKIPPED



   evaluations in the last line search.  Termination
   may possibly be caused by a bad search direction.
 This problem is unconstrained.



           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     18     41      1     1     0   4.328D-05   4.283D+00
  F =   4.2825126642285092     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.56156D+00    |proj g|=  8.58592D-02

At iterate    5    f=  4.47452D+00    |proj g|=  2.07429D-02

At iterate   10    f=  4.28278D+00    |proj g|=  3.01054D-04

At iterate   15    f=  4.28255D+00    |proj g|=  1.43916D-03

At iter


   evaluations in the last line search.  Termination
   may possibly be caused by a bad search direction.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.56180D+00    |proj g|=  8.64715D-02

At iterate    5    f=  4.43884D+00    |proj g|=  2.09149D-02

At iterate   10    f=  4.28345D+00    |proj g|=  7.94482D-04

At iterate   15    f=  4.28238D+00    |proj g|=  4.05598D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     16     25      1     0     0   4.051D-04   4.282D+00
  F =   4.2823764476144639     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING 

 This problem is unconstrained.



At iterate   15    f=  4.28279D+00    |proj g|=  5.93934D-03

At iterate   20    f=  4.28232D+00    |proj g|=  1.26492D-03

At iterate   25    f=  4.28227D+00    |proj g|=  8.07707D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     29     56      1     0     0   1.160D-03   4.282D+00
  F =   4.2822589905038368     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.56082D+00    |proj g|=  8.64364D-02

At iter


   evaluations in the last line search.  Termination
   may possibly be caused by a bad search direction.
 This problem is unconstrained.



At iterate   10    f=  4.28380D+00    |proj g|=  3.98874D-04

At iterate   15    f=  4.28279D+00    |proj g|=  1.04912D-02

At iterate   20    f=  4.28221D+00    |proj g|=  1.83875D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     22     35      1     0     0   1.011D-03   4.282D+00
  F =   4.2822106817570988     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


In [96]:
res.get_prediction(18000).predicted_mean

18000    29.7122
dtype: float64

In [121]:
predict_sarima = np.array(predict_sarima).squeeze()
abs(predict_sarima - xs[window : window+10]).mean()

17.078359221910237

In [116]:
abs(xs - pred.astype(int)).mean()


15.71816569546491

In [60]:
%pip install hmmlearn

Collecting hmmlearn
  Downloading hmmlearn-0.3.2-cp38-cp38-macosx_10_9_universal2.whl.metadata (2.9 kB)
Downloading hmmlearn-0.3.2-cp38-cp38-macosx_10_9_universal2.whl (192 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.3/192.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hmmlearn
Successfully installed hmmlearn-0.3.2
Note: you may need to restart the kernel to use updated packages.


In [69]:
import numpy as np
from hmmlearn import hmm


remodel = hmm.CategoricalHMM(n_components=5)
remodel.fit(xs.reshape(-1,1))
Z2 = remodel.predict(xs.reshape(-1,1))

In [73]:
print(abs(Z2-xs).mean())

26.840833333333332
