<a href="https://colab.research.google.com/github/Moojin-Bin/Hidden_Markov_Model/blob/master/GMM_for_Finance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

An Application of Gaussian Mixture Models for Financial Markets

In [None]:
import numpy as np
import pandas as pd
import pandas_datareader as pdr
from sklearn import mixture
from matplotlib import pyplot as plt

  from pandas.util.testing import assert_frame_equal


In [None]:
def df_to_array(df):
    """Convert a pandas object into a float column vector.

    Missing values are dropped first, the remaining values are flattened,
    cast to float and reshaped to ``(n_samples, 1)``, i.e. the 2-D layout
    that estimators taking a single feature expect.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Input data; rows containing NaN are discarded.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, 1)`` with dtype ``float64``.
    """
    # The `np.float` alias was removed in NumPy 1.24; the builtin `float`
    # yields the same float64 dtype on every NumPy version.
    array = np.ravel(df.dropna()).astype(float).reshape(-1, 1)
    return array

In [None]:
def plot_best_fit_mixture(df, max_n_components):
    """Fit Gaussian mixtures of increasing size and plot the preferred one.

    One ``GaussianMixture`` (``n_init=10``) is fitted for every component
    count from 1 to ``max_n_components``.  AIC and BIC are evaluated for each
    fit, and the model at the smaller of the two arg-min positions (i.e. the
    one with fewer components) is kept.  A three-panel figure is then drawn:

    1. AIC and BIC against the number of components,
    2. a histogram of the data with the mixture density and the
       per-component densities,
    3. stacked posterior probabilities of each component over the data range.

    Parameters
    ----------
    df : pandas object
        Data handed to ``df_to_array`` to obtain an ``(n, 1)`` sample array.
    max_n_components : int
        Largest number of mixture components to try.

    Returns
    -------
    None
        The figure is created through ``pyplot``; nothing is returned.
    """
    samples = df_to_array(df)

    component_counts = np.arange(1, max_n_components + 1)
    fitted = [
        mixture.GaussianMixture(component_counts[k], n_init=10).fit(samples)
        for k in range(max_n_components)
    ]

    aic_scores = [model.aic(samples) for model in fitted]
    bic_scores = [model.bic(samples) for model in fitted]
    # When the two criteria disagree, keep the model with fewer components.
    best_position = min(np.argmin(aic_scores), np.argmin(bic_scores))
    best_model = fitted[best_position]

    fig = plt.figure(figsize=(12, 4))
    fig.subplots_adjust(wspace=0.3)

    # Panel 1: information criteria
    criteria_ax = fig.add_subplot(131)
    criteria_ax.plot(component_counts, aic_scores, '-k', label='AIC')
    criteria_ax.plot(component_counts, bic_scores, '--k', label='BIC')
    criteria_ax.set_title('Theoretical Information Criterion')
    criteria_ax.set_xlabel('n. components')
    criteria_ax.set_ylabel('information criterion')
    criteria_ax.legend()

    # Panel 2: histogram and best-fit mixture
    density_ax = fig.add_subplot(132)

    x_min = samples.min() - 0.2
    x_max = samples.max() + 0.2
    grid = np.linspace(x_min, x_max, 1000)
    grid_column = grid.reshape(-1, 1)

    # score_samples returns log-densities; exponentiating gives the mixture
    # pdf, which may exceed 1 because it is a density, not a probability.
    log_density = best_model.score_samples(grid_column)
    # Posterior probability of each component at every grid point.
    posteriors = best_model.predict_proba(grid_column)
    mixture_pdf = np.exp(log_density)
    component_pdfs = posteriors * mixture_pdf[:, np.newaxis]

    density_ax.hist(samples, 30, density=True, histtype='stepfilled', alpha=0.4)
    density_ax.plot(grid, mixture_pdf, '-k')
    density_ax.plot(grid, component_pdfs, '--k')
    density_ax.set_title("Best-fit Mixture")
    density_ax.set_xlabel('$x$')
    density_ax.set_ylabel('$p(x)$')

    # Panel 3: posterior probabilities for each component
    posterior_ax = fig.add_subplot(133)

    n_best = best_model.n_components
    # Cumulative sums over components give the upper edge of each band.
    band_edges = posteriors.cumsum(1).T

    for k in range(n_best):
        # The first band starts at zero; later bands sit on the previous edge.
        band_floor = 0 if k == 0 else band_edges[k - 1]
        posterior_ax.fill_between(
            grid, band_floor, band_edges[k],
            color='gray', alpha=(1 + k) / n_best,
        )
    posterior_ax.set_xlim(x_min, x_max)
    posterior_ax.set_ylim(0, 1)
    posterior_ax.set_title('Posterior Probabilities')
    posterior_ax.set_xlabel('$x$')
    posterior_ax.set_ylabel(r'$p({\rm class}|x)$')

In [None]:
def GMM(df, begin, n_components, random_state=None):
    """Split a series into regimes using a Gaussian mixture model.

    The data are sliced from ``begin`` onward, forward-filled, and any rows
    that are still missing (leading NaNs that forward-fill cannot reach) are
    dropped so that the fitted samples line up one-to-one with the index.
    A ``GaussianMixture`` with ``n_components`` components is fitted and each
    observation is assigned to its most likely component ("regime").

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; assumed to hold a single column of values
        (NOTE(review): confirm against callers).
    begin : slice start
        Passed to ``df[begin:]`` to select the starting row.
    n_components : int
        Number of mixture components / regimes.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``GaussianMixture`` so that fits can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    gmm_res : pandas.DataFrame
        One column per regime (``Regime_0``, ``Regime_1``, ...) holding the
        observed value where that regime was assigned and NaN elsewhere.
    stats : pandas.DataFrame
        Minimum, mean and maximum of every regime column, built from the
        ``Regime_<i>_min`` / ``_avg`` / ``_max`` entries.
    hsp : pandas.DataFrame
        Posterior probability of each regime for every observation.
    """
    # `fillna(method='ffill')` is deprecated in recent pandas; `.ffill()` is
    # the supported spelling.  Forward-fill cannot fill leading NaNs, so drop
    # what remains: otherwise df_to_array would discard those rows and the
    # regime mask below would no longer match the length of `df`.
    df = df[begin:].ffill().dropna()

    data = df_to_array(df)

    gmm = mixture.GaussianMixture(n_components, random_state=random_state)
    gmm.fit(data)

    hidden_states = gmm.predict(data)
    hidden_states_prob = gmm.predict_proba(data)

    z = {}
    stat = {}    # Statistics = {Minimum, Maximum, Mean}
    for i in range(n_components):
        label = 'Regime_%d' % i
        mask = hidden_states == i
        regime = pd.DataFrame(df.values[mask], index=df.index[mask], columns=[label])
        # Re-expand to the full index so all regime columns share one axis.
        regime = regime.reindex(df.index, fill_value=np.nan)
        z[label] = regime
        stat['Regime_%d_min' % i] = regime.min()
        stat['Regime_%d_avg' % i] = regime.mean()
        stat['Regime_%d_max' % i] = regime.max()

    regimes = list(z.keys())

    hsp = pd.DataFrame(hidden_states_prob, index=df.index, columns=regimes)    # Hidden States Probability

    # Concatenate once instead of growing a frame inside a loop that was
    # seeded with an empty DataFrame.
    gmm_res = pd.concat([z[name] for name in regimes], axis=1)

    stats = pd.DataFrame.from_dict(stat, orient='index')

    return gmm_res, stats, hsp


In [None]:
# Download the 'VIXCLS' series from the 'fred' data source via
# pandas_datareader; '19700101' is passed as the third positional argument
# (presumably the start date -- confirm against the DataReader signature).
# Requires network access.
df = pdr.DataReader('VIXCLS', "fred", '19700101')