In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.mixture as mix

In [2]:
# Read the 'N225' sheet of N225.xlsx and drop every row that contains a missing value.
data = pd.read_excel('N225.xlsx', sheet_name='N225').dropna()
# Bare expression: displayed as the cell output.
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1988-04-08,26767.289063,26926.410156,26700.919922,26748.890625,26748.890625,0.0
1,1988-04-11,26761.220703,26998.359375,26761.220703,26924.869141,26924.869141,0.0
2,1988-04-12,26939.130859,26985.740234,26845.019531,26930.839844,26930.839844,0.0
3,1988-04-13,26942.300781,26995.279297,26922.560547,26985.550781,26985.550781,0.0
4,1988-04-14,26969.369141,27123.189453,26969.369141,27111.349609,27111.349609,0.0
5,1988-04-15,27103.929688,27103.929688,26774.130859,26893.570313,26893.570313,0.0
6,1988-04-18,26883.839844,26984.109375,26671.449219,26671.449219,26671.449219,0.0
7,1988-04-19,26653.250000,26676.839844,26444.810547,26657.000000,26657.000000,0.0
8,1988-04-20,26662.779297,26894.240234,26662.779297,26864.089844,26864.089844,0.0
9,1988-04-21,26870.740234,26870.740234,26790.789063,26828.160156,26828.160156,0.0


In [None]:
# Raw columns pulled out of the frame as NumPy arrays (positional access only).
dates = data['Date'].values
opens = data['Open'].values
closes = data['Close'].values
highs = data['High'].values
lows = data['Low'].values
adjcloses = data['Adj Close'].values

# The factors:
# Intraday move in percent, (Open - Close) / Open * 100; positive when the
# close is below the open.
opclsp = ((data['Open']-data['Close'])/data['Open']).values*100
# High-low range expressed as a percentage of the close.
highlowsp = ((data['High']-data['Low'])/data['Close']).values*100
# Close-to-close percentage change; missing values (e.g. the first row, which
# has no predecessor) are set to 0. Stays a pandas Series (no .values).
prc_ret = data['Close'].pct_change().fillna(0)*100
# Store the daily return as a new column on the frame (mutates `data`).
data['Return Daily'] = prc_ret
# Percentage change in volume; missing results are set to 0.
Vol_change = data['Volume'].pct_change().fillna(0)*100
# +inf results (e.g. a change measured from a previous volume of 0) are set to 0 too.
Vol_change[Vol_change==np.inf] = 0
# Log return recovered from the percentage return: log(1 + prc_ret / 100).
log_ret = np.log((prc_ret+100)/100)


# Positional cut points at 20/40/60/80 % of the row count, plus the position
# of the last row.
# NOTE(review): if consecutive entries are used as half-open [start:stop)
# ranges, ending at len(data)-1 leaves the final row out of the last range —
# confirm whether len(data) was intended.
slices = [int(len(data)*0.2),int(len(data)*0.4),int(len(data)*0.6),int(len(data)*0.8),len(data)-1]
# Bare expression; it is not the last statement of the cell, so nothing is displayed.
slices

# Remove the rows test_p[0]:test_p[1] from an array and return the remaining rows
def slicer(array, test_p):
    """Return ``array`` with the rows ``test_p[0]:test_p[1]`` removed.

    Parameters
    ----------
    array : array-like
        Data whose first axis indexes observations. Any number of
        dimensions (>= 1) is accepted; trailing dimensions are preserved.
    test_p : sequence of two ints
        ``(start, stop)`` positions of the block to cut out; ``stop`` is
        exclusive, following normal slice semantics.

    Returns
    -------
    numpy.ndarray
        A new array made of the rows before ``start`` followed by the rows
        from ``stop`` onwards.
    """
    array = np.asarray(array)
    start, stop = test_p[0], test_p[1]
    # Joining along axis 0 keeps the trailing shape intact, so there is no
    # need to flatten and reshape (which failed on 1-D input and relied on
    # float division to recover the row count).
    return np.concatenate((array[:start], array[stop:]), axis=0)

def to_float(x):
    """Return a list holding every element of ``x`` rounded to 4 decimals.

    Elements are read as ``x[i]`` for ``i`` in ``range(len(x))``, so ``x``
    must support ``len()`` and integer indexing.
    """
    rounded = []
    for i in range(len(x)):
        rounded.append(round(x[i], 4))
    return rounded

# Split the log returns into training/test periods and fit a 2-component Gaussian mixture
def structuring_data(test_p,
                     window=0,
                     vollst=0,
                     random_state=None):
    """Fit a 2-component Gaussian mixture on training log returns and label both periods.

    The module-level ``dates``, ``closes`` and ``log_ret`` are first trimmed
    by ``window`` leading observations. Positions ``test_p[0]:test_p[1]`` of
    the trimmed series form the test period; everything before and after it
    is the training period. The only model feature is the log return.

    Parameters
    ----------
    test_p : sequence of two ints
        ``(start, stop)`` positions of the test period (``stop`` exclusive),
        relative to the trimmed series.
    window : int, default 0
        Number of leading observations to skip.
    vollst : any, default 0
        Unused; kept so existing calls keep working.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``GaussianMixture``. Pass an int to make the fit (and
        therefore the state labels) reproducible; ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    list
        ``[train_hiddens, dates_train, closes_train, lr_train,
        hiddens, dates_test, closes_test, lr_test, variances]`` where
        ``*_hiddens`` are the predicted component labels (0 or 1) and
        ``variances`` is a two-element list with the diagonal of each
        component's covariance matrix.

    Notes
    -----
    Prints diagnostics (variances, scores, state counts, means, lengths) and
    calls ``np.set_printoptions(suppress=True)``, which changes NumPy's
    global print settings.
    """
    start, stop = test_p[0], test_p[1]

    date = dates[window:]
    # Trim closes exactly like the other series. Previously the untrimmed
    # array was sliced, so for window > 0 prices were shifted against the
    # dates/returns and the training lengths did not match.
    cl = closes[window:]
    lr = log_ret[window:]

    # Single feature column of shape (n, 1): the log return.
    x = np.column_stack([lr])
    x_train = slicer(x, test_p)
    x_test = x[start:stop]
    dates_train = np.append(date[:start], date[stop:])
    dates_test = date[start:stop]

    model = mix.GaussianMixture(n_components=2, covariance_type="full",
                                random_state=random_state)
    model.fit(x_train)
    train_hiddens = model.predict(x_train)
    variances = [np.diag(model.covariances_[0]), np.diag(model.covariances_[1])]
    print('Training Period Variances:')
    print(variances[0])
    print(variances[1])

    hiddens = model.predict(x_test)
    print('Score for test:',model.score(x_test))
    print('Score for training:',model.score(x_train))

    # The two counts below are for component labels 0 and 1 respectively.
    print('States 1: %d. States 2: %d.'%(len(hiddens[hiddens==0]),len(hiddens[hiddens==1])))
    np.set_printoptions(suppress=True)
    print('The means of state 0:')
    print(model.means_[0])
    print('The means of state 1:')
    print(model.means_[1])

    print(len(x_test),len(dates_test))
    closes_train = np.append(cl[:start], cl[stop:])
    closes_test = cl[start:stop]
    lr_train = np.append(lr[:start], lr[stop:])
    lr_test = np.array(lr[start:stop])

    # Sanity check: the four training lengths should agree, as should the
    # four test lengths.
    print(len(train_hiddens),len(closes_train),len(dates_train),len(lr_train),len(hiddens),len(closes_test),len(dates_test),len(lr_test))

    return [train_hiddens, dates_train, closes_train, lr_train,
            hiddens, dates_test, closes_test, lr_test,
            variances]

def plot_scatter(ret, wind, vollst, highvol_mark,
                 testing=True):
    """Scatter log return against ``vollst``, coloured by predicted state.

    Points whose state equals ``highvol_mark`` are red, all others green.

    Parameters
    ----------
    ret : sequence
        Result list in the layout returned by ``structuring_data``:
        index 0 / 3 are the training states / log returns, index 4 / 7 the
        test states / log returns.
    wind : int
        First position to plot; positions ``wind .. len(states) - 1`` are drawn.
    vollst : array-like
        Y values, indexed by the same positions as the chosen states.
    highvol_mark : int
        State label to highlight in red.
    testing : bool, default True
        Plot the test period when true, otherwise the training period.

    Returns
    -------
    None
    """
    fig = plt.figure()
    fig.set_size_inches((18.5,10.5))
    if testing:
        _scatter_by_state(ret[4], ret[7], vollst, wind, highvol_mark, size=4)
        plt.title('Testing Period LogReturn with Vol')
    else: #Training
        _scatter_by_state(ret[0], ret[3], vollst, wind, highvol_mark, size=2)
        plt.title('Training Period LogReturn with Vol')
    plt.show()
    return


def _scatter_by_state(states, returns, vollst, wind, highvol_mark, size):
    """Draw positions ``wind .. len(states)-1`` in one scatter call, red where the state equals ``highvol_mark`` and green elsewhere."""
    idx = np.arange(wind, len(states))
    if idx.size == 0:
        # Nothing in range: leave the axes empty, as the per-point loop did.
        return
    point_states = np.asarray(states)[idx]
    xs = np.asarray(returns)[idx]
    ys = np.asarray(vollst)[idx]
    # One colour per point keeps the original drawing order while creating a
    # single artist instead of one per observation.
    colors = np.where(point_states == highvol_mark, 'r', 'g').tolist()
    plt.scatter(xs, ys, c=colors, s=size)