This is the IOHMM model with its parameters learned in a semi-supervised way. By providing some labeled data, we steer the learning process in a particular direction. The hidden states of the unlabeled data are estimated iteratively using the EM algorithm. See the notes in http://pages.cs.wisc.edu/~jerryzhu/pub/sslicml07.pdf

# Example use of SemiSupervised_IOHMM 

In [1]:
%load_ext autoreload
%autoreload 2

from __future__ import  division
import numpy as np
from copy import deepcopy
import sys
sys.path.append('./main')
sys.path.append('./auxiliary')
from IOHMM import SemiSupervisedIOHMM, SemiSupervisedIOHMMMapReduce
from SupervisedModels import LM, MNLP, MNLD
try:
    # Preferred location; newer SciPy releases no longer ship scipy.misc.logsumexp.
    from scipy.special import logsumexp
except ImportError:
    # Fallback for old SciPy releases that only expose it under scipy.misc.
    from scipy.misc import logsumexp
import pandas as pd
import warnings
warnings.simplefilter("ignore")

## Speed data

### The sequence data

In [2]:
# Load the speed data set from CSV and preview its first rows.
speed = pd.read_csv('data/speed.csv')
speed.head()

Unnamed: 0.1,Unnamed: 0,rt,corr,Pacc,prev
0,1,6.45677,cor,0.0,inc
1,2,5.602119,cor,0.0,cor
2,3,6.253829,inc,0.0,cor
3,4,5.451038,inc,0.0,inc
4,5,5.872118,inc,0.0,inc


### The labeled states

In our code, the labeled states are passed as a dictionary: each key is an index in the sequence (e.g. 0, 5), and each value is a one-out-of-n (one-hot) array whose kth entry is 1 if the hidden state at that index is k, where n is the total number of states.

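In the following example, we assume that the "corr" column gives the correct hidden states, and that only the first half of the sequence is labeled. For each labeled row, the code below also overwrites the output "rt" with 1 (for "cor") or 0 (otherwise).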

In [3]:
# Number of rows in the sequence data.
speed.shape[0]

439

In [4]:
# Build the dictionary of labeled hidden states for the first half of the
# sequence. Keys are row positions; values are one-hot arrays over 4 states.
states = {}
corr = np.array(speed['corr'])
# Floor division: for an odd-length sequence the middle row stays unlabeled.
for i in range(len(corr) // 2):
    if corr[i] == 'cor':
        # 'cor' rows are labeled as state 1 and their output is pinned to 1.
        states[i] = np.array([0, 1, 0, 0])
        # .at replaces DataFrame.set_value, which was removed from pandas.
        speed.at[i, 'rt'] = 1
    else:
        # Every other row is labeled as state 0 and its output is pinned to 0.
        states[i] = np.array([1, 0, 0, 0])
        speed.at[i, 'rt'] = 0

In [5]:
# Row count after labeling: only 'rt' values were overwritten, no rows were added.
speed.shape[0]

439

In [6]:
# Number of labeled positions (the first half of the sequence).
len(states)

219

## Setting up the model

In [7]:
# Semi-supervised IOHMM with 4 hidden states.
SHMM = SemiSupervisedIOHMM(num_states=4)
# One LM emission model and an MNLP transition model using the 'lbfgs' solver.
SHMM.setModels(model_emissions = [LM()], model_transition=MNLP(solver='lbfgs'))
# Empty covariate lists for the initial, transition, and emission models
# (presumably this yields intercept-only models -- confirm against IOHMM).
SHMM.setInputs(covariates_initial = [], covariates_transition = [], covariates_emissions = [[]])
# A single output, the 'rt' column, for the single emission model.
SHMM.setOutputs([['rt']])
# One sequence: the data frame paired with its dictionary of labeled states.
SHMM.setData([[speed, states]])

## Start training

In [8]:
# Fit the model. The numbers printed below are emitted during training
# (presumably the log-likelihood after each EM iteration -- confirm in IOHMM.train).
SHMM.train()

-164.959670354
-162.025020679
-160.684435566
-160.290398374
-160.05216391
-159.007237916
-143.002425466
-97.0842474111
-69.2584066599
-66.486410055
-61.8979553772
-61.6762352018
-57.1299413718
-49.7340004002
-44.2543645717
-44.220820683
-44.2210747114
-44.35206149
-44.3903753351
-44.3985865598
-44.4003312251
-44.4007000757
-44.4007779689


## See coefficients

In [9]:
# Softmax-normalize the transition coefficients of the first two states so
# that each printed row sums to 1. (Presumably model_transition[k] is the
# transition model out of hidden state k -- confirm against IOHMM.)
# print(...) with a single argument behaves the same on Python 2 and 3.
for k in (0, 1):
    coef = SHMM.model_transition[k].coef
    print(np.exp(coef - logsumexp(coef)))

[[ 0.42176108  0.5332247   0.02250711  0.02250711]]
[[ 0.15113613  0.81203853  0.01918793  0.01763742]]


In [10]:
# Print the coefficients of the first emission model for each of the 4 states.
# (Presumably model_emissions[k][0] belongs to hidden state k -- confirm.)
# print(...) with a single argument behaves the same on Python 2 and 3.
for k in range(4):
    print(SHMM.model_emissions[k][0].coef)

[-0.]
[ 1.]
[ 5.47279257]
[ 6.39096276]


In [11]:
# Print the square root of each state's emission dispersion
# (presumably the residual standard deviation of the LM -- confirm).
# print(...) with a single argument behaves the same on Python 2 and 3.
for k in range(4):
    print(np.sqrt(SHMM.model_emissions[k][0].dispersion))

0.0
0.0
0.182276997053
0.224620703134


## MapReduce Version

In [12]:
# NOTE(review): neither `sc` nor `SparkContext` is defined in the cells shown
# here; presumably both come from a PySpark shell/kernel -- confirm.
# Stop the existing context before creating a fresh one.
sc.stop()
# pyFiles lists the project modules handed to Spark along with the job.
# NOTE(review): the app name says "UnSupervised" although this notebook runs
# the semi-supervised model -- looks like a copy-paste leftover; confirm.
sc = SparkContext(appName="Python_UnSupervised_IOHMM_MapReduce", pyFiles=[
    './auxiliary/HMM.py',
    './auxiliary/SupervisedModels.py',
    './auxiliary/family.py',
    './main/IOHMM.py'])

In [13]:
# Reload the data so the MapReduce run starts from the original CSV values,
# then rebuild the labeled-state dictionary for the first half of the sequence.
speed = pd.read_csv('data/speed.csv')
states = {}
corr = np.array(speed['corr'])
# Floor division: for an odd-length sequence the middle row stays unlabeled.
for i in range(len(corr) // 2):
    if corr[i] == 'cor':
        # 'cor' rows are labeled as state 1 and their output is pinned to 1.
        states[i] = np.array([0, 1, 0, 0])
        # .at replaces DataFrame.set_value, which was removed from pandas.
        speed.at[i, 'rt'] = 1
    else:
        # Every other row is labeled as state 0 and its output is pinned to 0.
        states[i] = np.array([1, 0, 0, 0])
        speed.at[i, 'rt'] = 0

In [14]:
# Two keyed placeholder records; the values are discarded by mapValues below.
indexes = [(1,1), (2,1)]
RDD = sc.parallelize(indexes)
# Replace every value with the same [data frame, labeled states] pair, so the
# RDD holds two copies of the one sequence under different keys.
dfs_states = RDD.mapValues(lambda v: [speed, states])

In [16]:
# MapReduce variant of the semi-supervised IOHMM with 4 hidden states, capped
# at 100 EM iterations with a tolerance of 1e-6.
SHMM = SemiSupervisedIOHMMMapReduce(num_states=4, max_EM_iter=100, EM_tol=1e-6)
# Same model configuration as the single-machine run: one LM emission model,
# an MNLP transition model, empty covariate lists, and 'rt' as the only output.
SHMM.setModels(model_emissions = [LM()], model_transition=MNLP(solver='lbfgs'))
SHMM.setInputs(covariates_initial = [], covariates_transition = [], covariates_emissions = [[]])
SHMM.setOutputs([['rt']])
# The data is supplied as an RDD of [data frame, labeled states] values.
SHMM.setData(dfs_states)
SHMM.train()
# print(...) with a single argument behaves the same on Python 2 and 3.
print('done')

-323.817629492
-323.655376194
-322.264357484
-322.36577288
-322.460203662
-322.36518968
-323.067586601
-323.160001222
-322.430550335
-322.923114532
-322.532788688
-323.236472578
-323.2748324
-322.876876983
-323.060060092
-322.793643075
-322.707212423
-322.668641465
-322.543763646
-322.447603007
-322.090366888
-321.572216489
-321.555608227
-320.941027603
-321.563730232
-320.680582288
-318.642259708
-313.296724295
-296.739459247
-269.902260126
-168.533162104
-95.2275142182
-92.4586323464
-91.1610817588
-90.934065195
-90.8776724129
-90.8626762541
-90.8585893727
-90.8574677359
-90.8571593068
-90.8570744492
-90.857051099
-90.8570446735
-90.8570429053
-90.8570424187
done


In [17]:
# Print the coefficients of the first emission model for each of the 4 states
# after the MapReduce training run.
# (Presumably model_emissions[k][0] belongs to hidden state k -- confirm.)
# print(...) with a single argument behaves the same on Python 2 and 3.
for k in range(4):
    print(SHMM.model_emissions[k][0].coef)

[-0.]
[ 1.]
[ 6.39057687]
[ 5.47205369]


In [18]:
# Shut down the Spark context now that the MapReduce example is finished.
sc.stop()