<a href="https://colab.research.google.com/github/Mosenith/Colab_project/blob/main/HMM_Khmer_Segmnt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download the file from `url` and save it to outfile
import urllib.request
size = ["100","500","1000","5000","10000_seg"]
# set file size for list of available size (0-5) that you want to use
docsize = size[0]
data_dir = "kh_data_" + docsize
file_name = data_dir + ".zip"
base_url = "https://github.com/phylypo/segmentation-crf-khmer/raw/master/data/"
url = base_url + file_name
print("Downloading from:", url)
urllib.request.urlretrieve(url, file_name)

Downloading from: https://github.com/phylypo/segmentation-crf-khmer/raw/master/data/kh_data_100.zip


('kh_data_100.zip', <http.client.HTTPMessage at 0x7f29e676b5d0>)

In [2]:
print("- Unzipping the file and show last few extracted files:")
!unzip {file_name} | tail -10

print("- Count the number of files:")
!ls -al {data_dir}/*_seg.txt | wc -l
!ls

- Unzipping the file and show last few extracted files:
  inflating: kh_data_100/313540_seg.txt  
  inflating: kh_data_100/313541_orig.txt  
  inflating: kh_data_100/313541_seg.txt  
  inflating: kh_data_100/313544_orig.txt  
  inflating: kh_data_100/313544_seg.txt  
  inflating: kh_data_100/313545_orig.txt  
  inflating: kh_data_100/313545_seg.txt  
  inflating: kh_data_100/313546_orig.txt  
  inflating: kh_data_100/313546_seg.txt  
  inflating: kh_data_100/meta.txt    
- Count the number of files:
100
kh_data_100  kh_data_100.zip  sample_data


In [3]:
# Combine the content of files into one file for training and test
#!ls -alh kh_data_100/313[34]*_seg.txt
!cat kh_data_100/313[34]*_seg.txt > khmer_seg_train.txt
!cat kh_data_100/3135*_seg.txt > khmer_seg_test.txt
#!head khmer_seg_train.txt

#total line: 982 , 35K words
!wc khmer_seg_train.txt      # line count: 799, words: 28,287
!wc khmer_seg_test.txt # line count: 182, words:  7,072


   799  28287 416448 khmer_seg_train.txt
   183   7072 101702 khmer_seg_test.txt


**Model**

In [4]:
class Model(object):
    def __init__(self,states,observation,phi,trans_prob,conf_prob):
        self._states = states
        self._observation = observation
        self._phi = phi
        self._trans_prob = trans_prob
        self._conf_prob = conf_prob

    def states_length(self):
        #Return the length of the states
        return len(self._states)

    def _forward(self,observations):
        #The implemention of the forward algorithm
        s_len = self.states_length
        o_len = len(observations)
        '''
        This step should cal the alpha_t(j)
        the t is the length of the observations,
        the j is the hidden states
        '''
        alpha = [[] for i in range(o_len)]
        
        alpha[0] = {}
        #t=1,cal the intil alpha_1(j)
        for state in self._states:
            alpha[0][state] = self._conf_prob[state][observations[0]]*self._phi[state]
        
        #t>1,cal the local prob alpha_t(j)
        for index in range(1,o_len):
            alpha[index] ={}
            for state_to in self._states:
                #the time t the prob all path that direct to states_to
                prob = 0
                for state_from in self._states:
                    prob += alpha[index-1][state_from]*self._trans_prob[state_from][state_to]
                alpha[index][state_to]=self._conf_prob[state_to][observations[index]]*prob
        return alpha
        
    def _backward(self,observations):
        #The implementation of the backward algorithm
        s_len = self.states_length
        o_len = len(observations)
        '''
        This step should cal the beta_t(j)
        the t is the location of the observations,
        the j is the hidden states
        beta_t(j) = p(o_(t+1)...o_T|q_t=s_j,\lambda)
        '''
        beta = [[] for i in range(o_len)] 
        beta[o_len-1] = {}
        #t=T,the intial beta_T(j)
        for state in self._states:
            beta[o_len-1][state] = 1
        
        #t<T,cal the local prob beta_t(j)
        index = len(observations)-1
        while index > 0:
            beta[index-1] = {}
            for state_from in self._states:
                prob = 0
                for state_to in self._states:
                    prob += self._trans_prob[state_from][state_to] * \
                        self._conf_prob[state_to][observations[index]]* \
                        beta[index][state_to]
                beta[index-1][state_from] = prob
            index -= 1
        return beta
        
    def _viterbi(self,observations):
        #The implemention of the viterbi algorithm
        s_len = self.states_length
        o_len = len(observations)
        '''
        This step should cal the beta_t(j),
        the t is the length of the observations,
        the j is the hidden states,
        the beta_t(j) means at time t the most probable 
        local path to state j
        '''
        beta = [[] for i in range(o_len)]
        beta[0] = {}
        
        for state in self._states:
            beta[0][state] = self._conf_prob[state][observations[0]]*self._phi[state]
            
        #t>1,cal the local prob beta_t(j)
        for index in range(1,o_len):
            beta[index] = {}
            for state_to in self._states:
                #build a list to save the beta_t-1(j)a_jib_ikt
                prob = []
                for state_from in self._states:
                    temp = beta[index-1][state_from]*self._trans_prob[state_from][state_to]*self._conf_prob[state_to][observations[index]]
                    prob.append(temp)
                prob =sorted(prob,reverse = True)
                beta[index][state_to] = prob[0]
        return beta
    
    def _backward_point(self,beta,observations,state):
        """
        rely on the beta to get the state sequences that best 
        explain the observation sequences
        """
        index = len(observations)-1
        theta =[0 for i in range(len(observations))]
        theta[index] = state
        while index >0:
            prob = {}
            for state_from in self._states:
                prob[state_from] = beta[index-1][state_from]*self._trans_prob[state_from][state]
            state = sorted(prob,key=prob.get,reverse=True)[0]
            index -= 1
            theta[index] = state
        return theta
        
    def _inverse(self,beta):
        result = [0 for i in range(len(beta))] 
        length = len(beta)
        for i in range(len(beta)):
            result[i] = beta[length-i-1]
        return result
    
    def _intial_par(self):
        '''
        phi,trans_prob,conf_prob = {},{},{}
        N = len(self._states)
        M = len(self._observation)
        for state in self._states:
            phi[state] = 1.0/N
            trans_prob[state] = {}
            for state_to in self._states:
                trans_prob[state][state_to] = 1.0/N
            conf_prob[state] = {}
            for ob in self._observation:
                conf_prob[state][ob] = 1.0/M
        '''
        phi = self._phi
        trans_prob = self._trans_prob
        conf_prob = self._conf_prob
        return (phi,trans_prob,conf_prob)

    def _cal_gamma(self,alpha,beta,observations):
        T = len(observations)
        gamma = [[] for x in range(T)]
        for t in range(T):
            gamma[t] = {}
            sum_prob = 0
            for state in self._states:
                prob = alpha[t][state]*beta[t][state]
                sum_prob += prob
                gamma[t][state] = prob
            for state in self._states:
                if gamma[t][state] == 0:
                    continue
                else:
                    gamma[t][state] /= sum_prob
        return gamma
        
    def _cal_espi(self,alpha,beta,trans_prob,conf_prob,observations):
        T = len(observations)
        espi = [[] for x in range(T-1)]
        for t in range(T-1):
            espi[t] = {}
            sum_prob = 0
            for state_i in self._states:
                espi[t][state_i] = {}
                for state_j in self._states:
                   prob = alpha[t][state_i]*trans_prob[state_i][state_j]*conf_prob[state_j][observations[t+1]]*beta[t+1][state_j]
                   espi[t][state_i][state_j] = prob
                   sum_prob += prob
            for i in self._states:
                for j in self._states:
                    if espi[t][i][j] == 0:
                        continue
                    else:
                        espi[t][i][j] /= sum_prob
        return espi
        
    def _evaluate_par(self,gamma,espi,observations):
        T = len(observations)
        phi = gamma[0]
        trans_prob,conf_prob = {},{}
        for state in self._states:
            trans_prob[state] = {}
            conf_prob[state] = {}
        for i in self._states:
            for j in self._states:
                gamma_t,espi_t = 0,0
                for t in range(T-1):
                    espi_t += espi[t][i][j]
                    gamma_t += gamma[t][i]
                trans_prob[i][j] = espi_t/gamma_t
        for state in self._states:
            for o in self._observation:
                gamma_con_t ,gamma_t = 0,0
                for t in range(T):
                    if observations[t] == o:
                        gamma_con_t += gamma[t][state]
                    gamma_t = gamma[t][state]
                conf_prob[state][o] = gamma_con_t/gamma_t
        return (phi,trans_prob,conf_prob)

    
    def evaluate(self,observations):
        """
        use the forward algorithm to cal the 
        prob of the observation sequence under the HMM Model
        """
        length = len(observations)
        if length == 0:
            return 0
        
        alpha = self._forward(observations)
        prob = sum(alpha[length-1].values())
        return prob
        
    def decode(self,observations):
        """
        user the be viterbi algorithm to cal the most probable 
        hidden state sequence to the observations sequence ,
        """
        length = len(observations)
        if length == 0 :
            return 0
        beta = self._viterbi(observations)
        #get the last state to the last obseravtions
        sequence = beta[length-1]
        state = sorted(sequence,key=sequence.get,reverse=True)[0]
        theta = self._backward_point(beta,observations,state)
        return theta


**Process**

In [5]:
#@title pre-process code -expand to see detail
import codecs,re
import sys

class Process(object):
    def __init__(self,file_dir,S):
        self._file_dir = file_dir
        self._S = S
        self.labels =[]

    def _str2words(self,test):
        words =[]
        x=codecs.lookup("utf-8")
        for string in test:
            word = x.decode(string[0])[0]
            words.append(word)
        return words

    def _statics(self):
        f = codecs.open(self._file_dir,'rb',encoding = 'utf-8')
        hidden_states,train = [],[]
        for line in f.readlines():
            '''
            First make tag for the tokenize in the corpus
            '''
            hidden_state = ''
            words = []
            # clean up text -- remove 2 spaces to 1, and invisible spaces
            line = line.strip()
            line = line.replace("  "," ")
            line = line.replace('\u200b','') 
            line = line.replace('\r\n','')
            tokenizes = line.split()
            for token in tokenizes:
                length = len(token)
                if length == 1:
                    hidden_state += 'S'
                elif length==2:
                    hidden_state += 'BE'
                else:
                    hidden_state += 'B'+(length-2)*'M'+'E'
            '''
            Second we should extart single character from the corpus
            '''
            line = line.replace(' ','') # remove space
            #print("--line:", line)
            for word in line:
                words.append(word) # this is character, not word
                # can use kcc here
            if len(words) >0:
                train.append(words)
                hidden_states.append(hidden_state)
        print("process._statics: ",self._file_dir, " total word count:", len(train), " len hidden_state:", len(hidden_states))
        print("process._statics: ",self._file_dir, " first word/line:", train[0], " len hidden_state:", len(hidden_states))
        return (hidden_states,train)
            
    def _statics_hidden(self):
        '''
        First,get the tokenize result of the corpus,
        statics the hidden state of each word
        '''
        f = open(self._file_dir,'rb')
        hidden_states,train = [],[]
        regex=re.compile("(?x) ( [\w-]+ | [\x80-\xff]{3} )")
        for line in f.readlines():
            hidden_state = ''
            words = []
            tokenizes = line.split()
            for token in tokenizes:
                temp = [w for w in regex.split(token) if w]
                for t in temp:
                    words.append(t)
                length = len(temp)
                if length == 1:
                    hidden_state += 'S'
                elif length==2:
                    hidden_state += 'BE'
                else:
                    hidden_state += 'B'+(length-2)*'M'+'E'
            if len(words) != 0:
                train.append(words)
                hidden_states.append(hidden_state)
        return (hidden_states,train)

            
    def _word_count(self,train):
        word_count = {}
        for words in train:
            for word in words:
                if word in word_count.keys(): #word_count.has_key(word):
                    word_count[word] += 1
                else:
                    word_count[word] = 1
        return word_count
    
    def _convert(self,hidden_states):
        temp = []
        for index in range(len(hidden_states)):
            regex = re.compile("(\w{1})")
            states = [w for w in regex.split(hidden_states[index]) if w]
            if len(states) !=0:
                temp.append(states)
        return temp
    
    def _cal_trans(self,h_s):
        trans_prob,state_count = {},{}
        #intial
        for state in self._S:
            trans_prob[state]={}
            state_count[state] = 0
            for state_i in self._S:
                trans_prob[state][state_i]=0
        for i in range(len(h_s)):
            length = len(h_s[i])
            for j in range(length-1):
                s_from = h_s[i][j]
                s_to = h_s[i][j+1]
                trans_prob[s_from][s_to] += 1
                state_count[s_from] += 1
            state_count[h_s[i][length-1]] += 1
        print(state_count)
        for i in self._S:
            for j in self._S:
                trans_prob[i][j] /= float(state_count[i])
        return (trans_prob,state_count)
    
    def _cal_conf(self,h_s,test_wordcount,word_count,train,state_count):
        conf_prob = {}
        words = list(set(word_count.keys())|set(test_wordcount.keys()))
        print('The corpus has distinct count %d word'%(len(words)))
        for state in self._S:
            conf_prob[state] = {}
            for word in words:
                conf_prob[state][word] = 1
        for i in range(len(h_s)):
            length = len(h_s[i])
            for j in range(length):
                obser = train[i][j]
                hidden = h_s[i][j]
                conf_prob[hidden][obser] += 1
        for state in self._S:
            for word in words:
                if conf_prob[state][word] == 0:
                    continue
                else:
                    conf_prob[state][word] /= float(state_count[state])
        return conf_prob
        

    def _tran_conf_prob(self,train,test_wordcount,word_count,hidden_states):
        #convert the hidden_state string to list
        hidden_states = self._convert(hidden_states)
        trans_prob,state_count = self._cal_trans(hidden_states)
        conf_prob = self._cal_conf(hidden_states,test_wordcount,word_count,train,state_count)
        
        return (conf_prob,trans_prob)
        
    def _word_sequence(self,test,o_hstate):
        sequence = []
        f= open('result.txt','w')
        print('word_seq len test:', len(test))
        for i in range(len(test)):
            if o_hstate[i][-1] == 'M':
                o_hstate[i][-1] = 'E'
            elif o_hstate[i][-1] == 'B':
                o_hstate[i][-1] = 'S'
            length = len(test[i])
            temp = []
            k = 0
            #print("len(test[i]",len(test[i]), "len(o_hstate[i])",len(o_hstate[i]))
            while k < length:
                if o_hstate[i][k]=='S':
                    temp.append(test[i][k])
                else :
                    s=test[i][k]
                    k+=1
                    try:
                      if k < len(o_hstate[i]): 
                        #add by phyl to check invalid index
                        while k < len(o_hstate[i])-1 and o_hstate[i][k] != 'E' :
                          s += test[i][k]
                          k +=1
                        s += test[i][k]
                        temp.append(s)
                    except: 
                      print("exception i:",i, "k:", k, "len(o_hstate)", len(o_hstate), "len(o_hstate[i])", len(o_hstate[i]))
                k += 1
            f.write('%s\n' %(' '.join(temp)))
            #print(i, " - len(temp):", len(temp), (' - '.join(temp)))
            sequence.append(' '.join(temp))
        f.close()
            
        return sequence
      
print("[DONE]")

[DONE]


**Preprocess**

In [6]:
import codecs
import sys

#train_dir = '/content/icwb2-data/training/pku_training.utf8'
#test_dir = '/content/icwb2-data/testing/pku_test.utf8'
train_dir = 'khmer_seg_train.txt'
test_dir = 'khmer_seg_test.txt'

'''
The number of the hidden states
B:a word at the start
E:a word at the end
M:a word at the middle
S:a word construct the tokenize
'''

S = ['B','E','M','S']
pro = Process(train_dir,S)
hidden_states,train=pro._statics()

pro_test = Process(test_dir,S)
test_states,test = pro_test._statics()

test_wordcount = pro_test._word_count(test)
word_count = pro._word_count(train)

observation = word_count.keys()
print("test state len:",len(test_states), "test states0",test_states[0])
print("Observation:", len(observation), observation)

'''
The conf_prob is the probability of a observation in condition of a hidden state
The trans_prob is the probability of a  hidden state trans to another
This time add the smoothing method.
1.add 1 mehtod
'''
conf_prob,trans_prob=pro._tran_conf_prob(train,test_wordcount,word_count,hidden_states)

print('conf_prob', conf_prob)
print('trans_prob', trans_prob)

observations = test

phi = {'B':0.4,'E':0.4,'M':0.1,'S':0.1} #Begin, End, Middle, Single
model = Model(S,observation,phi,trans_prob,conf_prob)
o_hstate = []

print("- Preprocess: len of observation:", len(observations), observations[0])
for obser in observations:
    '''
    Notice,if a setence is too long,when we use viterbi algorithm it may result in the beta = 0
    There are two solution,one is split the setence into serval sub_setence,another is use log function for the viterbi 
    here we select the first method
    '''
    length = len(obser)
    index,sub_obser,state= 0,[],[]
    # end of sentence -- we already break by sentence to new line, this is not necceary
    END_TOKENS = ['。', ',','៕','។','?',')',":","\""]
    while index < length:
        sub_obser.append(obser[index])
        if obser[index] in END_TOKENS: #obser[index] == '。' or obser[index]=='，':
            sub_state = model.decode(sub_obser)
            sub_obser = []
            state += sub_state
        elif index == length-1:
            sub_state = model.decode(sub_obser)
            sub_obser = []
            state += sub_state
        index += 1
    o_hstate.append(state)

word_sequence = pro._word_sequence(observations,o_hstate)
print("-word_sequence[0]", word_sequence[0])
print("0-hstates0", o_hstate[0])

process._statics:  khmer_seg_train.txt  total word count: 799  len hidden_state: 799
process._statics:  khmer_seg_train.txt  first word/line: ['ឃ', 'ា', 'ត', '់', 'ជ', 'ន', 'ស', 'ង', '្', 'ស', '័', 'យ', 'ម', '្', 'ន', 'ា', 'ក', '់', 'ប', 'ន', '្', 'ទ', 'ា', 'ប', '់', 'ព', 'ី', 'ធ', '្', 'វ', 'ើ', 'ស', 'ក', 'ម', '្', 'ម', 'ភ', 'ា', 'ព', 'ល', 'ួ', 'ច', 'យ', 'ក', 'ក', 'ា', 'ប', 'ូ', 'ប', 'ល', 'ុ', 'យ', 'ជ', 'ន', 'រ', 'ង', 'គ', '្', 'រ', 'ោ', 'ះ', 'ក', '្', 'ន', 'ុ', 'ង', 'ផ', '្', 'ស', 'ា', 'រ', 'ល', 'ើ', 'ធ', 'ំ', 'ថ', '្', 'ម', 'ី', 'ក', '្', 'រ', 'ុ', 'ង', 'ស', 'ៀ', 'ម', 'រ', 'ា', 'ប', 'ន', 'ិ', 'ង', 'ប', 'ា', 'ន', 'ដ', 'ក', 'ហ', 'ូ', 'ត', 'ប', 'ា', 'ន', 'ទ', '្', 'រ', 'ព', '្', 'យ', 'ស', 'ម', '្', 'ប', 'ត', '្', 'ត', 'ិ', 'ជ', 'ូ', 'ន', 'ជ', 'ន', 'រ', 'ង', 'គ', '្', 'រ', 'ោ', 'ះ', 'វ', 'ិ', 'ញ']  len hidden_state: 799
process._statics:  khmer_seg_test.txt  total word count: 183  len hidden_state: 183
process._statics:  khmer_seg_test.txt  first word/line: ['អ', 'ា', 'ម', 'េ', 'រ', 'ិ'

In [7]:
# compare to segmentation text
!grep "អាមេរិក ព្រម ផ្តល់" kh_data_100/3135*_seg.txt
#!head kh_data_100/313502_orig.txt
!head kh_data_100/313502_seg.txt


kh_data_100/313502_seg.txt: អាមេរិក ព្រម ផ្តល់ សេវា គាំទ្រ បច្ចេកទេស ដល់ យន្តហោះ ចម្បាំង F - 16 ប៉ាគីស្ថាន និង លក់ គ្រឿង បន្លាស់ យន្តហោះ យោធា ឲ្យ ឥណ្ឌា 27, Jul 2019 , 10:30 pm 
 អាមេរិក ព្រម ផ្តល់ សេវា គាំទ្រ បច្ចេកទេស ដល់ យន្តហោះ ចម្បាំង F - 16 ប៉ាគីស្ថាន និង លក់ គ្រឿង បន្លាស់ យន្តហោះ យោធា ឲ្យ ឥណ្ឌា 27, Jul 2019 , 10:30 pm 
រដ្ឋាភិបាល ក្រុង វ៉ាស៊ីនតោន បាន សម្រេច អនុម័ត លក់ គ្រឿង បន្លាស់ និង ផ្តល់ ការគាំទ្រ ផ្នែក បច្ចេកទេស សម្រាប់ យន្តហោះ យោធា F - 16 របស់ ប៉ាគីស្ថាន និង យន្តហោះ ដឹកជញ្ជូន C - 17 របស់ ឥណ្ឌា ។
 យន្តហោះ ចម្បាំង F - 16 របស់ ប៉ាគីស្ថាន និង យន្តហោះ ដឹកជញ្ជូន យោធា C - 17 របស់ ឥណ្ឌា នឹង ទទួលបាន ការ សេវា គាំទ្រ បច្ចេកទេស និង ភស្តុភារ ពីសំណាក់ រដ្ឋបាល អាមេរិក បន្ទាប់ពី ក្រសួង ការបរទេស អាមេរិក សម្រេច អនុម័ត គម្រោង មាន តម្លៃ ៨០០ លាន ដុល្លារ អាមេរិក នៅ ថ្ងៃសុក្រ ទី ២៦ ខែកក្កដា ។
 បើ តាម ទីភ្នាក់ងារ ព័ត៌មាន AFP អាមេរិក សម្រេច យល់ព្រម ផ្តល់ សេវា គាំទ្រ បច្ចេកទេស និង ភស្តុភារ ដល់ យន្តហោះ F - 16 ដែល ប៉ាគីស្ថាន បាន ទិញ ពី អាមេរិក ។
 ការសម្រេច នេះ  ធ្វើឡើង ត្រឹម ប៉ុន្មាន ថ្ងៃ ប៉ុណ្ណោះ ក្រ

# **Metric**

**Custom**

In [8]:
# Custom validation
#prediction = [1,0,0,1,1,1,1,0,1,0]
#correct    = [1,0,0,1,1,1,0,0,1,0]
prediction = [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0]
correct =    [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0]

pstr = "".join(str(i) for i in prediction)
cstr = "".join(str(i) for i in correct)
print("P",pstr)
print("C",cstr)
#pl = pstr.split('1')


def calc_perf(corrects, predictions): # list of 0/1
  tp = 0
  fp = 0
  fn = 0
  n_correct = 0
  n_incorrect = 0
  total_char = 0
  total_word = 0
  n_correct_word = 0
  
  print("size of input:", len(predictions), "ground truth:", len(corrects))
  if len(predictions) != len(corrects): return 0
  
  for i, prediction in enumerate(predictions):
    correct = corrects[i]
    zipped = list(zip(prediction, correct))    
    tp +=        len([1 for l, c in zipped if l == c and l == 1])
    fp +=        len([1 for l, c in zipped if l == 1 and c == 0])
    fn +=        len([1 for l, c in zipped if l == 0 and c == 1])
    n_incorrect += len([1 for l, c in zipped if l != c])
    n_correct   += len([1 for l, c in zipped if l == c])
    #n_correct_word += len([1 for l,c in zipped if l==1 and c==1]) # not account for prediction=1 and correct=0
    #n_incorrect_word += len([1 for l,c in zipped if l==0 and c==1]) # missing other way around
    total_word += len([_ for l in correct if l==1])
    total_char += len(prediction)
    #print("len correct", len(correct), " incorrect count:", n_incorrect)
    # count good word
    n_correct_word += count_correct_word(correct, prediction)
  
  print("Total char:", str(total_char), " total word:", str(total_word), "avg char/word:", str(total_char/total_word))
  print("Correct word:" + str(n_correct_word), " incorrect word:", str(total_word - n_correct_word), "word accuracy:", n_correct_word/total_word) 
  
  precision = tp/(tp+fp)
  recall = tp/(tp+fn)
  F1 = 2 * (precision * recall) / (precision + recall)
  print("Precision:\t" + str(precision), "tp:", tp, "fp:", fp)
  print("Recall:\t\t" + str(recall), "fn:",fn)
  print("F1-score:\t" + str(F1))
  print("Accuracy:\t" + str(n_correct/(n_correct+n_incorrect))) 
  
def count_correct_word(correct, prediction):
  s = ""
  for i in range(len(correct)):
    s += "%3s" %str(i)
  #print("prediction:", prediction)
  #print("   correct:", correct)
  #print("       str:", s)
  B=False
  correct_count = 0
  for i,c in enumerate(correct):
    p = prediction[i]
    nextc = -1
    if i < len(correct)-1: 
      nextc = correct[i+1]
    if c==1 and p==1:
      B = True
      correct_count += 1
      #print(i,"Begin word corect", correct_count)
    if p==0 and c==1 and B:
      B = False
      correct_count -= 1
      #print(i, "too long")
    if c==0 and p==1: #incorrect
      if B: 
        correct_count -= 1
        #print(i,"bad word", correct_count)
        B = False
  return correct_count

correct_count = count_correct_word(prediction, correct)
print("correct count", correct_count)  
calc_perf([correct], [prediction])

from sklearn.metrics import classification_report 
print(classification_report(correct, prediction, target_names=["0","1"]))

P 1000000100010000100010010010000010010010001001000000111110000100001001001000010000001000100100000010000011100100011111100
C 1000000100010000100010000010000000010010000001000000111010000000001001001000010000001000000100010010000100100100011000010
correct count 15
size of input: 1 ground truth: 1
Total char: 121  total word: 27 avg char/word: 4.481481481481482
Correct word:15  incorrect word: 12 word accuracy: 0.5555555555555556
Precision:	0.6666666666666666 tp: 24 fp: 12
Recall:		0.8888888888888888 fn: 3
F1-score:	0.761904761904762
Accuracy:	0.8760330578512396
              precision    recall  f1-score   support

           0       0.96      0.87      0.92        94
           1       0.67      0.89      0.76        27

    accuracy                           0.88       121
   macro avg       0.82      0.88      0.84       121
weighted avg       0.90      0.88      0.88       121



**Check Accuracy**

In [9]:
# Check accuracy
test_labels = [] # predicted result
g_labels = []    # ground thruth
for i, el in enumerate(test_states):
  test_label = []
  for c in el: #string
    v = 1 if c in "BS" else 0
    test_label.append(v)
  g_label = []
  for c in o_hstate[i]:
    v = 1 if c in "BS" else 0
    g_label.append(v)
  if len(test_label) != len(g_label):
    print(i,"test_label len:", len(test_label), test_label)
    print(i,"   g_label len:", len(g_label), g_label)
    print("--- Not matching length ---- observation:", observations[i])
  test_labels.append(test_label)
  g_labels.append(g_label)
  
# check custom metric
#calc_perf(test_labels[0:1], g_labels[0:1])
calc_perf(test_labels, g_labels)

flat_predicts = [item for t in test_labels for item in t]
flat_true = [item for t in g_labels for item in t]


from sklearn.metrics import classification_report 
print(classification_report(flat_predicts, flat_true, 
      target_names=["0","1"]))

size of input: 183 ground truth: 183
Total char: 32179  total word: 7072 avg char/word: 4.550197963800905
Correct word:3099  incorrect word: 3973 word accuracy: 0.43820701357466063
Precision:	0.49704092689839124 tp: 5963 fp: 6034
Recall:		0.8431843891402715 fn: 1109
F1-score:	0.625412973936756
Accuracy:	0.7780229342117531
              precision    recall  f1-score   support

           0       0.95      0.76      0.84     25107
           1       0.50      0.84      0.63      7072

    accuracy                           0.78     32179
   macro avg       0.72      0.80      0.73     32179
weighted avg       0.85      0.78      0.79     32179

