In [17]:
import nltk
from numpy import array, ones, zeros, multiply
import numpy as np
import sys
from pprint import pprint
import operator

epsilon=1e-1

class HMM:
        def __init__(self, state_list, observation_list,
                 transition_proba = None,
                 observation_proba = None,
                 initial_state_proba = None, smoothing_obs = 0.01):
            print "HMM creating with: "
            self.N = len(state_list)       # number of states
            self.M = len(observation_list) # number of possible emissions
            print str(self.N)+" states"
            print str(self.M)+" observations"
            self.omega_Y = state_list
            self.omega_X = observation_list
            if transition_proba is None:
                self.transition_proba = zeros( (self.N, self.N), float) 
            else:
                self.transition_proba=transition_proba
            if observation_proba is None:
                self.observation_proba = zeros( (self.M, self.N), float) 
            else:
                self.observation_proba=observation_proba
            if initial_state_proba is None:
                self.initial_state_proba = zeros( (self.N,), float ) 
            else:
                self.initial_state_proba=initial_state_proba
            self.make_indexes() # build indexes, i.e the mapping between token and int
            self.smoothing_obs = smoothing_obs 
            
        def make_indexes(self):
            """Creates the reverse table that maps states/observations names
            to their index in the probabilities array"""
            self.Y_index = {}
            for i in range(self.N):
                self.Y_index[self.omega_Y[i]] = i
            self.X_index = {}
            for i in range(self.M):
                self.X_index[self.omega_X[i]] = i
      
        def get_observationIndices( self, observations ):
            """return observation indices, i.e 
            return [self.O_index[o] for o in observations]
            and deals with OOVs
            """
            indices = zeros( len(observations), int )
            k = 0
            for o in observations:
                if o in self.X_index:
                    indices[k] = self.X_index[o]
                else:
                    raise("Erreur")
                k += 1
            return indices

    
        def data2indices(self, sent): 
            """From one tagged sentence of the brown corpus: 
            - extract the words and tags 
            - returns two list of indices, one for each
            -> (wordids, tagids)
            """
            wordids = list()
            tagids  = list()
            for couple in sent:
                wrd = couple[0]
                tag = couple[1]
                if wrd in self.X_index:
                    wordids.append(self.X_index[wrd])
                else:
                    raise("Erreur data2indices")
                tagids.append(self.Y_index[tag])
            return wordids,tagids
            
        def observation_estimation(self, pair_counts):
            """ Build the observation distribution: 
                observation_proba is the observation probablility matrix
                    [b_ki],  b_ki = Pr(X_t=v_k|Y_t=q_i)"""
            # fill with counts
            for pair in pair_counts:
                wrd=pair[0]
                tag=pair[1]
                cpt=pair_counts[pair]
                k = 0 # for <unk>
                if wrd in self.X_index: 
                    k=self.X_index[wrd]
                i=self.Y_index[tag]
                self.observation_proba[k,i]=cpt
            # normalize
            self.observation_proba=self.observation_proba+self.smoothing_obs
            self.observation_proba=self.observation_proba/self.observation_proba.sum(axis=0).reshape(1,self.N)
            
        
        def transition_estimation(self, trans_counts):
            """ Build the transition distribution: 
                transition_proba is the transition matrix with : 
                [a_ij] a[i,j] = Pr(Y_(t+1)=q_i|Y_t=q_j)
            """
            # fill with counts
            for pair in trans_counts:
                i=self.Y_index[pair[1]]
                j=self.Y_index[pair[0]]
                self.transition_proba[j,i]=trans_counts[pair]
            # normalize
            self.transition_proba=self.transition_proba/self.transition_proba.sum(axis=0).reshape(1,self.N)
        
        def init_estimation(self, init_counts):
            """Build the init. distribution"""
            # fill with counts
            for tag in init_counts:
                i=self.Y_index[tag]
                self.initial_state_proba[i]=init_counts[tag]
            # normalize
            self.initial_state_proba=self.initial_state_proba/sum(self.initial_state_proba)
             
        
        def supervised_training(self, pair_counts, trans_counts,init_counts):
            """ Train the HMM's parameters. This function wraps everything"""
            self.observation_estimation(pair_counts)
            self.transition_estimation(trans_counts)
            self.init_estimation(init_counts)
        
        def viverbit(self,mots):
            alpha = np.zeros((self.N,len(mots)))
            xi = np.zeros((self.N,len(mots)))
            #init
            #print(self.observation_estimation)
            i = 0
            if mots[0] in self.X_index:
                i = self.X_index[mots[0]]
            alpha[:,0] = self.initial_state_proba*self.observation_proba[i]
            print(alpha[:,0])
            print(self.omega_Y)
            for i in range(1,len(mots)):
                for j in range(self.N):
                    #self.observation_proba : mot puis j : type
                    indi = 0
                    if mots[i] in self.X_index:
                        indi = self.X_index[mots[i]]
                    liste = [alpha[k,i-1] * self.transition_proba[k,j]* self.observation_proba[indi,j] for k in range(self.N)]
                    alpha[j,i] = np.max(liste)
                    xi[j,i] = np.argmax(liste)
            tags = []
            debut = len(xi)
            starting = np.argmax(alpha[:,len(xi[0])-1])
            tags.append(starting)
            count = len(xi[0])
            '''print(alpha)
            print(xi)'''
            while len(tags) != len(mots):
                count -= 1
                new_index = xi[starting,count]
                tags.append(new_index)
                starting = new_index
            tags = tags[::-1]
            to_return  = []
            count = 0
            for i in tags:
                to_return.append((mots[count],self.Y_index.keys()[self.Y_index.values().index(i)]))
                count +=1
            return to_return
                 
        def evaluate(self,test_data):
            errors = 0
            total = 0
            erreur_false = 0
            total_false = 0
            erreur_2 = 0
            
            correction = 0
            correction_totale = 0
            for i in range(len(test_data)):
                res = self.viverbit(map(operator.itemgetter(0), test_data[i]))
                if sum([a!=b for a,b in test_data[i]])>0:
                    total_false +=1 
                    erreur_false += sum([a[1]!=b[1] for a,b in zip(res,test_data[i])])
                for a,b in zip(res,test_data[i]):
                    if(b[0] != b[1] and b[1]==a[1]):
                        correction +=1
                    correction_totale +=1
                erreur_2 += sum([a!=b for a,b in test_data[i]])
                errors += sum([a[1]!=b[1] for a,b in zip(res,test_data[i])])
                total += len(res)
            print("Percentage of errors : " ,  (errors/float(total))*100.0)
            print("Pourcentage de correction  : ", (erreur_false/float(total_false)))
            print("Taux d erreur brut " , ((erreur_2/float(total))*100.0))
            print(total)
            print("taux correction calcul  2 " , correction, ((correction/float(correction_totale))*100.0),correction_totale)
            
hmm = HMM(state_list=ctags.keys(), observation_list=vocab,
                 transition_proba = None,
                 observation_proba = None,
                 initial_state_proba = None,
                 smoothing_obs = 1e-2)
hmm.supervised_training(cpairs,ctrans,cinits)
#print("test")
#print(test[1])
print("test")
for i in range(1):
    print(test[i])
    print(hmm.viverbit(map(operator.itemgetter(0), test[i])))
#hmm.evaluate(test)
my_test = [('s', 's'), ('e', 'e'), ('z', 'e'), ('m', 'm'), ('s', 's')]
print(my_test)
print(hmm.viverbit(map(operator.itemgetter(0), my_test)))

HMM creating with: 
26 states
26 observations
test
[('t', 't'), ('h', 'h'), ('e', 'e')]
[  1.00798608e-07   8.66057353e-08   2.37550837e-07   1.67402941e-08
   6.46430576e-08   3.32860393e-04   6.74707901e-04   7.37449284e-08
   5.39662390e-08   4.60609578e-08   1.71662314e-07   1.06804460e-07
   3.61995660e-08   6.60605875e-08   2.66430246e-08   7.78726151e-08
   1.64199355e-07   7.65700053e-08   6.54787035e-04   2.52121537e-08
   1.57297641e-01   2.44227735e-07   2.42855435e-08   6.48439582e-05
   3.76450614e-09   0.00000000e+00]
['a', 'c', 'b', 'e', 'd', 'g', 'f', 'i', 'h', 'k', 'j', 'm', 'l', 'o', 'n', 'q', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x', 'z']
[('t', 't'), ('h', 'h'), ('e', 'e')]
[('s', 's'), ('e', 'e'), ('z', 'e'), ('m', 'm'), ('s', 's')]
[  3.55829167e-03   8.66057353e-08   2.37550837e-07   1.67402941e-08
   6.40030914e-04   6.16294007e-08   1.05406640e-07   7.37449284e-08
   5.39662390e-08   4.60609578e-08   1.71662314e-07   1.06804460e-07
   3.61995660e-08   6.6060



# Compter les mots et les tags

In [2]:
def make_counts(corpus):
    """ 
    Build different count tables to train a HMM. Each count table is a dictionnary. 
    Returns: 
    * c_words: word counts
    * c_tags: tag counts
    * c_pairs: count of pairs (word,tag)
    * c_transitions: count of tag bigram 
    * c_inits: count of tag found in the first position
    """
    c_words = dict()
    c_tags = dict()
    c_pairs= dict()
    c_transitions = dict()
    c_inits = dict()
    for sent in corpus:
        # we use i because of the transition counts
        for i in range(len(sent)):
            couple=sent[i]
            wrd = couple[0]
            tag = couple[1]
            # word counts
            if wrd in c_words:
                c_words[wrd]=c_words[wrd]+1
            else:
                c_words[wrd]=1
            # tag counts
            if tag in c_tags:
                c_tags[tag]=c_tags[tag]+1
            else:
                c_tags[tag]=1
            # observation counts
            if couple in c_pairs:
                c_pairs[couple]=c_pairs[couple]+1
            else:
                c_pairs[couple]=1
            # i >  0 -> transition counts
            if i > 0:
                trans = (sent[i-1][1],tag)
                if trans in c_transitions:
                    c_transitions[trans]=c_transitions[trans]+1
                else:
                    c_transitions[trans]=1
            # i == 0 -> counts for initial states
            else:
                if tag in c_inits:
                    c_inits[tag]=c_inits[tag]+1
                else:
                    c_inits[tag]=1
                    
    return c_words,c_tags,c_pairs, c_transitions, c_inits


# Création du vocabulaire (filtrage selon le nombre d'occurence)

In [3]:
def make_vocab(c_words, threshold):
    """ 
    return a vocabulary by thresholding word counts. 
    inputs: 
    * c_words : a dictionnary that maps word to its counts
    * threshold: count must be >= to the threshold to be included
    
    returns: 
    * a word list
    """
    voc = list()
    for w in c_words:
        if c_words[w] >= threshold:
            voc.append(w)
    return voc


# les données


In [4]:
import cPickle

with open("typos-data/test10.pkl", "rb") as input_file:
    test = cPickle.load(input_file)
    
with open("typos-data/train10.pkl", "rb") as input_file:
    train = cPickle.load(input_file)
print "Nombre de phrases de train = "+str(len(train))
print "Nombre de phrases de test  = "+str(len(test))

Nombre de phrases de train = 29057
Nombre de phrases de test  = 1501


In [5]:
cwords,ctags,cpairs,ctrans,cinits = make_counts(train)
print "Nombre de mots  : "+str(len(cwords))
print "Nombre de tags  : "+str(len(ctags))
print "Nombre de paires: "+str(len(cpairs))
print "Nombre de trans : "+str(len(ctrans))+ " / "+ str(12*12)
print "Nombre de init. : "+str(len(cinits))
vocab = make_vocab(cwords,10)
print "Vocabulaire :"+str(len(vocab))

Nombre de mots  : 26
Nombre de tags  : 26
Nombre de paires: 127
Nombre de trans : 403 / 144
Nombre de init. : 25
Vocabulaire :26


# Création du HMM

In [7]:
hmm = HMM(state_list=ctags.keys(), observation_list=vocab,
                 transition_proba = None,
                 observation_proba = None,
                 initial_state_proba = None)

HMM creating with: 
12 states
7991 observations


# Apprentissage pas à pas 

In [8]:
hmm.observation_estimation(cpairs)
print hmm.observation_proba.sum(axis=0)


[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]


In [9]:
hmm.transition_estimation(ctrans)
print hmm.transition_proba.sum(axis=0)

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]


In [10]:
hmm.init_estimation(cinits)
print sum(hmm.initial_state_proba)

1.0


# Apprentissage en une fois

In [11]:
hmm = HMM(state_list=ctags.keys(), observation_list=vocab,
                 transition_proba = None,
                 observation_proba = None,
                 initial_state_proba = None,
                 smoothing_obs = 0.001)
hmm.supervised_training(cpairs,ctrans,cinits)

HMM creating with: 
12 states
7991 observations


In [None]:

'''
(u'Some', u'DET'), (u'ten', u'NUM'), (u'years', u'NOUN'), (u'ago', u'ADV'), (u'that', u'DET'),
(u'page', u'NOUN'), (u'was', u'VERB'), (u'torn', u'VERB'), (u'out', u'PRT'), (u',', u'.'), 
(u'I', u'PRON'), (u"don't", u'VERB'), (u'know', u'VERB'), (u'by', u'ADP'), (u'whom', u'PRON'), (u'.', u'.')]
'''


'''
Projet : variable n est plus mot mais caractere
observer un mot => sequence d etats la version correcte
Xik = convluwion
Yik = conclusion

pour que ce soit plus dur => modele de second ordre

donnees (bon char, mauvais char)

 deux version : 10 : 10% d erreurs
                 20 : 20%  d erreurs
                 
3 points : hmm et biterdi : taux d erreurs du modele

modele markov ordre deux : => proba  d un etat depend des deux etats precedents P(Yt|Yt-1,Yt2)

dans viberti : on propage (dans delta) la proba du meilleur chemin pour arriver a un instant t dans un etat donne
=> quand rempli t on peut remplir t+1 (par hypothese) markov d ordre 1 => donc a ordre 2 il faut faire changement de varibale
besoin de garder la trace de tout ce qui est contionnement (Yt garde trace de ce qui est en t-1 et t-2 )
taille devient s² => delta de taille (K*N^2)

3 novembre : finir partie 1
'''