In [1]:
import numpy as np
import nltk
import sys
import operator

In [2]:
UNK = "<unk>"  # token to map all out-of-vocabulary words (OOVs)
UNKid = 0      # index for UNK
epsilon=1e-100
special = "<s>"
smoothing_trans = 1e-2
smoothing_trans_2 = 1e-5

In [3]:
class HMM:
        def __init__(self, state_list, observation_list,
                 transition_proba = None,
                 observation_proba = None,
                 initial_state_proba = None,
                    transition_proba_2 = None,smoothing_obs = 0.01):
            print "HMM creating with: "
            self.N = len(state_list)       # number of states
            self.M = len(observation_list) # number of possible emissions
            
            #les differents labels
            self.omega_Y = state_list
            #les differents mots
            self.omega_X = observation_list
            #caractere special
            #symbolise le debut d'un mot
            self.omega_Y.append("@")
            
            #creation des transitions
            if transition_proba is None:
                self.transition_proba = np.zeros( (self.N+1, self.N+1,self.N), float) 
            else:
                self.transition_proba=transition_proba
            
            #creation des observations
            if observation_proba is None:
                self.observation_proba = np.zeros( (self.M, self.N), float) 
            else:
                self.observation_proba=observation_proba
            self.make_indexes() # build indexes, i.e the mapping between token and int
            self.smoothing_obs = smoothing_obs 
            
        def make_indexes(self):
            """Creates the reverse table that maps states/observations names
            to their index in the probabilities array"""
            #index des labels 
            self.Y_index = {}
            for i in range(self.N+1):
                self.Y_index[self.omega_Y[i]] =i
                
            #index des lettres
            self.X_index = {}
            for i in range(self.M):
                self.X_index[self.omega_X[i]] = i
      
        def get_observationIndices( self, observations ):
            """return observation indices, i.e 
            return [self.O_index[o] for o in observations]
            and deals with OOVs
            """
            indices = zeros( len(observations), int )
            k = 0
            for o in observations:
                if o in self.X_index:
                    indices[k] = self.X_index[o]
                else:
                    indices[k] = UNKid
                k += 1
            return indices

    
        def data2indices(self, sent): 
            #associe a chaque caractere et label un unique identifiant
            #par exemple : 'a' : 1
            wordids = list()
            tagids  = list()
            for couple in sent:
                wrd = couple[0]
                tag = couple[1]
                if wrd in self.X_index:
                    wordids.append(self.X_index[wrd])
                else:
                    wordids.append(UNKid)
                tagids.append(self.Y_index[tag])
            return wordids,tagids
            
        def observation_estimation(self, pair_counts):
            # calcul la probabilite d'observer une classe pour un caractere
            #par exemple la classe 'y' pour le caractere 'c'
            for pair in pair_counts:
                wrd=pair[0]
                tag=pair[1]
                cpt=pair_counts[pair]
                k = 0 # for <unk>
                if wrd in self.X_index: 
                    k=self.X_index[wrd]
                i = self.Y_index[tag]
                self.observation_proba[k,i]=cpt
            # normalize
            self.observation_proba=self.observation_proba+self.smoothing_obs
            self.observation_proba=self.observation_proba/self.observation_proba.sum(axis=0).reshape(1,self.N)
            
        
        def transition_estimation(self, trans_counts,trans_count_bi):
            #calcule la probabilite de passer dans un etat en conaissant les deux derniers etats
            #par exemple (a,b)=> b
            for pair in trans_counts:
                #l'etat courant a t
                i=self.Y_index[pair[1]]
                #l'etat a t-2
                j=self.Y_index[pair[0][0]]
                #l'etat a t-1
                k=self.Y_index[pair[0][1]]
                #normalisation
                self.transition_proba[j,k,i]=float(trans_counts[pair])/float(trans_count_bi[pair[0]])

        def init_estimation(self, init_counts,init_counts2):
            #comptes d'initialisation
            
            #indice du caractere special
            indice_empty = self.Y_index["@"]
            for init in init_counts:
                i=self.Y_index[init]
                #cas du tout premier caractere des mots, comme pas de passé
                #on note le passe (@,@)
                self.transition_proba[indice_empty,indice_empty,i]=float(init_counts[init])/float(sum(init_counts.values()))
            
            #cas du second caractere des mots
            for p in init_counts2:
                old=self.Y_index[p[0]]
                new=self.Y_index[p[1]]
                #cas du tout premier caractere des mots, comme un seul passé
                #on note le passe (@,@)
                self.transition_proba[indice_empty,old,new]=float(init_counts2[p])/float(init_counts[p[0]])
        
        def supervised_training(self, cpairs, ctrans, ctrans2 ,cinits, cinits2):
            """ Train the HMM's parameters. This function wraps everything"""
            self.observation_estimation(cpairs)
            self.transition_estimation(ctrans,ctrans2)
            self.init_estimation(cinits,cinits2)
            #indices pour eviter de calculer des transitions a zeros
            self.solo = np.zeros( (self.N,), float ) 
            for tags in ctags:
                self.solo[self.Y_index[tags]] = ctags[tags]
            self.solo=self.solo/sum(self.solo)
            print(self.solo)
            #contient pour chaque position dans alpha_2 les deux caracteres associees
            #par exemple : (a,b) : 2
            self.Y_index_2 = {}
            count = 0 
            for e in self.omega_Y:
                for z in self.omega_Y:
                    self.Y_index_2[(e,z)]= count
                    count +=1
                
        def get_trans(self,k,indi_2):
            #indi_2 : vers nouvel etat
            # k :ancien etat
            return  self.transition_proba[k,indi_2]
    
        def find_indices(self,k):
            #renvoie les indices a parcourir en fonction de la position dans le mot
            #si premier ou deuxieme caracter: le passe est limite et on renvoie @
            #sinon le passe peut etre l'ensemble des 26 labels
            if k == 0 or k==-1:
                return set(['@'])
            return self.omega_Y[0:26]
        
        
        def viverbit(self,mots):
            #algo de viterbi
            if len(mots) <=1:
                #traite le mot est compose d un seul caractere
                return 0,mots
            #contient le chemin qui a permis d'arriver dans chaque etat
            #au debut, pas de passé donc seul moyen d'arriver dans un etat est de passer
            #par les etats @ et @
            path_to = {('@','@') : []}
            #contient les probabs associes a chaque etat pour chaque mot
            alpha_2 = np.zeros((self.N+1,self.N+1,len(mots)+1))
            '''for i in range(self.N+1):
                alpha_2[:,i,0] = np.ones(self.N+1)'''
            #intialisation pour le tout premier caracteres (@)
            alpha_2[:,:,0] = np.ones((self.N+1,self.N+1))
            #pour chaque lettre (+1) car on commence par @
            for j in range(1,len(mots)+1):
                temp_path = {}
                index = 0
                #cas qui correspond a juste avant la premiere lettre
                if j==0:
                    index = self.X_index['@']
                else:
                    index = self.X_index[mots[j-1]]
                
                #trouve les caracteres possibles pour la position (j-1) : (-1) a cause du premier caractere @
                indices = self.find_indices(j-1)
                for i in indices:
                    corr_indices = self.find_indices(j)
                    indice_i = self.Y_index[i]
                    
                    #boucle sur tous les etats possibles pour le caractere i
                    for co in corr_indices:
                        indice_j= self.Y_index[co]
                        valeur_max = -10000000
                        back = 0
                        #pour chaque etat, calcule la probabilite en fonction des deux derniers etat
                        for t in self.find_indices(j-2):
                            tmp = alpha_2[self.Y_index[t],self.Y_index[i],j-1] * self.transition_proba[self.Y_index[t],indice_i,indice_j] * self.observation_proba[index,indice_j]
                            if tmp > valeur_max:
                                #calcule de la valeur max
                                valeur_max = tmp
                                back = t
                        #sauvegarde du max et du chemin maximisant pour arriver dans cet etat
                        alpha_2[self.Y_index[i],self.Y_index[co],j] = valeur_max
                        temp_path[i,co] = path_to[back,i] + [co]
                path_to = temp_path
            #backtrack
            #recuperation de la proba maximale
            proba = np.max(alpha_2[:,:,len(mots)])
            #recuperation du couple associe a la valeur maximale
            chemin_max = np.argmax(alpha_2[:,:,len(mots)])
            #recuperation du chemin
            i,co = self.Y_index_2.keys()[self.Y_index_2.values().index(chemin_max)]
            return proba,path_to[i,co]
                 
        def evaluate(self,test_data):
            #fonction d'evaluation
            #calcul differentes valeurs comme le taux de correction
            #le taux d'erreurs...
            
            #taux d'erreur brut
            errors = 0
            #nombre total de caracteres
            total = 0
            #taux d'erreur sur l'ensemble de test
            erreur_2 = 0
            
            #nombre de correction
            correction = 0
            correction_totale = 0
            for i in range(len(test_data)):
                if i%100==0:
                    print("Remaining : {}".format(len(test_data)-i))
                p,res = self.viverbit(map(operator.itemgetter(0), test_data[i]))
                for a,b in zip(res,test_data[i]):
                    if(b[1]!=b[0] and b[1] ==a):
                        correction +=1
                    correction_totale +=1
                erreur_2 += sum([a!=b for a,b in test_data[i]])
                errors += sum([a!=b[1] for a,b in zip(res,test_data[i])])
                total += len(res)
            print("################# Resultats du HMM d'ordre 1 #################")
            print("Percentage of errors : {0:.2f}%".format(((errors/float(total))*100.0)))
            print("Taux d erreur brut : {0:.2f}%".format(((erreur_2/float(total))*100.0)))
            print("Taux de correction : {0:.2f}%".format(((correction/float(correction_totale))*100.0)))
            print("Nombre de corrections {} vs nombre de bonnes corrections {}".format(correction_totale,correction))
            

In [4]:
def make_counts(corpus):
    """ 
    Build different count tables to train a HMM. Each count table is a dictionnary. 
    Returns: 
    * c_words: word counts
    * c_tags: tag counts
    * c_pairs: count of pairs (word,tag)
    * c_transitions: count of tag bigram 
    * c_inits: count of tag found in the first position
    """
    c_words = dict()
    c_tags = dict()
    c_pairs= dict()
    c_transitions = dict()
    c_transitions2 = dict()
    c_inits = dict()
    c_inits2 = dict()
    for sent in corpus:
        # we use i because of the transition counts
        for i in range(len(sent)):
            couple=sent[i]
            wrd = couple[0]
            tag = couple[1]
            # word counts
            if wrd in c_words:
                c_words[wrd]=c_words[wrd]+1
            else:
                c_words[wrd]=1
            # tag counts
            if tag in c_tags:
                c_tags[tag]=c_tags[tag]+1
            else:
                c_tags[tag]=1
            # observation counts
            if couple in c_pairs:
                c_pairs[couple]=c_pairs[couple]+1
            else:
                c_pairs[couple]=1
            # i >  0 -> transition counts
            if i > 0:
                trans = (sent[i-1][1],tag)
                if trans in c_transitions2:
                    c_transitions2[trans]=c_transitions2[trans]+1
                else:
                    c_transitions2[trans]=1
            if i > 1:
                trans = ((sent[i-2][1],sent[i-1][1]),tag)
                if trans in c_transitions:
                    c_transitions[trans]=c_transitions[trans]+1
                else:
                    c_transitions[trans]=1
            # i == 0 -> counts for initial states
            if i==0:
                if tag in c_inits:
                    c_inits[tag]=c_inits[tag]+1
                else:
                    c_inits[tag]=1
            if i == 1:
                cle = (sent[i-1][1],tag)
                if cle in c_inits2:
                    c_inits2[cle] +=1
                else:
                    c_inits2[cle]=1
                    
    return c_words,c_tags,c_pairs, c_transitions, c_inits, c_transitions2,c_inits2
def make_vocab(c_words, threshold):
    """ 
    return a vocabulary by thresholding word counts. 
    inputs: 
    * c_words : a dictionnary that maps word to its counts
    * threshold: count must be >= to the threshold to be included
    
    returns: 
    * a word list
    """
    voc = list()
    for w in c_words:
        if c_words[w] >= threshold:
            voc.append(w)
    return voc
import cPickle
from pprint import pprint

with open("typos-data/test10.pkl", "rb") as input_file:
    test = cPickle.load(input_file)
    
with open("typos-data/train20.pkl", "rb") as input_file:
    train = cPickle.load(input_file)
print "Nombre de phrases de train = "+str(len(train))
print "Nombre de phrases de test  = "+str(len(test))
cwords,ctags,cpairs,ctrans,cinits, ctrans2,cinits2 = make_counts(train)
print "Nombre de mots  : "+str(len(cwords))
print "Nombre de tags  : "+str(len(ctags))
print "Nombre de paires: "+str(len(cpairs))
print "Nombre de trans : "+str(len(ctrans))+ " / "+ str(12*12)
print "Nombre de init. : "+str(len(cinits))
vocab = make_vocab(cwords,10)
print "Vocabulaire :"+str(len(vocab))

Nombre de phrases de train = 27184
Nombre de phrases de test  = 1501
Nombre de mots  : 26
Nombre de tags  : 26
Nombre de paires: 128
Nombre de trans : 2464 / 144
Nombre de init. : 25
Vocabulaire :26


In [5]:
print("Creation du hmm")
hmm = HMM(state_list=ctags.keys(), observation_list=cwords.keys(),
                 transition_proba = None,
                 observation_proba = None,
                 initial_state_proba = None,
                 smoothing_obs = 0.001)
hmm.supervised_training( cpairs, ctrans, ctrans2 ,cinits, cinits2)

Creation du hmm
HMM creating with: 
[ 0.07384321  0.03371526  0.0145519   0.12630328  0.03187665  0.01905125
  0.02319932  0.07621247  0.04683214  0.00417797  0.00073993  0.02648041
  0.04457499  0.08374627  0.06842455  0.00106878  0.02274341  0.06780421
  0.05777409  0.027751    0.09666136  0.01554594  0.01339342  0.02071048
  0.00194324  0.00087446]


In [6]:
#calcul du taux d'erreur
hmm.evaluate(test)

Remaining : 1501
Remaining : 1401
Remaining : 1301
Remaining : 1201
Remaining : 1101
Remaining : 1001
Remaining : 901
Remaining : 801
Remaining : 701
Remaining : 601
Remaining : 501
Remaining : 401
Remaining : 301
Remaining : 201
Remaining : 101
Remaining : 1
################# Resultats du HMM d'ordre 1 #################
Percentage of errors : 4.25%
Taux d erreur brut : 10.18%
Taux de correction : 7.70%
Nombre de corrections 7320 vs nombre de bonnes corrections 564


In [7]:
print("Exemple d'appel de viterbi pour les 10 premieres donnees de test : ")
print("###############################################################")
for i in range(10):
    entree = "".join([a[0] for a in test[i]])
    proba,solution = hmm.viverbit(entree)
    solution = "".join(solution)
    print("{} ------------------> {} & {}".format(entree,solution,proba))

Exemple d'appel de viterbi pour les 10 premieres donnees de test : 
###############################################################
the ------------------> the & 0.0371518198204
leftist ------------------> leftist & 1.93716052036e-06
is ------------------> is & 0.0119598510127
too ------------------> too & 0.000663231296692
far ------------------> far & 0.000513967787936
gone ------------------> gone & 6.1260238484e-06
for ------------------> for & 0.00777076665394
that ------------------> that & 0.00377257851937
his ------------------> his & 0.000965235466778
reeljhgs ------------------> reelings & 5.21631519622e-12
