![UNIR](https://www.unir.net/wp-content/uploads/2014/10/logo.png)
### Máster en Inteligencia Artificial. 
**Asignatura:** _Procesamiento del Lenguaje Natural_

**Alumnos:** _Luisa Sánchez, Laia Garriga, Sergio Merino, Miguel Á. de Frutos_

**Fecha:** _26 Abril 2020_

---

# ACTIVIDAD 1: Etiquetado morfosintáctico. 

**Objetivo:** Realizar el etiquetado morfosintáctico de una oración aplicando la teoría de los modelos ocultos de Markov.
___

# Definición del corpus

In [1]:
import sys
import numpy as np
import pandas as pd
import operator as op

In [2]:
words_dict = dict()        # Contiene todas las palabras y el numero de veces que aparecen
tags_dict = dict()         # Contiene las etiquetas y el numero de ocurrencias de cada una
lineList = []              # Cada una de las lineas del corpus
transitions_dict = dict()  # Contiene todas las transiciones del corpus y su numero de ocurrencias
words_tags_dict = dict()   # Contiene todas las combinaciones de palabra + etiqueta y su numero de ocurrencias

In [3]:
# Leemos el archivo
corpus = 'mia07_t3_ag_Corpus-tagged.txt'
corpusFile = open(corpus, 'r', encoding="utf-8")

for line in corpusFile:
    if line.find('<doc') and line.find('</doc>'):
        lineList.append(line.split())
corpusFile.close()

# Parte 1: Etiquetador Morfosintáctico

In [4]:
# Sacamos las etiquetas y las palabras que aparecen
tags_dict['<S>'] = 1

for words in lineList:
    if not words:
        prevTag = '<S>'
        
    else: 
        tag = words[2]
        word = words[0].lower()
        
        # Etiquetas
        if(tag in tags_dict):
            # Por cada punto y final hay un inicio de frase
            if('Fp' == tag ): 
                tags_dict['<S>'] = tags_dict['<S>'] + 1
            tags_dict[tag] = tags_dict[tag] + 1
        else:
            tags_dict[tag] = 1
            
        # Palabras
        if(word in words_dict):
            words_dict[word] = words_dict[word] + 1
        else:
            words_dict[word] = 1
      
        # Recuento de las transiciones: Cada vez que una etiqueta determinada es precedida de otra
        transition = tag + '|' + prevTag
        if(transition in transitions_dict):
            transitions_dict[transition] = transitions_dict[transition]+1
        else:
            transitions_dict[transition] = 1
        
        # Recuento de las observaciones: Cada vez que estando en una etiqueta observo una palabra en concreto
        observation = word + '|' + tag
        if(observation in words_tags_dict):
            words_tags_dict[observation] = words_tags_dict[observation] + 1
        else:
            words_tags_dict[observation] = 1
        
        prevTag = tag      

In [5]:
print ('Etiquetas: ' , len(tags_dict) , 'en total')
print ('Palabras: ' , len(words_dict) , 'distintas')
print ('Transiciones: ' , len(transitions_dict) , 'en total')
print ('Palabras etiquetadas de manera única: ' , len(words_tags_dict) , 'en total')

Etiquetas:  69 en total
Palabras:  235 distintas
Transiciones:  288 en total
Palabras etiquetadas de manera única:  242 en total


### Matriz de Transición

In [6]:
# P{etiqueta_i+1 | etiqueta_i}

transition_prob_matrix = {}
# Calculamos la matriz de transicion
for key in transitions_dict:
    tags_counter = tags_dict[key.split('|')[1]]
    prob = transitions_dict[key] / tags_counter
    transition_prob_matrix[key] = prob

print('Hay ', len(transition_prob_matrix), 'valores en la matriz que no son 0')

Hay  288 valores en la matriz que no son 0


In [7]:
# Visualizacion de la Matriz de Transicion (DataFrame)

transition_matrix_df = pd.DataFrame(columns = ['Tag'])         

for tag_prev in tags_dict:
    row = {'Tag': tag_prev}
    for tag in tags_dict:
        key = tag + '|' + tag_prev
        if key in transition_prob_matrix:
            row[tag] = transition_prob_matrix[key]
        else:
            row[tag] = 0
    transition_matrix_df = transition_matrix_df.append(row, ignore_index = True)
    
transition_matrix_df = transition_matrix_df.fillna(0)
transition_matrix_df.head(10)    

Unnamed: 0,Tag,<S>,AO0MS0,AQ0CS0,AQ0FS0,AQ0MP0,AQ0MS0,CC,CS,DA0FS0,...,VMP00SM,VMSI1S0,VMSP1S0,VSII1S0,VSIP3S0,VSIS3S0,VSN0000,VSSP1S0,Z,Zu
0,<S>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0
1,NP00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.0
2,VSIP3S0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DI0FS0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,NCFS000,0.0,0.0,0.0625,0.0625,0.0,0.0,0.0625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,SPS00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.102941,...,0.0,0.0,0.0,0.0,0.0,0.0,0.014706,0.0,0.014706,0.0
6,DA0MS0,0.0,0.0,0.04,0.0,0.0,0.04,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,NCMS000,0.0,0.0,0.108108,0.0,0.0,0.027027,0.054054,0.0,0.0,...,0.027027,0.0,0.027027,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,AQ0MS0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,...,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,VMP00SM,0.0,0.0,0.0,0.0,0.0,0.111111,0.222222,0.111111,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0


### Matriz de Emisión

In [8]:
# P{Palabra | etiqueta}

emision_prob_matrix = {}
# Calculamos la matriz de emision/ observacion
for key in words_tags_dict:
    words_counter = tags_dict[key.split('|')[1]]
    prob = words_tags_dict[key] / words_counter
    emision_prob_matrix[key] = prob

#print(emision_prob_matrix)
print('Hay ', len(emision_prob_matrix), 'observaciones que no son 0')

Hay  242 observaciones que no son 0


In [9]:
# Visualizacion de la Matriz de Emision/Observacion (DataFrame)

emision_matrix_df = pd.DataFrame(columns = ['Tag'])     

for tag in tags_dict:
    row = {'Tag': tag}
    for word in words_dict:
        key = word + '|' + tag
        if key in emision_prob_matrix:
            row[word] = emision_prob_matrix[key]
        else:
            row[word] = 0

    emision_matrix_df = emision_matrix_df.append(row, ignore_index = True)

emision_matrix_df = emision_matrix_df.fillna(0)
emision_matrix_df.head(10)  

Unnamed: 0,Tag,"""",(,),",",.,1970,2,;,a,...,un,una,veces,venir,viaje,virus,voz,y,young,él
0,<S>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,NP00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0
2,VSIP3S0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DI0FS0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,NCFS000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0
5,SPS00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.205882,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,DA0MS0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,NCMS000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.027027,0.0,0.0,0.0,0.0,0.0
8,AQ0MS0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,VMP00SM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Parte 2: Etiquetado morfosintáctico de una oración

In [10]:

def max_prev_column(prev_column, current_tag):
    '''
    Devuelve el el par {etiqueta, valor} con valor maximo de la columna anterior prev_column
    
    Parameters
    ----------
    prev_column : dict
        Columna anterior P{tag|value}
    current_tag : str
        Etiqueta en la que me encuentro
    '''
    if not prev_column:
        prev_column['<S>'] = 1
    
    values ={}
    for tag in prev_column:
        key =   current_tag + '|' + tag
        prob = 0
        if key in transition_prob_matrix:
            prob = transition_prob_matrix[key]  # P = {current_tag|tag}
            
        v = prev_column[tag]
        values[tag] = v * prob
        result = max(values.items(), key=op.itemgetter(1))

    return result[0], result[1]


def valid_tag_for_path (viterbi, path, current_word):
    '''
    Evita asignar etiquetas erróneas a la palabra anterior cuando la probabilidad es 0. 
    Comprueba si hay valor viterbi para una palabra que no se haya almacenado o no hay valor para una palabra nueva
    
    Parameters
    ----------
    viterbi : double
        Valor viterbi 
    path : str
        Ruta almacenada
    current_word : str
        Palabra actual
    '''
    return (viterbi and not current_word in path) or (not viterbi and not current_word in path)


def get_viterbi_matrix(words_sentence):
    '''
    Calula la matriz Viterbi y devuelve la mejor ruta para una frase
    
    Parameters
    ----------
    words_sentence : List[str]
        Frase (string) donde aplicar el etiquetado 
    '''
    # Matriz con los valores Viterbi para cada palabra en la frase
    viterbi_matrix = {} 
    # Valores Viterbi de la columna anterior
    viterbi_prev_col = {}
    # Ruta mas probable correspondiente al analisis morfosintactico de la frase
    path = {}
    lastTag = ''
    for word in words_sentence:
        col_values = {}
        for tag in tags_dict:
            tag_max_value, v_max_previous = max_prev_column(viterbi_prev_col, tag)
            key = word + '|' + tag
            v = 0
            # Si he encontrado la palabra en la matriz
            if key in emision_prob_matrix:
                v = v_max_previous * emision_prob_matrix[key]
                
                if valid_tag_for_path (v, path, word):
                    path[word] = tag_max_value
                    lastTag = tag
                
            col_values[tag] = v

        # Guardo los valores de la columna anterior
        viterbi_prev_col = col_values 
        viterbi_matrix[word] = viterbi_prev_col
        
    # Etiqueta correspondiente a la ultima columna    
    path[''] = lastTag
    
    return viterbi_matrix, path
            

In [11]:
def split_frase(frase_param):
    vect_frase = []
    linea = frase_param.split(" ")
    i=0
    for palabra in linea:
        punto_final = False
        if palabra.find(".")>0:
            punto_final = True
            palabra = palabra.replace(".",'')
        if len(palabra)>0:
            vect_frase.append(palabra.lower())
        if punto_final == True:
            vect_frase.append(".")
    return(vect_frase)

In [12]:
sentence = "Habla con el enfermo grave de trasplantes."

wordsSentence = split_frase(sentence)
print (wordsSentence)

['habla', 'con', 'el', 'enfermo', 'grave', 'de', 'trasplantes', '.']


### Matriz Viterbi

In [17]:
def get_df_viterbi_matrix(sentence_split):
    '''
    Devuelve la matriz Viterbi asociada a una frase 
    
    Parameters
    ----------
    sentence_tokenized : List[str]
        Frase previamente tokenizada donde cada palabra compone un elemento en una lista
    '''
    m_viterbi, best_path = get_viterbi_matrix(sentence_split)
    viterbi_df = pd.DataFrame(m_viterbi, columns = sentence_split)    

    # Me quedo solo con las filas que me interesan
    # viterbi_df = viterbi_df.loc[(viterbi_df != 0.0).any(axis=1)] 
    return viterbi_df
    
df_viterbi = get_df_viterbi_matrix(wordsSentence)
df_viterbi.head(10)

Unnamed: 0,habla,con,el,enfermo,grave,de,trasplantes,.
<S>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NP00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
VSIP3S0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DI0FS0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCFS000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SPS00,0.0,0.000177,0.0,0.0,0.0,2.500849e-09,0.0,0.0
DA0MS0,0.0,0.0,4.2e-05,0.0,0.0,0.0,0.0,0.0
NCMS000,0.0,0.0,0.0,2.020475e-06,0.0,0.0,0.0,0.0
AQ0MS0,0.0,0.0,0.0,2.373256e-07,0.0,0.0,0.0,0.0
VMP00SM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Ruta más probable

In [18]:
m_viterbi, best_path = get_viterbi_matrix(wordsSentence)

def str_best_path(viterbi_best_path):
    '''
    Devuelve un string formateado con la ruta Viterbi
    
    viterbi_best_path : str
        Ruta Viterbi
    '''
    prev_word = ''
    sentence_tagged = ''
    for word in viterbi_best_path:
    #     print(best_path[word] + prev_word)
    #     prev_word =  '  -->  ' + word
        sentence_tagged = sentence_tagged + prev_word + '/' + viterbi_best_path[word] + '/ '
        prev_word = word
        
    return sentence_tagged


print (str_best_path(best_path))

/<S>/ habla/VMIP3S0/ con/SPS00/ el/DA0MS0/ enfermo/NCMS000/ grave/AQ0CS0/ de/SPS00/ trasplantes/NCMP000/ ./Fp/ 


# Parte 3: Tests
Vamos a a poner a prueba nuestro etiquetador morfosintáctico. Para ello vamos a cambiar el orden de las palabras en la frase manteniendo su consistencia contextual. 

### 3.1 Frase: _El enfermo grave habla de trasplantes._
Vemos que el algoritmo falla no se genera la matriz de viterbi porque no existe probabilidad de transición del estado inicial al artículo determinante

In [19]:
s1 = "El enfermo grave habla de trasplantes."
s_tokenized = split_frase(s1)
m_viterbi_test, best_path_test = get_viterbi_matrix(s_tokenized)

print (s_tokenized)

# Ruta mas probable
print (str_best_path(best_path_test))

# Visualizacion de la matriz
df_viterbi_test = get_df_viterbi_matrix(s_tokenized)

df_viterbi_test.head(10)

['el', 'enfermo', 'grave', 'habla', 'de', 'trasplantes', '.']
/<S>/ el/<S>/ enfermo/<S>/ grave/<S>/ habla/<S>/ de/<S>/ trasplantes/<S>/ ./Fp/ 


Unnamed: 0,el,enfermo,grave,habla,de,trasplantes,.
<S>,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NP00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
VSIP3S0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DI0FS0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCFS000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SPS00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DA0MS0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCMS000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AQ0MS0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
VMP00SM,0.0,0.0,0.0,0.0,0.0,0.0,0.0
