#Chile - Universidad Adolfo Ibáñez (UAI)
##Curso Analítica Textual
### Named-Entity Recognition

# Modelo Pre-Entrenado

In [1]:
#Importar Librerías
import spacy
from pprint import pprint

In [2]:
#Cargar Texto
text = "Orlando Bloom sorprendió con radical cambio de look: estaría en campaña para agrandar su familia con Katy Perry"

In [3]:
#Descargar Pipeline (Spanish)
!python -m spacy download es_core_news_sm -q

2022-08-19 09:59:45.052343: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[K     |████████████████████████████████| 12.9 MB 26.2 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')


In [4]:
#Cargar Pipeline
nlp = spacy.load("es_core_news_sm")

In [5]:
#Crear Doc Object
doc = nlp(text)

In [6]:
#Identificación de Entidades
print("Original Text:", doc)
for ent in doc.ents:
  print(ent.text,"---", ent.label_)

Original Text: Orlando Bloom sorprendió con radical cambio de look: estaría en campaña para agrandar su familia con Katy Perry
Orlando Bloom --- PER
Katy Perry --- PER


In [7]:
#Visualización Entidades
from spacy import displacy
displacy.render(doc,style='ent',jupyter=True)

In [8]:
#Función para revisar Entidades
def get_entities(text, pipeline):
  doc = pipeline(text)
  for ent in doc.ents:
    print(ent.text,"---",ent.label_)

In [9]:
#Ejemplo Chileno
text = "Orlando Bloom sorprendió con radical cambio de look: estaría en campaña para agrandar su familia con Katy Perry en Melipilla"
get_entities(text, nlp)

Orlando Bloom --- PER
Katy Perry --- PER


In [10]:
#Ejemplo Chileno
text = "NotCo fue fundada en Melipilla"
get_entities(text, nlp)

NotCo --- MISC


# Evaluación Modelo NER

In [11]:
#Librerías Evaluación
from spacy.training.example import Example
from spacy.scorer import Scorer

In [12]:
#Datos Etiquetados (Train & Test Data)
train_data = [
              ("Mi mamá odia NotCo", {"entities": [(13, 18, "ORG")]}),
              ("Katy Perry ama NotCo.", {"entities": [(0, 10, "PER"), (15, 20, "ORG")]}),
              ("Orlando Bloom compró acciones en NotCo", {"entities": [(0, 13, "PER"), (33,38, "ORG")]}),
              ("Mi hermana nació en Melipilla", {"entities": [(20,29, "GPE")]}),
              ("Katy Perry visitó Melipilla", {"entities": [(0, 10, "PER"), (18,27, "GPE")]}),
              ("Los fundadores de NotCo nacieron en Melipilla", {"entities": [(18, 23, "ORG"), (36,45, "GPE")]}),
              ("No me gustó la comida en Melipilla", {"entities": [(25,34, "GPE")]})
              ]

test_data = [
              ("Orlando Bloom compró una casa en Melipilla", {"entities": [(0, 13, "PER"), (33,42, "GPE")]}),
              ("Mi postulación a NotCo fue rechazada", {"entities": [(17,22, "ORG")]}),
              ("Mi papá fue al colegio en Melipilla", {"entities": [(26,35, "GPE")]}),
              ("El que fue a Melipilla perdió su silla", {"entities": [(13,22, "GPE")]}),
              ("No me gustan los productos de NotCo", {"entities": [(30,35, "ORG")]}),
              ("Anoche hubieron protestas contra NotCo", {"entities": [(33,38, "ORG")]}),
              ]

In [13]:
#Predecir Entidades en Test Set
evaluation_data=list()
for text, annotations in test_data:

    doc_ground_text = nlp.make_doc(text)
    example = Example.from_dict(doc_ground_text, annotations)
    example.predicted = nlp(str(example.predicted))
    evaluation_data.append(example)

In [14]:
#Evaluar performance en Test Set
scorer = Scorer()
evaluation_metrics = scorer.score(evaluation_data)
pprint(evaluation_metrics)

{'cats_auc_per_type': {},
 'cats_f_per_type': {},
 'cats_macro_auc': 0.0,
 'cats_macro_f': 0.0,
 'cats_macro_p': 0.0,
 'cats_macro_r': 0.0,
 'cats_micro_f': 0.0,
 'cats_micro_p': 0.0,
 'cats_micro_r': 0.0,
 'cats_score': 0.0,
 'cats_score_desc': 'macro F',
 'dep_las': None,
 'dep_las_per_type': None,
 'dep_uas': None,
 'ents_f': 0.16666666666666666,
 'ents_p': 0.2,
 'ents_per_type': {'GPE': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                   'MISC': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                   'ORG': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                   'PER': {'f': 0.5, 'p': 0.3333333333333333, 'r': 1.0}},
 'ents_r': 0.14285714285714285,
 'morph_acc': None,
 'morph_micro_f': None,
 'morph_micro_p': None,
 'morph_micro_r': None,
 'morph_per_feat': None,
 'pos_acc': None,
 'sents_f': None,
 'sents_p': None,
 'sents_r': None,
 'tag_acc': None,
 'token_acc': 1.0,
 'token_f': 1.0,
 'token_p': 1.0,
 'token_r': 1.0}


# Modelo Customisado

In [15]:
#Importar Librerías
import random
from spacy.util import minibatch, compounding

In [16]:
#Revisar componentes del Pipeline
nlp.pipe_names

['tok2vec', 'morphologizer', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [17]:
#Deshabilitar componentes que NO vamos a customizar
pipe_exceptions = ["ner"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [18]:
#Customizar NER Model
with nlp.disable_pipes(*unaffected_pipes):

  #Entrenar por 30 Iteraciones
  for iteration in range(30):

    #Reordenar los ejemplos en cada iteración (para no aprender de memoria)
    random.shuffle(train_data)
    losses = {}
    
    #Crear batches de datos usando spaCy's minibatch
    batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
    
    for batch in batches:
        for text, annotations in batch:
          doc = nlp.make_doc(text)
          example = Example.from_dict(doc, annotations)
          #texts, annotations = zip(*batch)
          nlp.update(
                    [example],
                    drop=0.5,  # Dropout - Evitar memorizar datos
                    losses=losses,
                )
        print("Losses", losses)

Losses {'ner': 9.544162696696006}
Losses {'ner': 15.987286836196931}
Losses {'ner': 6.936760158233942}
Losses {'ner': 13.373498690540906}
Losses {'ner': 8.72157538420217}
Losses {'ner': 12.029273285629008}
Losses {'ner': 7.337638143443235}
Losses {'ner': 11.382368785535778}
Losses {'ner': 4.937819459987395}
Losses {'ner': 10.45425054377901}
Losses {'ner': 8.127009186427166}
Losses {'ner': 12.135065719214927}
Losses {'ner': 3.4451482651695566}
Losses {'ner': 4.949965985154719}
Losses {'ner': 4.1574729760436195}
Losses {'ner': 8.683034018247652}
Losses {'ner': 2.212165614706434}
Losses {'ner': 5.908467145041046}
Losses {'ner': 0.8188429569953041}
Losses {'ner': 1.5486858220182862}
Losses {'ner': 1.3467811288208549}
Losses {'ner': 1.4633494823442317}
Losses {'ner': 0.8357898348094146}
Losses {'ner': 1.237483587368347}
Losses {'ner': 0.050521069591258676}
Losses {'ner': 0.0533572684316211}
Losses {'ner': 0.0020070073402552943}
Losses {'ner': 0.0023900672213154808}
Losses {'ner': 1.09384528

In [19]:
#Predecir Entidades en Test Set
evaluation_data_custom=list()
for text, annotations in test_data:

    doc_ground_text = nlp.make_doc(text)
    example = Example.from_dict(doc_ground_text, annotations)
    example.predicted = nlp(str(example.predicted))
    evaluation_data_custom.append(example)

scorer = Scorer()
evaluation_metrics_custom = scorer.score(evaluation_data_custom)
pprint(evaluation_metrics_custom)

{'cats_auc_per_type': {},
 'cats_f_per_type': {},
 'cats_macro_auc': 0.0,
 'cats_macro_f': 0.0,
 'cats_macro_p': 0.0,
 'cats_macro_r': 0.0,
 'cats_micro_f': 0.0,
 'cats_micro_p': 0.0,
 'cats_micro_r': 0.0,
 'cats_score': 0.0,
 'cats_score_desc': 'macro F',
 'dep_las': None,
 'dep_las_per_type': None,
 'dep_uas': None,
 'ents_f': 1.0,
 'ents_p': 1.0,
 'ents_per_type': {'GPE': {'f': 1.0, 'p': 1.0, 'r': 1.0},
                   'ORG': {'f': 1.0, 'p': 1.0, 'r': 1.0},
                   'PER': {'f': 1.0, 'p': 1.0, 'r': 1.0}},
 'ents_r': 1.0,
 'morph_acc': None,
 'morph_micro_f': None,
 'morph_micro_p': None,
 'morph_micro_r': None,
 'morph_per_feat': None,
 'pos_acc': None,
 'sents_f': None,
 'sents_p': None,
 'sents_r': None,
 'tag_acc': None,
 'token_acc': 1.0,
 'token_f': 1.0,
 'token_p': 1.0,
 'token_r': 1.0}


In [20]:
#Ejemplo Chileno
text = "Orlando Bloom compró una casa en Melipilla con la venta de sus acciones en NotCo"
get_entities(text, nlp)

Orlando Bloom --- PER
Melipilla --- GPE
NotCo --- ORG


# Componente basada en Reglas

In [21]:
#Ejemplo Chileno
text = "Katy Perry invirtió en acciones de NotCo y Camel Secure"
get_entities(text, nlp)

Katy Perry --- PER
NotCo --- ORG
Camel Secure --- PER


In [22]:
#Listar Reglas
patterns = [{"label": "ORG", "pattern": [{"LOWER": "camel"}, {"LOWER": "secure"}]}]

In [23]:
#Incorporar EntityRuler
config = {"overwrite_ents": True}
ruler = nlp.add_pipe("entity_ruler", config=config)

In [24]:
#Revisar componentes del Pipeline
nlp.pipe_names

['tok2vec',
 'morphologizer',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [25]:
#Incorporar Patrones al EntityRuler
ruler.add_patterns(patterns)

In [26]:
#Ejemplo Chileno
text = "Katy Perry invirtió en acciones de NotCo y Camel Secure"
get_entities(text, nlp)

Katy Perry --- PER
NotCo --- ORG
Camel Secure --- ORG


# Guardar & Cargar Pipelines

In [27]:
#Guardar Pipeline
from pathlib import Path
output_dir = Path("/content/")
nlp.to_disk(output_dir)
print("Modelo guardado en:", output_dir)

Modelo guardado en: /content


In [28]:
#Cargar Pipeline
nlp_custom = spacy.load(output_dir)

In [29]:
#Ejemplo Chileno
text = "Katy Perry invirtió en acciones de NotCo y Camel Secure"
get_entities(text, nlp_custom)

Katy Perry --- PER
NotCo --- ORG
Camel Secure --- ORG
