In [67]:
import pandas as pd
import random
import spacy
from spacy.util import minibatch
from spacy.training import Example
from spacy.scorer import Scorer

In [68]:
data = pd.read_csv('../data/csv/ner_dataset.csv', encoding= 'unicode_escape')

In [69]:
data_fillna = data.ffill(axis=0)

In [70]:
data_group = (
    data_fillna
      .groupby('Sentence #', as_index=False)[['Word', 'POS', 'Tag']]
      .agg(list)
)


In [71]:
data_group['Phase'] = data_group['Word'].apply(lambda x: ' '.join(x))

In [72]:
def remove_dublespaces(text, char):
    if f' {char} ' in text:
        text = text.replace(f' {char} ', f'{char} ')

    return text

In [None]:
list_all_pos = data['POS'].unique()
pos_chars_punctuation = [item for item in list_all_pos.tolist() if item not in pos_chars_punctuation]

for idx, row in data_group['Phase'].items():
    for char in pos_chars_punctuation:
        data_group.at[idx, 'Phase'] = remove_dublespaces(row, char)

In [75]:
data_group['Position'] = [[] for _ in range(len(data_group))]

for idx, row in data_group.iterrows():
    positions = []
    for idxW, word in enumerate(row['Word']):
        try:
            index = row['Phase'].index(word)
            lenWord = len(word)
            positions.append((index, index + lenWord, row['POS'][idxW]))
        except ValueError:
            # Palavra não encontrada na Phase
            pass
    data_group.at[idx, 'Position'] = positions

In [None]:
for idx, row in data_group.iterrows():
    items = row['Position']
    frase = list(row['Phase'])
    for item in items:
        if item[2] in pos_chars_punctuation:
            index = item[0]
            char = item[2]
            frase[index] = char
            row['Position'] = frase

In [None]:
def achar_posicoes(row):
    frase = row['Phase']
    palavras = row['Word'] if isinstance(row['Word'], list) else [row['Word']]
    pos_tags = row['POS'] if isinstance(row['POS'], list) else [row['POS']]

    posicoes = []
    start = 0  # garante busca progressiva para lidar com palavras repetidas
    for w, pos in zip(palavras, pos_tags):
        i = frase.find(w, start)
        if i == -1:  # palavra não encontrada
            posicoes.append((None, None, pos))
        else:
            posicoes.append((i, len(w) + i, pos))
            start = i + len(w)
    return posicoes

data_group['Position'] = data_group.apply(achar_posicoes, axis=1)


In [23]:
list_finished_dataset = []

for idx, row in data_group.iterrows():
    list_finished_dataset.append((row["Phase"], {"entities": row["Position"]}))

In [32]:
cut = int(len(list_finished_dataset) * 0.1)

DEV_DATA = list_finished_dataset[:cut]
TRAIN_DATA = list_finished_dataset[cut:]

In [None]:
TRAIN_DATA = [item for item in TRAIN_DATA if '\x85' not in item[0]]
DEV_DATA = [item for item in DEV_DATA if '\x85' not in item[0] and '\x94' not in item[0]]

In [None]:
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

for _, ann in TRAIN_DATA + DEV_DATA:
    for start, end, label in ann.get("entities", []):
        ner.add_label(label)

def make_examples(nlp, data):
    examples = []
    for text, ann in data:
        doc = nlp.make_doc(text)
        spans = []
        for start, end, label in ann["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")

            if span is None:
                raise ValueError(f"Span inválido em: {text!r} -> {(start, end, label)}")
            
            spans.append(span)
            
        doc_ents = {"entities": [(s.start_char, s.end_char, s.label_) for s in spans]}
        examples.append(Example.from_dict(doc, doc_ents))
    return examples

train_examples = make_examples(nlp, TRAIN_DATA)
dev_examples   = make_examples(nlp, DEV_DATA)

In [None]:
other_pipes = [p for p in nlp.pipe_names if p != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    n_iter = 2
    for itn in range(1, n_iter + 1):
        random.shuffle(train_examples)
        losses = {}
        for batch in minibatch(train_examples, size=4):
            nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
        if itn % 5 == 0 or itn == 1 or itn == n_iter:
            print(f"Iter {itn:02d} - loss: {losses.get('ner', 0):.4f}")

Iter 01 - loss: 76027.9281
Iter 02 - loss: 42598.9292


In [None]:
def evaluate(nlp, examples):
    pred_docs = list(nlp.pipe([ex.text for ex in examples]))
    pred_examples = [Example(pred, ex.reference) for pred, ex in zip(pred_docs, examples)]

    scorer = Scorer()
    scores = scorer.score(pred_examples)
    return {
        "precision": scores["ents_p"],
        "recall":    scores["ents_r"],
        "f1":        scores["ents_f"],
    }

metrics = evaluate(nlp, dev_examples)
print("DEV metrics:", metrics)

DEV metrics: {'precision': 0.976585440262365, 'recall': 0.9762875986428272, 'f1': 0.9764364967400008}


In [77]:
tests = [
    "Marcos works on Apple, in São Paulo.",
    "the Embraer is locaded in Brasil.",
    "Ana fly to new york.",
]
for t in tests:
    doc = nlp(t)
    print("\nTexto:", t)
    for ent in doc.ents:
        print(f" - {ent.text:<20} {ent.label_}")



Texto: Marcos works on Apple, in São Paulo.
 - Marcos               NNP
 - works                VBZ
 - on                   IN
 - Apple                NNP
 - ,                    ,
 - in                   IN
 - São                  NNP
 - Paulo                NNP
 - .                    .

Texto: the Embraer is locaded in Brasil.
 - the                  DT
 - Embraer              NNP
 - is                   VBZ
 - locaded              VBN
 - in                   IN
 - Brasil               NNP
 - .                    .

Texto: Ana fly to new york.
 - Ana                  JJ
 - fly                  NN
 - to                   TO
 - new                  JJ
 - york                 NN
 - .                    .
