In [1]:
import pandas as pd

In [None]:

# Load the token-level dataset (columns used below: 'Sentence #', 'Word', 'POS', 'Tag').
# The file is decoded as Latin-1, which maps every byte to one character and never
# fails. The previous 'unicode_escape' codec also decodes bytes as Latin-1 but in
# addition interprets backslash sequences (e.g. "\n", "\x41") found in the text,
# which can corrupt tokens or raise on a stray backslash.
data = pd.read_csv('ner_dataset.csv', encoding='latin1')

In [None]:
# Preview the first 25 rows of the raw frame.
data.head(25)

In [None]:
# Report the number of non-null values in each column; the printed label is
# the lower-cased column name.
for column in ('Sentence #', 'Word', 'POS', 'Tag'):
    print(f"{column.lower()}: {data[column].count()}")

In [None]:
# Show the column labels of the raw frame.
data.columns

In [None]:
# Distinct values of the 'POS' column, kept for later filtering and displayed
# as the cell output.
list_all_pos = data['POS'].unique()
list_all_pos

In [None]:
# Keep the POS values for which str.isupper() is false (for instance values
# made only of punctuation symbols, which contain no cased character).
pos_chars_unused = []
for tag in list_all_pos.tolist():
    if tag.isupper():
        continue
    pos_chars_unused.append(tag)
pos_chars_unused

In [None]:
# Preview the first 15 rows of the raw frame.
data.head(15)

In [None]:
# Forward-fill missing values in every column; the result is a new frame and
# `data` itself is left untouched.
# NOTE(review): presumably meant to propagate 'Sentence #' to all tokens of a
# sentence - confirm that the other columns have no gaps that should stay empty.
data_fillna = data.ffill(axis=0)  # replaces fillna(method='ffill'): forward-fills values down each column

In [None]:
# Preview the first 25 rows after forward-filling.
data_fillna.head(25)

In [None]:
# Collapse the token rows into one row per 'Sentence #': each of 'Word', 'POS'
# and 'Tag' becomes the list of that sentence's values. 'Sentence #' stays a
# regular column because as_index=False.
sentence_groups = data_fillna.groupby('Sentence #', as_index=False)
data_group = sentence_groups[['Word', 'POS', 'Tag']].agg(list)


In [None]:
# Print the non-null count of each grouped column, one value per line.
for column in ('Sentence #', 'Word', 'POS', 'Tag'):
    print(data_group[column].count())

In [None]:
# 'Word' is expected to hold a list of strings per row; 'Phase' is those words
# joined with single spaces.
data_group['Phase'] = [' '.join(words) for words in data_group['Word']]


In [None]:
# Disable column-width truncation (a large number would work as well) so the
# whole sentence is printed.
pd.set_option("display.max_colwidth", None)
last_phrase = data_group['Phase'].tail(1)
print(last_phrase.to_string())


In [None]:
def remove_dublespaces(text, char):
    """Drop the space that precedes `char` wherever it is surrounded by spaces.

    Every occurrence of " <char> " in `text` becomes "<char> ". Text without
    that pattern is returned unchanged.

    Args:
        text: The string to clean.
        char: The symbol (one or more characters) to attach to the text before it.

    Returns:
        The cleaned string.
    """
    spaced = f' {char} '
    attached = f'{char} '
    return text.replace(spaced, attached)

In [None]:
# Apply remove_dublespaces for every symbol in pos_chars_unused to each sentence.
# The replacements are chained on the same string: the earlier version passed
# the original sentence to every call, so each symbol overwrote the previous
# result and only the last symbol's clean-up was kept.
cleaned_phrases = []
for phrase in data_group['Phase']:
    for char in pos_chars_unused:
        phrase = remove_dublespaces(phrase, char)
    cleaned_phrases.append(phrase)

# Assign once, after the loop, instead of writing into the column being iterated.
data_group['Phase'] = cleaned_phrases

In [None]:
# Inspect the first grouped sentence.
data_group.head(1)

In [None]:
import pandas as pd

# Build, for every sentence, the list of (start, end, POS) character spans of
# its words inside 'Phase' (end is exclusive).
#
# The lookup is progressive: each word is searched from the end of the previous
# match. str.index(word) always returned the first occurrence, so a repeated
# word, or a word that is a substring of an earlier one, received the wrong
# offsets. Words that cannot be located are skipped, as before, so every
# recorded offset is an integer.
positions_by_sentence = []
for _idx, row in data_group.iterrows():
    phrase = row['Phase']
    positions = []
    cursor = 0
    for word, pos_tag in zip(row['Word'], row['POS']):
        index = phrase.find(word, cursor)
        if index == -1:
            # Word not found in the sentence text.
            continue
        cursor = index + len(word)
        positions.append((index, cursor, pos_tag))
    positions_by_sentence.append(positions)

# One list per row, aligned on the frame's own index.
data_group['Position'] = pd.Series(
    positions_by_sentence, index=data_group.index, dtype=object
)


In [None]:
# Inspect the spans computed for the first sentence.
data_group['Position'].head(1)

In [None]:
# Inspect the first row with all columns.
data_group.head(1)

In [None]:
# For each sentence: split 'Phase' into a list of characters and, for every
# span in 'Position' whose label is one of pos_chars_unused, overwrite the
# character at the span's start offset with that label.
#
# NOTE(review): the result is stored in `row`, the per-row Series yielded by
# iterrows(). pandas documents that writing to that object may have no effect
# on the DataFrame, so data_group is presumably left unchanged by this cell.
# NOTE(review): the value stored under 'Position' is the character list, not a
# list of spans - confirm the intent before relying on this cell.
# NOTE(review): item[0] is used as a list index, so this only works while the
# start offsets in 'Position' are integers.

for idx, row in data_group.iterrows():
    items = row['Position']
    frase = list(row['Phase'])  # mutable, character-level copy of the sentence
    for item in items:
        if item[2] in pos_chars_unused:
            index = item[0]  # start offset of the span
            char = item[2]   # the span's label
            frase[index] = char
            row['Position'] = frase

In [None]:
import pandas as pd

def achar_posicoes(row):
    """Locate each word of a row inside its sentence text.

    Args:
        row: Mapping-like object with the keys 'Phase' (sentence string),
            'Word' and 'POS'. 'Word' and 'POS' may be lists or single values;
            a single value is treated as a one-element list.

    Returns:
        One tuple per (word, POS) pair, in order: (start, end, pos) with `end`
        exclusive when the word is found, or (None, None, pos) when it is not.
    """
    sentence = row['Phase']

    words = row['Word']
    if not isinstance(words, list):
        words = [words]

    tags = row['POS']
    if not isinstance(tags, list):
        tags = [tags]

    spans = []
    # Searching from the end of the previous match keeps repeated words from
    # all resolving to the first occurrence.
    cursor = 0
    for token, tag in zip(words, tags):
        found_at = sentence.find(token, cursor)
        if found_at != -1:
            end_at = found_at + len(token)
            spans.append((found_at, end_at, tag))
            cursor = end_at
        else:
            # Word not found: keep a placeholder and leave the cursor in place.
            spans.append((None, None, tag))
    return spans

# Create or overwrite the 'Position' column by calling achar_posicoes on each
# row (row-wise apply, axis=1).
data_group['Position'] = data_group.apply(achar_posicoes, axis=1)


In [None]:
# Inspect the last grouped sentence.
data_group.tail(1)

In [None]:
# Inspect the text of the last sentence.
data_group['Phase'].tail(1)

In [None]:
# One (text, {"entities": spans}) pair per sentence, in row order.
list_finished_dataset = [
    (phrase, {"entities": spans})
    for phrase, spans in zip(data_group["Phase"], data_group["Position"])
]

In [None]:
# Inspect the second (text, annotations) pair.
list_finished_dataset[1]

In [None]:
# Hold out the first 10% of the samples (rounded down) for evaluation; the
# remaining samples are used for training. No shuffling is done here.
cut = int(len(list_finished_dataset) * 0.1)
DEV_DATA, TRAIN_DATA = list_finished_dataset[:cut], list_finished_dataset[cut:]

In [None]:
# Drop training samples whose text contains the character '\x85'.
TRAIN_DATA = list(filter(lambda sample: '\x85' not in sample[0], TRAIN_DATA))


In [None]:
# Drop evaluation samples whose text contains '\x85' or '\x94'.
DEV_DATA = [
    sample
    for sample in DEV_DATA
    if not any(bad_char in sample[0] for bad_char in ('\x85', '\x94'))
]


In [None]:
# %pip install -U "spacy>=3.7"   (keep the quotes: an unquoted ">" is treated by the shell as an output redirect)
import random
import spacy
from spacy.util import minibatch
from spacy.training import Example
from spacy.scorer import Scorer

In [None]:
# -----------------------------
# 1) Mock data (corrected)
# -----------------------------
# Each sample is (text, {"entities": [(start_char, end_char, label), ...]}),
# with end_char exclusive.
# NOTE(review): these assignments replace the TRAIN_DATA / DEV_DATA built from
# the CSV in the cells above, so the cells below train and evaluate on this toy
# data only - confirm that this is intended.
TRAIN_DATA = [
    ("João mora em São Paulo.", {"entities": [(0, 4, "PER"), (13, 22, "LOC")]}),
    ("Maria trabalha na Google em Belo Horizonte.", {"entities": [(0, 5, "PER"), (18, 24, "ORG"), (28, 42, "LOC")]}),
    ("A Apple comprou a Embraer.", {"entities": [(2, 7, "ORG"), (18, 25, "ORG")]}),
    ("A Ana visitou o Rio de Janeiro.", {"entities": [(2, 5, "PER"), (16, 30, "LOC")]}),
    ("Pedro nasceu em Lisboa.", {"entities": [(0, 5, "PER"), (16, 22, "LOC")]}),
    ("A Microsoft abriu escritório em Recife.", {"entities": [(2, 11, "ORG"), (32, 38, "LOC")]}),
]

# Held-out samples in the same format, used for evaluation.
DEV_DATA = [
    ("Carla foi para Porto Alegre ontem.", {"entities": [(0, 5, "PER"), (15, 27, "LOC")]}),
    ("Google contratou João em 2024.", {"entities": [(0, 6, "ORG"), (17, 21, "PER")]}),
]

In [None]:
import spacy

# Ask spaCy to use the GPU when one is usable; prefer_gpu() returns whether it
# was activated.
print("GPU disponível? ->", spacy.prefer_gpu())

# Exercise cupy directly. cupy is an optional dependency, so a missing package
# is reported instead of aborting the notebook on a CPU-only install.
try:
    import cupy
except ImportError:
    print("cupy não está instalado; teste de GPU ignorado.")
else:
    x = cupy.arange(10)
    print("Rodou no device:", x.device)


In [None]:
# --------------------------------------
# 2) Build an NER pipeline from scratch
# --------------------------------------
# The blank pipeline is created with the "en" language code and has no trained
# vocabulary; "ner" is its only component.
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Register every label that appears in the training or evaluation data, in
# order of appearance.
seen_labels = (
    label
    for _text, ann in TRAIN_DATA + DEV_DATA
    for _start, _end, label in ann.get("entities", [])
)
for label in seen_labels:
    ner.add_label(label)

# Helper that converts (text, annotations) pairs into spaCy Example objects.
def make_examples(nlp, data):
    """Turn annotated texts into a list of Example objects.

    Args:
        nlp: Pipeline whose make_doc() is used to tokenize each text.
        data: Iterable of (text, {"entities": [(start, end, label), ...]}).

    Returns:
        One Example per item of `data`, in the same order.

    Raises:
        ValueError: If a (start, end) pair cannot be mapped to a token span
            with alignment_mode="contract".
    """

    def _to_example(text, ann):
        doc = nlp.make_doc(text)
        aligned = []
        for start, end, label in ann["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                raise ValueError(f"Span inválido em: {text!r} -> {(start, end, label)}")
            # Keep the offsets of the span that was actually aligned to tokens.
            aligned.append((span.start_char, span.end_char, span.label_))
        return Example.from_dict(doc, {"entities": aligned})

    return [_to_example(text, ann) for text, ann in data]

# Convert both splits into Example lists using the pipeline's tokenizer.
train_examples = make_examples(nlp, TRAIN_DATA)
dev_examples   = make_examples(nlp, DEV_DATA)

In [None]:
# --------------------------------------
# 3) Training
# --------------------------------------
# Fix the random seed so that weight initialisation, shuffling and dropout
# give the same result on every run.
SEED = 0
spacy.util.fix_random_seed(SEED)

# Disable every component except 'ner' while training (here 'ner' is the only
# one). select_pipes() is the current API; disable_pipes() is deprecated.
other_pipes = [p for p in nlp.pipe_names if p != "ner"]
with nlp.select_pipes(disable=other_pipes):
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    n_iter = 30
    for itn in range(1, n_iter + 1):
        random.shuffle(train_examples)
        losses = {}
        # Fixed-size minibatches of 4 examples.
        for batch in minibatch(train_examples, size=4):
            nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
        # Log the first and last iteration and every fifth one in between.
        if itn % 5 == 0 or itn == 1 or itn == n_iter:
            print(f"Iter {itn:02d} - loss: {losses.get('ner', 0):.4f}")

In [None]:
# 📌 Cell 4 – Evaluation (corrected)
def evaluate(nlp, examples):
    """Score the pipeline's entity predictions against reference annotations.

    Args:
        nlp: Pipeline used to predict on each example's text.
        examples: Example objects whose `reference` holds the gold annotations.

    Returns:
        Dict with "precision", "recall" and "f1", taken from the scorer's
        "ents_p", "ents_r" and "ents_f" entries.
    """
    texts = [gold.text for gold in examples]
    # Pair each fresh prediction with the gold reference of the same example.
    pred_examples = [
        Example(pred_doc, gold.reference)
        for pred_doc, gold in zip(nlp.pipe(texts), examples)
    ]

    # The scorer takes the whole list of Example objects at once.
    scores = Scorer().score(pred_examples)
    metric_keys = (("precision", "ents_p"), ("recall", "ents_r"), ("f1", "ents_f"))
    return {name: scores[key] for name, key in metric_keys}

# Evaluate the trained pipeline on the held-out examples and print the scores.
metrics = evaluate(nlp, dev_examples)
print("DEV metrics:", metrics)


In [None]:

# --------------------------------------
# 5) Quick smoke test
# --------------------------------------
# Run the pipeline on a few unseen sentences and list the entities it predicts.
tests = [
    "Marcos works on Apple in São Paulo.",
    "the Embraer is locaded in Brasil.",
    "Ana fly to new york.",
]
for t, doc in zip(tests, map(nlp, tests)):
    print("\nTexto:", t)
    for ent in doc.ents:
        print(f" - {ent.text:<20} {ent.label_}")


In [None]:
# 📌 Cell 7 – Save the trained model
# Serialize the whole pipeline to a directory relative to the working directory.
output_dir = "modelo_ner_en"
nlp.to_disk(output_dir)
print(f"Modelo salvo em: {output_dir}")


In [None]:
# 📌 Célula 8 – Carregar o modelo treinado
import spacy
# Load the pipeline back from the "modelo_ner_en" directory.
nlp_carregado = spacy.load("modelo_ner_en")

# Quick test: print each predicted entity and its label.
doc = nlp_carregado("Tom went to Paris and met with Alice.")
for ent in doc.ents:
    print(ent.text, ent.label_)
