In [31]:
import spacy
import random
import jsonlines
from spacy.training import Example, offsets_to_biluo_tags
from spacy.tokens import DocBin
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import os

In [32]:
# Name of the pretrained spaCy pipeline that serves as the starting point
BASE_MODEL_NAME = "pt_core_news_md"
nlp = spacy.load(BASE_MODEL_NAME)

In [33]:
# Read every record of the JSON Lines file into an in-memory list
data = []
with jsonlines.open('dataset.jsonl', 'r') as reader:
    data.extend(reader)

In [34]:
# Shuffle the dataset to randomize the order.
# The global RNG is seeded first: train_test_split below is called with a fixed
# random_state, but that only makes the split reproducible if the list it
# receives is in the same order on every run. Seeding here also makes the
# later random.shuffle calls in this notebook repeatable after a kernel restart.
random.seed(42)
random.shuffle(data)

In [35]:
# Function to map character-offset entities onto SpaCy's token boundaries
def adjust_entity_spans(doc, entities, alignment_mode="strict"):
    """Keep the entity annotations that can be expressed as token spans of ``doc``.

    Args:
        doc: Tokenised document exposing
            ``char_span(start, end, label=..., alignment_mode=...)``.
        entities: Iterable of ``(start_char, end_char, label)`` triples.
        alignment_mode: Forwarded unchanged to ``doc.char_span``. The default
            ``"strict"`` reproduces the previous behaviour: offsets that do not
            line up with token boundaries yield no span, and that entity is
            silently dropped. Other modes accepted by ``char_span`` (spaCy
            documents ``"contract"`` and ``"expand"``) snap the offsets to token
            boundaries instead of dropping the entity.
            NOTE(review): snapped spans may overlap each other; confirm the
            consumer tolerates that before switching modes.

    Returns:
        A list of ``(start_char, end_char, label)`` tuples, in input order,
        one per entity for which ``char_span`` returned a span. The offsets
        are taken from the returned span, not from the input triple.
    """
    adjusted_entities = []
    for start, end, label in entities:
        span = doc.char_span(start, end, label=label, alignment_mode=alignment_mode)
        if span is None:
            # No token-aligned span for these offsets under the chosen mode.
            continue
        adjusted_entities.append((span.start_char, span.end_char, label))
    return adjusted_entities

In [36]:
# Build SpaCy-style (text, annotations) pairs from the raw dataset records
def convert_data_for_spacy(data, nlp):
    """Turn raw records into ``(text, {"entities": [...]})`` tuples.

    Each record is read through its ``'text'`` and ``'label'`` keys. The text
    is tokenised with ``nlp.make_doc`` and the labels are passed through
    ``adjust_entity_spans`` before being stored under ``"entities"``.

    Returns:
        A list with one ``(text, annotations)`` tuple per input record, in
        input order.
    """
    def to_pair(record):
        raw_text = record['text']
        tokenised = nlp.make_doc(raw_text)
        aligned = adjust_entity_spans(tokenised, record['label'])
        return raw_text, {"entities": aligned}

    return [to_pair(record) for record in data]

In [37]:
# Hold out 20% of the records as a test set
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Run both partitions through the same conversion / span-alignment step
training_data, test_data_spacy = (
    convert_data_for_spacy(subset, nlp) for subset in (train_data, test_data)
)

In [38]:
# Reuse the pipeline's NER component when it has one; otherwise create it
ner = nlp.get_pipe('ner') if 'ner' in nlp.pipe_names else nlp.add_pipe('ner')

# Register every entity label seen in the training annotations with the NER component
for _text, train_annotations in training_data:
    for entity in train_annotations.get("entities"):
        # entity is a (start_char, end_char, label) triple; index 2 is the label
        ner.add_label(entity[2])

In [39]:
# Names of every pipeline component except 'ner'; these are switched off during training
unaffected_pipes = [name for name in nlp.pipe_names if not name == 'ner']

In [40]:
# Fine-tune the model
N_ITERATIONS = 20  # number of passes over the training set
DROPOUT = 0.5      # dropout rate handed to nlp.update

# Building an Example only tokenises the text and attaches the gold
# annotations, so it does not depend on the training iteration. Build the
# Examples once instead of once per iteration.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in training_data
]

# select_pipes(disable=...) is the spaCy v3 replacement for the deprecated
# disable_pipes(); the listed pipes are restored when the block exits.
with nlp.select_pipes(disable=unaffected_pipes):
    optimizer = nlp.resume_training()
    for iteration in range(N_ITERATIONS):
        random.shuffle(train_examples)
        losses = {}
        for example in train_examples:
            # One example per update, as before; the optimizer is now passed explicitly.
            nlp.update([example], sgd=optimizer, drop=DROPOUT, losses=losses)
        print(f"Iteration {iteration + 1}, Losses: {losses}")

Iteration 1, Losses: {'ner': 2575.7900014288425}
Iteration 2, Losses: {'ner': 1698.62290668682}
Iteration 3, Losses: {'ner': 1549.0265327694733}
Iteration 4, Losses: {'ner': 1400.308129452041}
Iteration 5, Losses: {'ner': 1276.1925675071354}
Iteration 6, Losses: {'ner': 1222.2812704422956}
Iteration 7, Losses: {'ner': 1130.0735562070752}
Iteration 8, Losses: {'ner': 1077.2548294683036}
Iteration 9, Losses: {'ner': 1065.8895227458945}
Iteration 10, Losses: {'ner': 1030.9961921576416}
Iteration 11, Losses: {'ner': 989.7392447534589}
Iteration 12, Losses: {'ner': 956.0553419825991}
Iteration 13, Losses: {'ner': 963.5632797292383}
Iteration 14, Losses: {'ner': 888.8537368418592}
Iteration 15, Losses: {'ner': 907.4875525059841}
Iteration 16, Losses: {'ner': 856.424840624886}
Iteration 17, Losses: {'ner': 864.6060333757406}
Iteration 18, Losses: {'ner': 800.9673997441838}
Iteration 19, Losses: {'ner': 776.9984568839478}
Iteration 20, Losses: {'ner': 787.432477061417}


In [41]:
# Save the fine-tuned model
output_dir = "3VA/model_md"
# exist_ok=True creates the (nested) directory when it is missing and does
# nothing when it already exists, replacing the separate exists() check.
os.makedirs(output_dir, exist_ok=True)
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")

Model saved to 3VA/model_md


In [42]:
# Test the fine-tuned model: reload the pipeline from the directory it was
# saved to, so the cells below run against the on-disk model. This rebinds `nlp`.
nlp = spacy.load(output_dir)

In [43]:
# Span-level evaluation: every (start_char, end_char) span that appears in the
# gold annotations or in the predictions contributes one (true, predicted)
# label pair. A span missing on one side gets the placeholder NO_ENTITY there.
NO_ENTITY = "O"
DEBUG_EXAMPLES = 3  # how many test items to echo for eyeballing; 0 silences the dump

# Initialize lists for true and predicted labels
true_labels = []
pred_labels = []

for index, (text, annotations) in enumerate(test_data_spacy):
    doc = nlp(text)
    pred_entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    true_entities = annotations["entities"]

    # Print entities for debugging (first few items only, to keep the output readable)
    if index < DEBUG_EXAMPLES:
        print(text)
        print(doc.ents)
        print("Predicted:", pred_entities)
        print("True:", true_entities)
        print("\n")

    # Index predicted and true labels by their character span
    pred_dict = {(start, end): label for start, end, label in pred_entities}
    true_dict = {(start, end): label for start, end, label in true_entities}

    # Compare the labels over the union of spans from both sides
    for span in sorted(pred_dict.keys() | true_dict.keys()):
        true_labels.append(true_dict.get(span, NO_ENTITY))
        pred_labels.append(pred_dict.get(span, NO_ENTITY))

# Calculate precision, recall, and F1-score.
# NO_ENTITY is excluded from the reported labels: a span is never NO_ENTITY on
# both sides, so that pseudo-class can have no true positives and would always
# score 0, dragging down the averages. Misses and spurious spans still count
# against the real labels' recall and precision.
entity_labels = sorted((set(true_labels) | set(pred_labels)) - {NO_ENTITY})
if entity_labels:
    print(classification_report(true_labels, pred_labels, labels=entity_labels, zero_division=0))
else:
    print("No entity spans found in the test set or in the predictions.")

Soundbar 2.1 Canais Bluetooth 180W RMS com Subwoofer e USB | GT
(Soundbar, Bluetooth, RMS, com Subwoofer, USB, GT)
Predicted: [(0, 8, 'Categoria'), (20, 29, 'Conectividade'), (35, 38, 'Caracteristicas'), (39, 52, 'Item adicional'), (55, 58, 'Conectividade'), (61, 63, 'Marca')]
True: [(9, 19, 'Modelo'), (30, 38, 'Potência'), (39, 52, 'Item adicional'), (61, 63, 'Marca')]


Fone de Ouvido JBL Wireless Bluetooth T205 BT, Rose Gold
(Fone de Ouvido, JBL, Wireless, Bluetooth, T205 BT, Rose Gold)
Predicted: [(0, 14, 'Categoria'), (15, 18, 'Marca'), (19, 27, 'Conectividade'), (28, 37, 'Conectividade'), (38, 45, 'Modelo'), (47, 56, 'Modelo')]
True: [(0, 14, 'Categoria'), (15, 18, 'Marca'), (19, 27, 'Tipo'), (28, 37, 'Conectividade'), (38, 45, 'Modelo'), (47, 56, 'Cor')]


Vitrola Retro Bluetooth Pulse Morrison - SP613
(Vitrola, Retro, Bluetooth, Pulse, Morrison, SP613)
Predicted: [(0, 7, 'Categoria'), (8, 13, 'Caracteristicas'), (14, 23, 'Conectividade'), (24, 29, 'Marca'), (30, 38, 'Marca'), (

In [44]:
# Wrap each held-out (text, annotations) pair in a spaCy Example
examples = []
for text, annotations in test_data_spacy:
    reference_doc = nlp.make_doc(text)
    examples.append(Example.from_dict(reference_doc, annotations))

# Score the pipeline with SpaCy's built-in evaluate function
results = nlp.evaluate(examples)

# Report the overall entity scores, then the per-entity-type breakdown
print(f"Precision: {results['ents_p']:.4f}")
print(f"Recall: {results['ents_r']:.4f}")
print(f"F1-score: {results['ents_f']:.4f}")
print(f"Entity-wise results: {results['ents_per_type']}")

Precision: 0.7243
Recall: 0.7668
F1-score: 0.7450
Entity-wise results: {'Categoria': {'p': 0.8494623655913979, 'r': 0.8229166666666666, 'f': 0.8359788359788359}, 'Conectividade': {'p': 0.7037037037037037, 'r': 0.7676767676767676, 'f': 0.7342995169082125}, 'Caracteristicas': {'p': 0.5373134328358209, 'r': 0.6792452830188679, 'f': 0.5999999999999999}, 'Item adicional': {'p': 0.75, 'r': 0.5, 'f': 0.6}, 'Marca': {'p': 0.8571428571428571, 'r': 0.9574468085106383, 'f': 0.9045226130653266}, 'Modelo': {'p': 0.6491228070175439, 'r': 0.592, 'f': 0.6192468619246863}, 'Potência': {'p': 0.68, 'r': 0.6296296296296297, 'f': 0.6538461538461539}, 'Cor': {'p': 0.8723404255319149, 'r': 0.9111111111111111, 'f': 0.891304347826087}, 'Tipo': {'p': 0.7647058823529411, 'r': 0.9285714285714286, 'f': 0.8387096774193549}, 'Tamanho': {'p': 0.15384615384615385, 'r': 0.4, 'f': 0.2222222222222222}, 'Tensão': {'p': 0.0, 'r': 0.0, 'f': 0.0}}


In [45]:
# Run the pipeline on one hand-written product title as a quick sanity check
sample_text = "Fone JBL TWS Wave Buds, Auricular, Bluetooth, Preto - JBLWBUDSBLK"
doc = nlp(sample_text)

# One line per recognised entity: text, start offset, end offset, label
for entity in doc.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)

Fone 0 4 Categoria
JBL 5 8 Marca
TWS 9 12 Conectividade
Wave Buds 13 22 Modelo
Auricular 24 33 Tipo
Bluetooth 35 44 Conectividade
Preto 46 51 Cor
