In [33]:
import spacy
import random
import jsonlines
from spacy.training import Example, offsets_to_biluo_tags
from spacy.tokens import DocBin
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import os

In [34]:
# Load the SpaCy model
nlp = spacy.load("pt_core_news_sm")

In [35]:
data = []
with jsonlines.open('dataset.jsonl', 'r') as reader:
    for line in reader:
        data.append(line)

In [36]:
# Shuffle the dataset to randomize the order
random.shuffle(data)

In [37]:
# Function to adjust entity spans according to SpaCy's token boundaries
def adjust_entity_spans(doc, entities):
    adjusted_entities = []
    for start, end, label in entities:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            adjusted_entities.append((span.start_char, span.end_char, label))
    return adjusted_entities

In [38]:
# Convert the dataset to SpaCy format and adjust entity spans
def convert_data_for_spacy(data, nlp):
    spacy_format = []
    for item in data:
        text = item['text']
        doc = nlp.make_doc(text)
        entities = adjust_entity_spans(doc, item['label'])
        spacy_format.append((text, {"entities": entities}))
    return spacy_format

In [39]:
# Split the data into training and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Convert and adjust the training and test data
training_data = convert_data_for_spacy(train_data, nlp)
test_data_spacy = convert_data_for_spacy(test_data, nlp)

In [40]:
# Add the NER pipeline if not already present
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner')
else:
    ner = nlp.get_pipe('ner')

# Add custom labels to the NER component
for _, annotations in training_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [41]:
# Disable other pipeline components during training
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

In [42]:
# Fine-tune the model
with nlp.disable_pipes(*unaffected_pipes):
    optimizer = nlp.resume_training()
    for iteration in range(20):  # You can change the number of iterations
        random.shuffle(training_data)
        losses = {}
        for text, annotations in training_data:
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update([example], drop=0.5, losses=losses)
        print(f"Iteration {iteration + 1}, Losses: {losses}")

Iteration 1, Losses: {'ner': 2668.9379402744116}
Iteration 2, Losses: {'ner': 1783.0122555730406}
Iteration 3, Losses: {'ner': 1522.755116702746}
Iteration 4, Losses: {'ner': 1413.0859480122726}
Iteration 5, Losses: {'ner': 1298.0295452832959}
Iteration 6, Losses: {'ner': 1254.0821492908017}
Iteration 7, Losses: {'ner': 1158.2312676711963}
Iteration 8, Losses: {'ner': 1145.6000367498061}
Iteration 9, Losses: {'ner': 1085.2465623528688}
Iteration 10, Losses: {'ner': 998.3618262109056}
Iteration 11, Losses: {'ner': 966.9168154318679}
Iteration 12, Losses: {'ner': 959.5886045900882}
Iteration 13, Losses: {'ner': 937.3116329223598}
Iteration 14, Losses: {'ner': 908.6292592632087}
Iteration 15, Losses: {'ner': 863.9349467169569}
Iteration 16, Losses: {'ner': 832.4097065164898}
Iteration 17, Losses: {'ner': 768.4486479955392}
Iteration 18, Losses: {'ner': 836.4067834722406}
Iteration 19, Losses: {'ner': 757.6957499654941}
Iteration 20, Losses: {'ner': 786.9169786333001}


In [43]:
# Save the fine-tuned model
output_dir = "3VA/model_sm"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")

Model saved to 3VA/model_sm


In [44]:
# Test the fine-tuned model
nlp = spacy.load(output_dir)

In [45]:
# Initialize lists for true and predicted labels
true_labels = []
pred_labels = []

for text, annotations in test_data_spacy:
    doc = nlp(text)
    pred_entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    true_entities = annotations["entities"]

    # Print entities for debugging
    print(text)
    print(doc.ents)
    print("Predicted:", pred_entities)
    print("True:", true_entities)
    print("\n")

    # Create a dictionary to store the predicted and true labels with their spans
    pred_dict = {(start, end): label for start, end, label in pred_entities}
    true_dict = {(start, end): label for start, end, label in true_entities}

    # Union of all spans (keys) from both true and predicted dictionaries
    all_spans = set(pred_dict.keys()).union(set(true_dict.keys()))

    # Compare the labels based on spans
    for span in all_spans:
        true_labels.append(true_dict.get(span, "O"))  # "O" represents no entity in the true labels
        pred_labels.append(pred_dict.get(span, "O"))  # "O" represents no entity in the predicted labels

# Calculate precision, recall, and F1-score
print(classification_report(true_labels, pred_labels))

Fone de Ouvido Bluetooth JBL Tune 510BT Preto JBLT510BTBLK
(Fone de Ouvido, Bluetooth, JBL, Tune 510BT, Preto, JBLT510BTBLK)
Predicted: [(0, 14, 'Categoria'), (15, 24, 'Conectividade'), (25, 28, 'Marca'), (29, 39, 'Modelo'), (40, 45, 'Cor'), (46, 58, 'Modelo')]
True: [(0, 14, 'Categoria'), (15, 24, 'Conectividade'), (25, 28, 'Marca'), (29, 39, 'Modelo'), (40, 45, 'Cor'), (46, 58, 'Modelo')]


Over Ear Stereo Áudio - PH148
(Over Ear, Stereo, Áudio, PH148)
Predicted: [(0, 8, 'Categoria'), (9, 15, 'Caracteristicas'), (16, 21, 'Caracteristicas'), (24, 29, 'Modelo')]
True: [(0, 8, 'Categoria'), (9, 15, 'Caracteristicas'), (16, 21, 'Caracteristicas'), (24, 29, 'Modelo')]


Fone de Ouvido Intra-auricular c/ Microfone
(Fone de Ouvido, Intra-auricular, c/ Microfone)
Predicted: [(0, 14, 'Categoria'), (15, 30, 'Tipo'), (31, 43, 'Caracteristicas')]
True: [(0, 14, 'Categoria'), (15, 30, 'Tipo'), (31, 43, 'Caracteristicas')]


Pulse Wi-fi Speaker Smarty - SP358
(Pulse, Wi-fi Speaker, Smarty, SP358)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [46]:
# Convert test data to spaCy Examples
examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in test_data_spacy]

# Evaluate the model using SpaCy's built-in evaluate function
results = nlp.evaluate(examples)

# Print the evaluation results
print(f"Precision: {results['ents_p']:.4f}")
print(f"Recall: {results['ents_r']:.4f}")
print(f"F1-score: {results['ents_f']:.4f}")
print(f"Entity-wise results: {results['ents_per_type']}")

Precision: 0.7214
Recall: 0.7860
F1-score: 0.7523
Entity-wise results: {'Categoria': {'p': 0.8285714285714286, 'r': 0.8613861386138614, 'f': 0.8446601941747574}, 'Conectividade': {'p': 0.6881720430107527, 'r': 0.7901234567901234, 'f': 0.735632183908046}, 'Marca': {'p': 0.83, 'r': 0.9021739130434783, 'f': 0.8645833333333334}, 'Modelo': {'p': 0.6296296296296297, 'r': 0.6640625, 'f': 0.6463878326996197}, 'Cor': {'p': 0.9333333333333333, 'r': 0.9333333333333333, 'f': 0.9333333333333333}, 'Caracteristicas': {'p': 0.5211267605633803, 'r': 0.6981132075471698, 'f': 0.5967741935483871}, 'Tipo': {'p': 0.8157894736842105, 'r': 0.96875, 'f': 0.8857142857142857}, 'Potência': {'p': 0.5714285714285714, 'r': 0.64, 'f': 0.6037735849056605}, 'Tamanho': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'Item adicional': {'p': 1.0, 'r': 0.2, 'f': 0.33333333333333337}, 'Tensão': {'p': 0.4, 'r': 1.0, 'f': 0.5714285714285715}}


In [47]:
# Example text to test the model
doc = nlp("Fone JBL TWS Wave Buds, Auricular, Bluetooth, Preto - JBLWBUDSBLK")

# Print the entities recognized by the model
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Fone 0 4 Categoria
JBL 5 8 Marca
TWS Wave Buds 9 22 Modelo
Auricular 24 33 Tipo
Bluetooth 35 44 Conectividade
Preto 46 51 Cor
