In [15]:
import spacy
import random
import jsonlines
from spacy.training import Example, offsets_to_biluo_tags
from spacy.tokens import DocBin
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import os

In [16]:
# Load the pretrained "pt_core_news_lg" spaCy pipeline; the cells below fine-tune its
# NER component. spacy.load resolves the name to an installed package (or a path), so
# the model has to be available in the kernel's environment before this cell runs.
nlp = spacy.load("pt_core_news_lg")

In [17]:
# Read every record of the JSON Lines dataset into an in-memory list
data = []
with jsonlines.open('dataset.jsonl', 'r') as reader:
    data.extend(reader)

In [18]:
# Shuffle the dataset to randomize the order.
# Python's RNG is seeded first: without it the record order differs on every kernel
# restart, so the train/test split and the per-iteration training shuffles below would
# not be reproducible. This only seeds the stdlib `random` module, not numpy/spaCy.
random.seed(42)
random.shuffle(data)

In [19]:
# Function to adjust entity spans according to SpaCy's token boundaries
def adjust_entity_spans(doc, entities, alignment_mode="strict"):
    """Map character-offset entity annotations onto token-aligned spans of ``doc``.

    Args:
        doc: Tokenized document exposing
            ``char_span(start, end, label=..., alignment_mode=...)``.
        entities: Iterable of ``(start_char, end_char, label)`` triples; ``None``
            is treated as "no entities".
        alignment_mode: Forwarded to ``doc.char_span``. The default ``"strict"``
            keeps the previous behavior: offsets that do not fall exactly on token
            boundaries yield no span and the annotation is dropped. ``"expand"`` or
            ``"contract"`` let spaCy snap the offsets to token boundaries instead,
            in which case the returned offsets may differ from the input ones.

    Returns:
        List of ``(start_char, end_char, label)`` tuples, in input order, for the
        annotations that ``char_span`` could resolve to a span.
    """
    adjusted_entities = []
    for start, end, label in (entities or ()):
        span = doc.char_span(start, end, label=label, alignment_mode=alignment_mode)
        if span is None:
            # No token-aligned span exists for these offsets under the chosen
            # alignment mode, so the annotation cannot be used for training.
            continue
        # Use the span's own offsets: they are the (possibly snapped) token boundaries.
        adjusted_entities.append((span.start_char, span.end_char, label))
    return adjusted_entities

In [20]:
# Build spaCy-style (text, annotations) tuples from the raw dataset records
def convert_data_for_spacy(data, nlp):
    """Turn raw records into ``(text, {"entities": [...]})`` tuples.

    Each record is read through its ``'text'`` and ``'label'`` keys. The text is
    tokenized with ``nlp.make_doc`` and the record's labels are passed through
    ``adjust_entity_spans`` against that tokenization.

    Returns:
        A list with one ``(text, {"entities": entities})`` tuple per input record,
        in input order.
    """
    def to_training_tuple(record):
        raw_text = record['text']
        tokenized = nlp.make_doc(raw_text)
        aligned = adjust_entity_spans(tokenized, record['label'])
        return raw_text, {"entities": aligned}

    return [to_training_tuple(record) for record in data]

In [21]:
# Hold out 20% of the records as a test set (the split itself uses a fixed random_state)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Put both partitions into spaCy's (text, annotations) format, training partition first
training_data, test_data_spacy = (
    convert_data_for_spacy(subset, nlp) for subset in (train_data, test_data)
)

In [22]:
# Reuse the pipeline's NER component when it has one; otherwise add a new one
ner = nlp.get_pipe('ner') if 'ner' in nlp.pipe_names else nlp.add_pipe('ner')

# Register every entity label that appears in the training annotations
for _, annotations in training_data:
    for _start, _end, label in annotations.get("entities"):
        ner.add_label(label)

In [23]:
# Names of every pipeline component except NER; these get switched off while fine-tuning
unaffected_pipes = [name for name in nlp.pipe_names if name != 'ner']

In [24]:
# Fine-tune the model: only the NER component is updated.
# select_pipes replaces the deprecated disable_pipes and restores the disabled
# components when the with-block exits.
with nlp.select_pipes(disable=unaffected_pipes):
    # resume_training returns the optimizer to use for continuing training of an
    # existing pipeline; it is passed explicitly to every update below.
    optimizer = nlp.resume_training()
    for iteration in range(20):  # You can change the number of iterations
        # Present the examples in a different order each iteration
        random.shuffle(training_data)
        losses = {}  # accumulated by nlp.update over the whole iteration
        for text, annotations in training_data:
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update([example], sgd=optimizer, drop=0.5, losses=losses)
        print(f"Iteration {iteration + 1}, Losses: {losses}")

Iteration 1, Losses: {'ner': 2551.2318070686133}
Iteration 2, Losses: {'ner': 1698.2029801005035}
Iteration 3, Losses: {'ner': 1472.3969145690364}
Iteration 4, Losses: {'ner': 1375.2522923187178}
Iteration 5, Losses: {'ner': 1261.1235590317349}
Iteration 6, Losses: {'ner': 1213.7932695945103}
Iteration 7, Losses: {'ner': 1183.4964909293676}
Iteration 8, Losses: {'ner': 1083.0945156105786}
Iteration 9, Losses: {'ner': 1057.242941091014}
Iteration 10, Losses: {'ner': 1017.5459326620361}
Iteration 11, Losses: {'ner': 973.8865147019666}
Iteration 12, Losses: {'ner': 987.6464811715355}
Iteration 13, Losses: {'ner': 960.1715961225865}
Iteration 14, Losses: {'ner': 917.710470902036}
Iteration 15, Losses: {'ner': 894.5995577302469}
Iteration 16, Losses: {'ner': 877.945912405704}
Iteration 17, Losses: {'ner': 870.4786200633469}
Iteration 18, Losses: {'ner': 846.7784702281651}
Iteration 19, Losses: {'ner': 816.2684510419485}
Iteration 20, Losses: {'ner': 784.9000902588084}


In [25]:
# Save the fine-tuned model
output_dir = "3VA/model_lg"
# exist_ok=True creates the directory (including parents) if needed and is a no-op
# when it already exists, avoiding the separate check-then-create step.
os.makedirs(output_dir, exist_ok=True)
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")

Model saved to 3VA/model_lg


In [26]:
# Test the fine-tuned model
# Reload the pipeline from output_dir (the directory written by nlp.to_disk above) so the
# evaluation cells run against the serialized model. Note that this rebinds `nlp`.
nlp = spacy.load(output_dir)

In [27]:
# Span-level evaluation: one (true label, predicted label) pair per distinct
# (start_char, end_char) span seen in either the gold or the predicted entities.
true_labels = []
pred_labels = []

for text, annotations in test_data_spacy:
    doc = nlp(text)
    pred_entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    true_entities = annotations["entities"]

    # Print entities for debugging
    print(text)
    print(doc.ents)
    print("Predicted:", pred_entities)
    print("True:", true_entities)
    print("\n")

    # Index predicted and true labels by their character span
    pred_dict = {(start, end): label for start, end, label in pred_entities}
    true_dict = {(start, end): label for start, end, label in true_entities}

    # Every span that appears on either side; sorted so the pair order is deterministic
    all_spans = sorted(set(pred_dict) | set(true_dict))

    # A span missing on one side gets the placeholder "O" there, i.e. it counts as a
    # missed entity (pred "O") or a spurious prediction (true "O").
    for span in all_spans:
        true_labels.append(true_dict.get(span, "O"))
        pred_labels.append(pred_dict.get(span, "O"))

# Calculate precision, recall, and F1-score.
# "O" is only a placeholder for "no entity at this span", not a real class, so it is
# left out of the reported labels; otherwise it shows up as its own row and skews the
# averages. Per-label precision/recall are unaffected by this restriction.
entity_labels = sorted((set(true_labels) | set(pred_labels)) - {"O"})
if entity_labels:
    print(classification_report(true_labels, pred_labels, labels=entity_labels, zero_division=0))
else:
    print("No entity spans found in the test data; nothing to report.")

Mini Torre Neon 2 Led Rgb 250w Bt/Sd/Fm/Aux/Tws Multilaser – SP400
(Mini Torre, Neon 2, Led Rgb, 250w, Bt/Sd/Fm/Aux/Tws, Multilaser, SP400)
Predicted: [(0, 10, 'Categoria'), (11, 17, 'Modelo'), (18, 25, 'Caracteristicas'), (26, 30, 'Potência'), (31, 47, 'Conectividade'), (48, 58, 'Marca'), (61, 66, 'Modelo')]
True: [(0, 10, 'Categoria'), (16, 25, 'Caracteristicas'), (26, 30, 'Potência'), (31, 47, 'Conectividade'), (48, 58, 'Marca'), (61, 66, 'Modelo')]


Fone De Ouvido Multilaser Monster High P2 - PH105
(Fone De Ouvido, Multilaser, Monster High P2, PH105)
Predicted: [(0, 14, 'Categoria'), (15, 25, 'Marca'), (26, 41, 'Modelo'), (44, 49, 'Modelo')]
True: [(0, 14, 'Categoria'), (15, 25, 'Marca'), (26, 38, 'Modelo'), (39, 41, 'Tipo'), (44, 49, 'Modelo')]


Fone JBL TWS Wave Buds, Auricular, Bluetooth, Preto - JBLWBUDSBLK
(Fone JBL, TWS Wave Buds, Auricular, Bluetooth, Preto)
Predicted: [(0, 8, 'Categoria'), (9, 22, 'Modelo'), (24, 33, 'Tipo'), (35, 44, 'Conectividade'), (46, 51, 'Cor')]
Tr

In [28]:
# Wrap each test item in a spaCy Example: a fresh tokenization plus its gold annotations
examples = []
for text, annotations in test_data_spacy:
    reference_doc = nlp.make_doc(text)
    examples.append(Example.from_dict(reference_doc, annotations))

# Score the pipeline on those examples with spaCy's own evaluation routine
results = nlp.evaluate(examples)

# Report the overall entity scores, then the per-label breakdown
print(f"Precision: {results['ents_p']:.4f}")
print(f"Recall: {results['ents_r']:.4f}")
print(f"F1-score: {results['ents_f']:.4f}")
print(f"Entity-wise results: {results['ents_per_type']}")

Precision: 0.7982
Recall: 0.7438
F1-score: 0.7700
Entity-wise results: {'Categoria': {'p': 0.9191919191919192, 'r': 0.91, 'f': 0.9145728643216081}, 'Modelo': {'p': 0.6587301587301587, 'r': 0.6240601503759399, 'f': 0.6409266409266409}, 'Caracteristicas': {'p': 0.6818181818181818, 'r': 0.5454545454545454, 'f': 0.6060606060606061}, 'Potência': {'p': 0.7857142857142857, 'r': 0.6666666666666666, 'f': 0.721311475409836}, 'Conectividade': {'p': 0.7349397590361446, 'r': 0.7011494252873564, 'f': 0.7176470588235295}, 'Marca': {'p': 0.9069767441860465, 'r': 0.8666666666666667, 'f': 0.8863636363636364}, 'Tipo': {'p': 0.9, 'r': 0.9310344827586207, 'f': 0.9152542372881356}, 'Cor': {'p': 0.9272727272727272, 'r': 0.8947368421052632, 'f': 0.9107142857142856}, 'Item adicional': {'p': 1.0, 'r': 0.3333333333333333, 'f': 0.5}, 'Tamanho': {'p': 0.3333333333333333, 'r': 0.1111111111111111, 'f': 0.16666666666666666}, 'Tensão': {'p': 0.25, 'r': 0.5, 'f': 0.3333333333333333}}


In [29]:
# Run the fine-tuned pipeline on a single sample product title
doc = nlp("Fone JBL TWS Wave Buds, Auricular, Bluetooth, Preto - JBLWBUDSBLK")

# Show each recognized entity as: text, start offset, end offset, label
for ent in doc.ents:
    fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*fields)

Fone JBL 0 8 Categoria
TWS Wave Buds 9 22 Modelo
Auricular 24 33 Tipo
Bluetooth 35 44 Conectividade
Preto 46 51 Cor
