In [1]:
import spacy
from spacy.training import Example
from spacy.training import offsets_to_biluo_tags
import random
import pandas as pd
import json

In [2]:
# Load dataset
def process_csv_for_spacy(file_path):
    df = pd.read_csv(file_path, delimiter=';', encoding='utf-8')

    spacy_data = []
    count = 0

    for _, row in df.iterrows():
        sentence = row['text']
        entities = json.loads(row['entities'])

        ner_entities = []
        textcat_labels = {}

        for entity in entities:
            start = int(entity['start'])
            end = int(entity['end'])
            label = entity['label']

            if start == 0 and end == 0:
                textcat_labels[label] = 1
            else:
                ner_entities.append((start, end, label))

        annotation = {
            "entities": ner_entities,
            "cats": textcat_labels
        }
        count+=1 

        spacy_data.append((sentence, annotation))

    return spacy_data, count

train_data, count = process_csv_for_spacy('spacy_dataset.csv')

for data in train_data[:5]:
    print(data)

print(count)

('Please set a task in the Artemis project, about creating a user feedback system. This is an important task but not urgent.', {'entities': [(25, 32, 'PROJECT'), (48, 79, 'TASK')], 'cats': {'MEDIUM': 1, 'PENDING': 1}})
('Create task to verify database integrity after recent updates. This is a star priority.', {'entities': [(15, 40, 'TASK')], 'cats': {'STAR': 1, 'IN_PROGRESS': 1, 'STARTDATE': 1}})
('Add task to set up automated testing for backend services. This should be done by the end of the week.', {'entities': [(12, 57, 'TASK')], 'cats': {'MEDIUM': 1, 'TO_DO': 1, 'STARTDATE': 1, 'DEADLINE': 1}})
('Generate task to design a new user interface for the dashboard in Project Gaia. This is a high priority.', {'entities': [(74, 78, 'PROJECT'), (17, 62, 'TASK')], 'cats': {'HIGH': 1, 'TO_DO': 1, 'STARTDATE': 1}})
('Please set up a task for learning a new language. No rush, just a long-term goal.', {'entities': [(25, 48, 'TASK')], 'cats': {'LOW': 1, 'PENDING': 1}})
281


In [10]:
import spacy
from spacy.training import Example
import random

nlp = spacy.blank("en")

ner = nlp.add_pipe("ner")
textcat = nlp.add_pipe("textcat_multilabel", last=True)

for _, annotations in train_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

for _, annotations in train_data:
    for cat in annotations.get("cats"):
        textcat.add_label(cat)

optimizer = nlp.begin_training()

def remove_overlapping_spans(spans):
    """Xóa các spans chồng chéo, chỉ giữ lại spans không chồng chéo."""
    spans = sorted(spans, key=lambda span: (span.start, span.end))
    resolved_spans = []
    for span in spans:
        if not resolved_spans or (span.start >= resolved_spans[-1].end):
            resolved_spans.append(span)
    return resolved_spans

for i in range(20):
    random.shuffle(train_data)
    losses = {}

    for text, annotations in train_data:
        doc = nlp.make_doc(text)

        spans = []
        for start, end, label in annotations.get("entities"):
            span = doc.char_span(start, end, label=label)
            if span is not None:
                spans.append(span)

        filtered_spans = remove_overlapping_spans(spans)

        doc.set_ents(filtered_spans)
        
        adjusted_entities = [(span.start_char, span.end_char, span.label_) for span in filtered_spans]
        annotations["entities"] = adjusted_entities

        example = Example.from_dict(doc, annotations)

        nlp.update([example], losses=losses, drop=0.5, sgd=optimizer)

    print(f"Epoch {i + 1}: Losses: {losses}")

# Lưu mô hình
nlp.to_disk("spacy_model")

# Tải mô hình đã lưu
nlp2 = spacy.load("spacy_model")

# Kiểm tra với một đoạn văn bản mẫu
test_text = "Please set up a task for testing the system."
doc = nlp2(test_text)

# In ra các entity và category
print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])
print("Categories:", doc.cats)

Epoch 1: Losses: {'ner': 1200.6076548720707, 'textcat_multilabel': 3.6425167498740088}
Epoch 2: Losses: {'ner': 556.6258094420717, 'textcat_multilabel': 0.1033900003261892}
Epoch 3: Losses: {'ner': 428.44499754892905, 'textcat_multilabel': 0.008060696376010722}
Epoch 4: Losses: {'ner': 367.5084943176672, 'textcat_multilabel': 0.003836877783453027}
Epoch 5: Losses: {'ner': 332.5015584380442, 'textcat_multilabel': 0.003300988997435257}
Epoch 6: Losses: {'ner': 328.9625192346016, 'textcat_multilabel': 0.0038896064520438697}
Epoch 7: Losses: {'ner': 272.09610167129955, 'textcat_multilabel': 0.0006893763485115306}
Epoch 8: Losses: {'ner': 259.2377571409092, 'textcat_multilabel': 0.0007661981225302157}
Epoch 9: Losses: {'ner': 247.04025391528825, 'textcat_multilabel': 0.0008610059484547672}
Epoch 10: Losses: {'ner': 254.231976384895, 'textcat_multilabel': 0.0010187294291213326}
Epoch 11: Losses: {'ner': 242.15162642369395, 'textcat_multilabel': 0.00019521612113281538}
Epoch 12: Losses: {'ner

In [30]:
def load_and_predict(model_path, text):
    # Load the trained spaCy model
    nlp = spacy.load(model_path)
    doc = nlp(text)
    
    entities = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    categories = {cat: score for cat, score in doc.cats.items()}
    
    return entities, categories

model_path = "spacy_model" 
text = "Add a task to enhance the existing AI models with new algorithms for Project Gaia. This is a high priority."

entities, categories = load_and_predict(model_path, text)

print("Entities:", entities)
print("Categories:", categories)

Entities: [('enhance the existing AI models', 14, 44, 'TASK'), ('Gaia', 77, 81, 'PROJECT')]
Categories: {'LOW': 0.9999922513961792, 'PENDING': 0.9999775886535645, 'MEDIUM': 0.9999822378158569, 'TO_DO': 0.999994158744812, 'STARTDATE': 0.9999948740005493, 'DEADLINE': 0.9999862909317017, 'HIGH': 0.9999912977218628, 'STAR': 0.9999853372573853, 'IN_PROGRESS': 0.9999909400939941, 'DONE': 0.9998431205749512}
