In [1]:
import spacy
from spacy.training import Example
from spacy.training import offsets_to_biluo_tags
import random
import pandas as pd
import json

In [2]:
def process_data(file_path):
    df = pd.read_csv(file_path, delimiter=';', encoding='utf-8')


    spacy_data = []
    count = 0

    for _, row in df.iterrows():
        sentence = row['text']
        entities = json.loads(row['entities'])

        ner_entities = []
        textcat_labels = {}

        # Khởi tạo tất cả các nhãn classification là 0
        labels = ["GROUPTASK", "PRIORITY", "STATUS"]  # Các nhãn có thể có
        for label in labels:
            textcat_labels[label] = 0  # Khởi tạo giá trị là 0

        for entity in entities:
            start = int(entity['start'])
            end = int(entity['end'])
            label = entity['label'].strip()  # Xóa khoảng trắng nếu có

            if start == 0 and end == 0:
                # Nếu start và end đều bằng 0, gán giá trị nhãn classification là 1
                if label in textcat_labels:
                    textcat_labels[label] = 1
            else:
                # Nếu có start và end khác 0, thêm vào NER
                ner_entities.append((start, end, label))

        annotation = {
            "entities": ner_entities,
            "cats": textcat_labels
        }

        count += 1
        spacy_data.append((sentence, annotation))

    return spacy_data, count

train_data, count = process_data('spacy_dataset.csv')

for data in train_data[:5]:
    print(data)

print(count)

('Please set a task in the Artemis project, about creating a user feedback system. This is an important task but not urgent.', {'entities': [(14, 21, 'PROJECT'), (22, 49, 'TASK')], 'cats': {'GROUPTASK': 1, 'PRIORITY': 1, 'STATUS': 1}})
('Create task to verify database integrity after recent updates. This is a star priority.', {'entities': [(13, 32, 'TASK')], 'cats': {'GROUPTASK': 0, 'PRIORITY': 1, 'STATUS': 1}})
('Add task to set up automated testing for backend services. This should be done by the end of the week.', {'entities': [(10, 38, 'TASK')], 'cats': {'GROUPTASK': 1, 'PRIORITY': 1, 'STATUS': 1}})
('Generate task to design a new user interface for the dashboard in Project Gaia. This is a high priority.', {'entities': [(46, 50, 'PROJECT'), (15, 40, 'TASK')], 'cats': {'GROUPTASK': 1, 'PRIORITY': 1, 'STATUS': 1}})
('Please set up a task for learning a new language. No rush, just a long-term goal.', {'entities': [(15, 30, 'TASK')], 'cats': {'GROUPTASK': 0, 'PRIORITY': 1, 'STATUS': 1}

In [3]:
import spacy
from spacy.training import Example
import random

nlp = spacy.blank("en")

ner = nlp.add_pipe("ner")
textcat = nlp.add_pipe("textcat_multilabel", last=True)

for _, annotations in train_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])
    for cat in annotations.get("cats"):
        textcat.add_label(cat)

optimizer = nlp.begin_training()

def remove_overlapping_spans(spans):
    """Xóa các spans chồng chéo, chỉ giữ lại spans không chồng chéo."""
    spans = sorted(spans, key=lambda span: (span.start, span.end))
    resolved_spans = []
    for span in spans:
        if not resolved_spans or (span.start >= resolved_spans[-1].end):
            resolved_spans.append(span)
    return resolved_spans

for i in range(20):
    random.shuffle(train_data)
    losses = {}

    for text, annotations in train_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], losses=losses, drop=0.5, sgd=optimizer)

    print(f"Epoch {i + 1}: Losses: {losses}")

# Lưu mô hình
nlp.to_disk("spacy_model")

# Tải mô hình đã lưu
nlp2 = spacy.load("spacy_model")

# Kiểm tra với một đoạn văn bản mẫu
test_text = "Please set up a task for testing the system."
doc = nlp2(test_text)

# In ra các entity và category
print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])
print("Categories:", doc.cats)



ValueError: [E103] Trying to set conflicting doc.ents: '(45, 49, 'PROJECT')' and '(15, 48, 'TASK')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.

In [30]:
def load_and_predict(model_path, text):
    # Load the trained spaCy model
    nlp = spacy.load(model_path)
    doc = nlp(text)
    
    entities = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    categories = {cat: score for cat, score in doc.cats.items()}
    
    return entities, categories

model_path = "spacy_model" 
text = "Add a task to enhance the existing AI models with new algorithms for Project Gaia. This is a high priority."

entities, categories = load_and_predict(model_path, text)

print("Entities:", entities)
print("Categories:", categories)

Entities: [('enhance the existing AI models', 14, 44, 'TASK'), ('Gaia', 77, 81, 'PROJECT')]
Categories: {'LOW': 0.9999922513961792, 'PENDING': 0.9999775886535645, 'MEDIUM': 0.9999822378158569, 'TO_DO': 0.999994158744812, 'STARTDATE': 0.9999948740005493, 'DEADLINE': 0.9999862909317017, 'HIGH': 0.9999912977218628, 'STAR': 0.9999853372573853, 'IN_PROGRESS': 0.9999909400939941, 'DONE': 0.9998431205749512}
