In [1]:
import spacy
from spacy.training import Example
from spacy.training import offsets_to_biluo_tags
import random
import pandas as pd
import json

In [18]:
def process_data(file_path, labels=("GROUPTASK", "PRIORITY", "STATUS")):
    """Convert the annotated CSV export into spaCy-style training tuples.

    The CSV is read with ``;`` as delimiter and must provide two columns:
    ``text`` (the sentence) and ``entities`` (a JSON list of objects with
    ``start``, ``end`` and ``label`` keys).

    An entity whose ``start`` and ``end`` are both 0 is not a span: it marks
    the sentence as positive for the classification label named by ``label``.
    Every other entity becomes a ``(start, end, label)`` NER offset tuple.

    Args:
        file_path: Path of the CSV file to read.
        labels: Classification labels to emit under ``"cats"``. Each one is
            initialised to 0 and switched to 1 when a zero-offset entity with
            that label is present. Zero-offset entities whose label is not
            listed here are ignored.

    Returns:
        A ``(samples, count)`` tuple. ``samples`` is a list of
        ``(sentence, {"entities": [...], "cats": {...}})`` pairs and ``count``
        is the number of samples.
    """
    df = pd.read_csv(file_path, delimiter=';', encoding='utf-8')

    spacy_data = []

    for sentence, raw_entities in zip(df['text'], df['entities']):
        # Every classification label starts at 0 for each sentence.
        textcat_labels = dict.fromkeys(labels, 0)
        ner_entities = []

        for entity in json.loads(raw_entities):
            start = int(entity['start'])
            end = int(entity['end'])
            label = entity['label'].strip()  # drop stray surrounding whitespace

            if start == 0 and end == 0:
                # Zero offsets are the sentinel for a sentence-level label.
                if label in textcat_labels:
                    textcat_labels[label] = 1
            else:
                # Real offsets: keep the entity as an NER span.
                ner_entities.append((start, end, label))

        annotation = {
            "entities": ner_entities,
            "cats": textcat_labels
        }
        spacy_data.append((sentence, annotation))

    return spacy_data, len(spacy_data)

# Parse the annotated CSV into (text, annotation) pairs plus the sample count.
train_data, count = process_data('../../../data_lake/task_detection/spacy_dataset.csv')

# Print the first five samples as a quick sanity check of the parsed annotations.
for data in train_data[:5]:
    print(data)

# Number of samples returned by process_data.
print(count)

('Please set a task in the Artemis project, about creating a user feedback system. This is an important task but not urgent.', {'entities': [(25, 32, 'PROJECT'), (48, 79, 'TASK')], 'cats': {'GROUPTASK': 1, 'PRIORITY': 1, 'STATUS': 1}})
('Create task to verify database integrity after recent updates. This is a star priority.', {'entities': [(15, 40, 'TASK')], 'cats': {'GROUPTASK': 0, 'PRIORITY': 1, 'STATUS': 1}})
('Add task to set up automated testing for backend services. This should be done by the end of the week.', {'entities': [(12, 57, 'TASK')], 'cats': {'GROUPTASK': 1, 'PRIORITY': 1, 'STATUS': 1}})
('Generate task to design a new user interface for the dashboard in Project Gaia. This is a high priority.', {'entities': [(74, 78, 'PROJECT'), (17, 62, 'TASK')], 'cats': {'GROUPTASK': 1, 'PRIORITY': 1, 'STATUS': 1}})
('Please set up a task for learning a new language. No rush, just a long-term goal.', {'entities': [(25, 48, 'TASK')], 'cats': {'GROUPTASK': 0, 'PRIORITY': 1, 'STATUS': 1}

In [19]:
import spacy
from spacy.training import Example
import random

# Start from a blank English pipeline and attach the two trainable components:
# an entity recognizer and a multi-label text classifier.
nlp = spacy.blank("en")

ner = nlp.add_pipe("ner")
textcat = nlp.add_pipe("textcat_multilabel", last=True)

# Register every label found in the training annotations in a single pass.
for _, annotations in train_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])
    for cat in annotations.get("cats"):
        textcat.add_label(cat)

# `Language.begin_training` is deprecated in spaCy v3; `initialize` is its
# replacement and likewise returns the optimizer used by `nlp.update`.
optimizer = nlp.initialize()

def remove_overlapping_spans(spans):
    """Drop overlapping spans and return the ones that do not overlap.

    Spans are visited in ascending ``(start, end)`` order; a span is kept
    only if it begins at or after the end of the previously kept span.
    """
    kept = []
    last_end = None
    for candidate in sorted(spans, key=lambda item: (item.start, item.end)):
        # Skip anything that begins before the last kept span has ended.
        if last_end is not None and candidate.start < last_end:
            continue
        kept.append(candidate)
        last_end = candidate.end
    return kept

# Train for 20 epochs, updating the pipeline one example at a time.
for i in range(20):
    random.shuffle(train_data)
    losses = {}

    for text, annotations in train_data:
        doc = nlp.make_doc(text)

        # Keep only entity offsets that line up with token boundaries;
        # char_span returns None for offsets that do not.
        spans = []
        for start, end, label in annotations.get("entities"):
            span = doc.char_span(start, end, label=label)
            if span is not None:
                spans.append(span)

        filtered_spans = remove_overlapping_spans(spans)
        adjusted_entities = [(span.start_char, span.end_char, span.label_) for span in filtered_spans]

        # Build a per-example copy of the annotations instead of writing the
        # filtered entities back into the shared train_data dictionaries.
        gold = {**annotations, "entities": adjusted_entities}

        # The predicted doc is deliberately left without entities: the gold
        # spans belong only in the reference annotations given to from_dict.
        example = Example.from_dict(doc, gold)

        nlp.update([example], losses=losses, drop=0.5, sgd=optimizer)

    print(f"Epoch {i + 1}: Losses: {losses}")

# Save the trained pipeline to disk
nlp.to_disk("spacy_model")

# Load the saved pipeline back from disk
nlp2 = spacy.load("spacy_model")

# Run the reloaded pipeline on a sample sentence
test_text = "Please set up a task for testing the system."
doc = nlp2(test_text)

# Print the predicted entities and category scores
print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])
print("Categories:", doc.cats)

Epoch 1: Losses: {'ner': 1200.3737724782284, 'textcat_multilabel': 31.970471242243946}
Epoch 2: Losses: {'ner': 497.05233650988686, 'textcat_multilabel': 18.928745094809845}
Epoch 3: Losses: {'ner': 299.5201172000898, 'textcat_multilabel': 19.195366537202826}
Epoch 4: Losses: {'ner': 257.43427713893846, 'textcat_multilabel': 17.474917190208835}
Epoch 5: Losses: {'ner': 237.88137888490337, 'textcat_multilabel': 15.791728947273093}
Epoch 6: Losses: {'ner': 208.02789589948003, 'textcat_multilabel': 14.415980626872852}
Epoch 7: Losses: {'ner': 153.80672117568776, 'textcat_multilabel': 13.97221120259536}
Epoch 8: Losses: {'ner': 125.901503774315, 'textcat_multilabel': 10.778019184322222}
Epoch 9: Losses: {'ner': 138.1102500159863, 'textcat_multilabel': 11.574199489351109}
Epoch 10: Losses: {'ner': 136.05897977231734, 'textcat_multilabel': 8.921476046618114}
Epoch 11: Losses: {'ner': 103.71770558579071, 'textcat_multilabel': 9.146194909337563}
Epoch 12: Losses: {'ner': 85.1629545451122, 'tex

In [20]:
def load_and_predict(model_path, text):
    """Load a saved spaCy pipeline and run it on a single text.

    Returns a pair ``(entities, categories)``: ``entities`` is a list of
    ``(text, start_char, end_char, label)`` tuples for the predicted
    entities, and ``categories`` maps each category name to its score.
    """
    pipeline = spacy.load(model_path)
    parsed = pipeline(text)

    entities = []
    for ent in parsed.ents:
        entities.append((ent.text, ent.start_char, ent.end_char, ent.label_))

    # Return a fresh dict so the caller does not share the doc's own mapping.
    categories = dict(parsed.cats)

    return entities, categories

# Directory the trained pipeline was saved to.
model_path = "spacy_model" 
text = "Add a task to enhance the existing AI models with new algorithms for Project Gaia. This is a high priority."

# Reload the saved pipeline and run it on the sample sentence.
entities, categories = load_and_predict(model_path, text)

# Show the predicted entity tuples and the per-category scores.
print("Entities:", entities)
print("Categories:", categories)

Entities: [('enhance the existing AI models with new algorithms', 14, 64, 'TASK'), ('Gaia', 77, 81, 'PROJECT')]
Categories: {'GROUPTASK': 0.9999897480010986, 'PRIORITY': 0.9999779462814331, 'STATUS': 0.9999972581863403}
