In [1]:
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
import random
import pandas as pd
import json

In [6]:
# Load dataset
def process_csv_for_spacy(file_path):
    df = pd.read_csv(file_path)

    spacy_data = []
    count = 0

    for _, row in df.iterrows():
        sentence = row['text']
        entities = json.loads(row['entities'])

        # Tạo các danh sách cho NER và textcat
        ner_entities = []
        textcat_labels = {}

        for entity in entities:
            start = int(entity['start'])
            end = int(entity['end'])
            label = entity['label']

            if start == 0 and end == 0:
                # Đây là textcat
                textcat_labels[label] = 1
            else:
                # Đây là NER
                ner_entities.append((start, end, label))

        # Tạo annotation phù hợp với spaCy
        annotation = {
            "entities": ner_entities,
            "cats": textcat_labels
        }
        count 

        spacy_data.append((sentence, annotation))

    return spacy_data, count

train_data = process_csv_for_spacy('spacy_dataset.csv')

count = 0
for data in train_data[:5]:
    count += 1
    print(data)

print(count)

('Please set a task in the Artemis project, about creating a user feedback system. This is an important task but not urgent.', {'entities': [(25, 32, 'PROJECT'), (48, 79, 'TASK')], 'cats': {'GROUPTASK': 1, 'MEDIUM': 1, 'PENDING': 1}})
('Create task to verify database integrity after recent updates. This is a star priority.', {'entities': [(15, 40, 'TASK')], 'cats': {'STAR': 1, 'IN_PROGRESS': 1, 'STARTDATE': 1}})
('Add task to set up automated testing for backend services. This should be done by the end of the week.', {'entities': [(12, 57, 'TASK')], 'cats': {'GROUPTASK': 1, 'MEDIUM': 1, 'UNKNOWN': 1, 'STARTDATE': 1, 'DEADLINE': 1}})
('Generate task to design a new user interface for the dashboard in Project Gaia. This is a high priority.', {'entities': [(74, 78, 'PROJECT'), (17, 62, 'TASK')], 'cats': {'GROUPTASK': 1, 'UNKNOWN': 1, 'STARTDATE': 1}})
('Please set up a task for learning a new language. No rush, just a long-term goal.', {'entities': [(25, 48, 'TASK')], 'cats': {'LOW': 1, '

In [8]:
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
textcat = nlp.add_pipe("textcat_multilabel", last=True)

# Thêm các label cho NER
for _, annotations in train_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Thêm các label cho Text Classification
for _, annotations in train_data:
    for cat in annotations.get("cats"):
        textcat.add_label(cat)

# Bắt đầu huấn luyện mô hình
optimizer = nlp.begin_training()

for i in range(20):  # Số epoch huấn luyện
    random.shuffle(train_data)
    losses = {}
    for text, annotations in train_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], losses=losses, drop=0.5, sgd=optimizer)
    print(f"Epoch {i + 1}: Losses: {losses}")

# Lưu mô hình đã huấn luyện
nlp.to_disk("spacy_model")

# Kiểm tra mô hình đã huấn luyện
nlp2 = spacy.load("spacy_model")

test_text = "Please set up a task for testing the system."
doc = nlp2(test_text)

print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])
print("Categories:", doc.cats)



Epoch 1: Losses: {'ner': 872.4235991785068, 'textcat_multilabel': 5.5535726095965785}
Epoch 2: Losses: {'ner': 412.24660981672605, 'textcat_multilabel': 0.02962992267597686}
Epoch 3: Losses: {'ner': 309.24244574087845, 'textcat_multilabel': 0.0059368333045734245}
Epoch 4: Losses: {'ner': 263.0871859272844, 'textcat_multilabel': 0.004168383358010917}
Epoch 5: Losses: {'ner': 252.3807735487276, 'textcat_multilabel': 0.003237632309895433}
Epoch 6: Losses: {'ner': 202.1341656744994, 'textcat_multilabel': 0.0008683460993667846}
Epoch 7: Losses: {'ner': 194.03754816285507, 'textcat_multilabel': 0.000934442976001093}
Epoch 8: Losses: {'ner': 212.22478858095252, 'textcat_multilabel': 0.0009091725598251166}
Epoch 9: Losses: {'ner': 182.70035356569971, 'textcat_multilabel': 0.0009626601623426642}
Epoch 10: Losses: {'ner': 154.81344538023305, 'textcat_multilabel': 0.00024044712031215908}
Epoch 11: Losses: {'ner': 150.18701716484358, 'textcat_multilabel': 0.000483531359861223}
Epoch 12: Losses: {'

In [10]:
def load_and_predict(model_path, text):
    # Load the trained spaCy model
    nlp = spacy.load(model_path)
    
    # Process the input text
    doc = nlp(text)
    
    # Extract entities
    entities = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    
    # Extract text categories
    categories = {cat: score for cat, score in doc.cats.items()}
    
    return entities, categories

# Example usage
model_path = "spacy_model"  # Path to your saved model
text = "Create task for building a new feature to monitor AI model performance in Project Gaia. This is medium priority."

entities, categories = load_and_predict(model_path, text)

print("Entities:", entities)
print("Categories:", categories)

Entities: [('building a new feature to monitor AI model performance', 16, 70, 'TASK'), ('Gaia', 82, 86, 'PROJECT')]
Categories: {'MEDIUM': 0.999998927116394, 'UNKNOWN': 0.9999939203262329, 'STARTDATE': 0.9999665021896362, 'DEADLINE': 0.9999765157699585, 'GROUPTASK': 0.9999827146530151, 'STAR': 0.9999487400054932, 'IN_PROGRESS': 0.9999769926071167, 'DURATION': 0.999996542930603, 'PENDING': 0.9999865293502808, 'LOW': 0.9999827146530151}
