In [2]:
import spacy
from spacy.training import Example
from spacy.training import offsets_to_biluo_tags
import random
import pandas as pd
import json

In [5]:
def process_data(file_path):
    df = pd.read_csv(file_path, delimiter=';', encoding='utf-8')

    spacy_data = []
    count = 0

    for _, row in df.iterrows():
        sentence = row['text']
        entities = json.loads(row['entities'])

        ner_entities = []
        textcat_labels = {}

        textcat_labels['GROUPTASK'] = ""
        textcat_labels['PRIORITY'] = ""
        textcat_labels['STATUS'] = ""

        for entity in entities:
            start = int(entity['start'])
            end = int(entity['end'])
            label = entity['label'].strip()
            value = entity.get('value', "")  

            if start == 0 and end == 0:
                if label in textcat_labels:
                    textcat_labels[label] = value 
            else:
                ner_entities.append((start, end, label))

        annotation = {
            "entities": ner_entities,  
            "cats": {  
                f"{label}_{value}": 1 if value else 0 for label, value in textcat_labels.items()
            }
        }

        count += 1
        spacy_data.append((sentence, annotation))

    return spacy_data, count

# Process the data
train_data, count = process_data('../../../data_lake/task_detection/spacy_dataset.csv')

# Print the first few examples and count of processed data
for data in train_data[:5]:
    print(data)
print(f"Total examples: {count}")

('Please set a task in the Artemis project, about creating a user feedback system. This is an important task but not urgent.', {'entities': [(25, 32, 'PROJECT'), (48, 79, 'TASK')], 'cats': {'GROUPTASK_User service': 1, 'PRIORITY_MEDIUM': 1, 'STATUS_PENDING': 1}})
('Create task to verify database integrity after recent updates. This is a star priority.', {'entities': [(15, 40, 'TASK')], 'cats': {'GROUPTASK_': 0, 'PRIORITY_STAR': 1, 'STATUS_IN_PROGRESS': 1}})
('Add task to set up automated testing for backend services. This should be done by the end of the week.', {'entities': [(12, 57, 'TASK')], 'cats': {'GROUPTASK_Default': 1, 'PRIORITY_MEDIUM': 1, 'STATUS_TO_DO': 1}})
('Generate task to design a new user interface for the dashboard in Project Gaia. This is a high priority.', {'entities': [(74, 78, 'PROJECT'), (17, 62, 'TASK')], 'cats': {'GROUPTASK_Client GUI': 1, 'PRIORITY_HIGH': 1, 'STATUS_TO_DO': 1}})
('Please set up a task for learning a new language. No rush, just a long-term goal

In [6]:
import spacy
from spacy.training import Example
import random

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
textcat = nlp.add_pipe("textcat_multilabel", last=True)

for _, annotations in train_data:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])  
    for cat in annotations["cats"]:
        textcat.add_label(cat)

optimizer = nlp.begin_training()

for i in range(20):
    random.shuffle(train_data)
    losses = {}
    for text, annotations in train_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], losses=losses, drop=0.5, sgd=optimizer)

    print(f"Epoch {i + 1}: Losses: {losses}")

nlp.to_disk("spacy_model_v2")



Epoch 1: Losses: {'ner': 1115.930023634683, 'textcat_multilabel': 1.6346015488893642}
Epoch 2: Losses: {'ner': 496.73723774769115, 'textcat_multilabel': 0.1762707298595778}
Epoch 3: Losses: {'ner': 336.0562123121512, 'textcat_multilabel': 0.04104975613383677}
Epoch 4: Losses: {'ner': 281.70593125308613, 'textcat_multilabel': 0.006272310473462661}
Epoch 5: Losses: {'ner': 248.94516064022727, 'textcat_multilabel': 0.002309878889170891}
Epoch 6: Losses: {'ner': 189.383872291942, 'textcat_multilabel': 0.0013084855720717492}
Epoch 7: Losses: {'ner': 154.37318064993406, 'textcat_multilabel': 0.004812918466078092}
Epoch 8: Losses: {'ner': 169.2615321074054, 'textcat_multilabel': 0.000780904955793249}
Epoch 9: Losses: {'ner': 136.39673239960416, 'textcat_multilabel': 0.00029238772763950927}
Epoch 10: Losses: {'ner': 136.01975117628615, 'textcat_multilabel': 0.00037582925267298783}
Epoch 11: Losses: {'ner': 139.7526450939215, 'textcat_multilabel': 0.0003688929898848503}
Epoch 12: Losses: {'ner'

In [7]:
nlp2 = spacy.load("spacy_model_v2")

test_text = "Please set up a task for testing the system."
doc = nlp2(test_text)

print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])
print("Categories:", doc.cats)

Entities: [('testing the system', 'TASK')]
Categories: {'GROUPTASK_User service': 0.9993239641189575, 'PRIORITY_MEDIUM': 0.9998352527618408, 'STATUS_PENDING': 0.9999353885650635, 'GROUPTASK_': 0.00028058310272172093, 'PRIORITY_STAR': 0.999931812286377, 'STATUS_IN_PROGRESS': 0.9998117089271545, 'GROUPTASK_Default': 0.9999080896377563, 'STATUS_TO_DO': 0.9999778270721436, 'GROUPTASK_Client GUI': 0.9998099207878113, 'PRIORITY_HIGH': 0.9998894929885864, 'PRIORITY_LOW': 0.9998779296875, 'GROUPTASK_AI Models': 0.9998167157173157, 'GROUPTASK_Do Chores': 0.9997547268867493, 'GROUPTASK_Task Manager': 0.998389482498169, 'GROUPTASK_Frontend': 0.9994639754295349, 'GROUPTASK_Data Processing': 0.9997314810752869, 'GROUPTASK_UI Module': 0.9981817007064819, 'GROUPTASK_GaiaBot': 0.9980140924453735, 'GROUPTASK_Deployment': 0.9996824264526367, 'GROUPTASK_Authentication Module': 0.9993554949760437, 'GROUPTASK_Middleware Loader': 0.9996144771575928, 'GROUPTASK_Auth Service': 0.9999626874923706, 'GROUPTASK_U

In [10]:
def load_and_predict(model_path, text):
    nlp = spacy.load(model_path)
    doc = nlp(text)
    
    entities = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    categories = {cat: score for cat, score in doc.cats.items()}
    
    return entities, categories

model_path = "spacy_model_v2" 
text = "Add a task to enhance the existing AI models with new algorithms for Project Gaia. This is a high priority."

entities, categories = load_and_predict(model_path, text)

print("Entities:", entities)
print("Categories:", categories)

Entities: [('enhance the existing AI models with new algorithms', 14, 64, 'TASK'), ('Gaia', 77, 81, 'PROJECT')]
Categories: {'GROUPTASK_User service': 0.9985288381576538, 'PRIORITY_MEDIUM': 0.9997768998146057, 'STATUS_PENDING': 0.9999303817749023, 'GROUPTASK_': 0.0002279213658766821, 'PRIORITY_STAR': 0.9999634027481079, 'STATUS_IN_PROGRESS': 0.9999028444290161, 'GROUPTASK_Default': 0.9998905658721924, 'STATUS_TO_DO': 0.9999815225601196, 'GROUPTASK_Client GUI': 0.9998151659965515, 'PRIORITY_HIGH': 0.999950647354126, 'PRIORITY_LOW': 0.99989914894104, 'GROUPTASK_AI Models': 0.9998140931129456, 'GROUPTASK_Do Chores': 0.9997325539588928, 'GROUPTASK_Task Manager': 0.9993065595626831, 'GROUPTASK_Frontend': 0.9994542002677917, 'GROUPTASK_Data Processing': 0.9995761513710022, 'GROUPTASK_UI Module': 0.9990614056587219, 'GROUPTASK_GaiaBot': 0.9982213377952576, 'GROUPTASK_Deployment': 0.9997441172599792, 'GROUPTASK_Authentication Module': 0.99957674741745, 'GROUPTASK_Middleware Loader': 0.99983954