In [42]:
import spacy
from spacy.training import Example
from spacy.training import offsets_to_biluo_tags
import random
import pandas as pd
import json

In [43]:
def process_data(file_path):
    df = pd.read_csv(file_path, delimiter=';', encoding='utf-8')

    spacy_data = []
    count = 0

    for _, row in df.iterrows():
        sentence = row['text']
        entities = json.loads(row['entities'])

        ner_entities = []
        textcat_labels = {}

        textcat_labels['PRIORITY'] = ""
        textcat_labels['STATUS'] = ""

        for entity in entities:
            start = int(entity['start'])
            end = int(entity['end'])
            label = entity['label'].strip()
            value = entity.get('value', "")  

            if start == 0 and end == 0:
                if label in textcat_labels:
                    textcat_labels[label] = value 
            else:
                ner_entities.append((start, end, label))

        annotation = {
            "entities": ner_entities,  
            "cats": {  
                f"{label}_{value}": 1 if value else 0 for label, value in textcat_labels.items()
            }
        }

        count += 1
        spacy_data.append((sentence, annotation))

    return spacy_data, count

# Process the data
train_data, count = process_data('../../../data_lake/task_detection/spacy_dataset.csv')

# Print the first few examples and count of processed data
for data in train_data[:5]:
    print(data)
print(f"Total examples: {count}")

('Please set a task in the Artemis project, about creating a user feedback system. This is an important task but not urgent.', {'entities': [(25, 32, 'PROJECT'), (48, 79, 'TASK')], 'cats': {'PRIORITY_MEDIUM': 1, 'STATUS_PENDING': 1}})
('Create task to verify database integrity after recent updates. This is a star priority.', {'entities': [(15, 40, 'TASK')], 'cats': {'PRIORITY_STAR': 1, 'STATUS_IN_PROGRESS': 1}})
('Add task to set up automated testing for backend services. This should be done by the end of the week.', {'entities': [(12, 57, 'TASK')], 'cats': {'PRIORITY_MEDIUM': 1, 'STATUS_TO_DO': 1}})
('Generate task to design a new user interface for the dashboard in Project Gaia. This is a high priority.', {'entities': [(74, 78, 'PROJECT'), (17, 62, 'TASK')], 'cats': {'PRIORITY_HIGH': 1, 'STATUS_TO_DO': 1}})
('Please set up a task for learning a new language. No rush, just a long-term goal.', {'entities': [(25, 48, 'TASK')], 'cats': {'PRIORITY_LOW': 1, 'STATUS_PENDING': 1}})
Total exa

In [47]:
import spacy
from spacy.training import Example
import random

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
textcat = nlp.add_pipe("textcat_multilabel", last=True)

for _, annotations in train_data:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])  
    for cat in annotations["cats"]:
        textcat.add_label(cat)

optimizer = nlp.begin_training()

for i in range(20):
    random.shuffle(train_data)
    losses = {}
    for text, annotations in train_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], losses=losses, drop=0.2, sgd=optimizer)

    print(f"Epoch {i + 1}: Losses: {losses}")

nlp.to_disk("spacy_model_v2")

Epoch 1: Losses: {'ner': 755.7973169488483, 'textcat_multilabel': 2.2696111658036546}
Epoch 2: Losses: {'ner': 201.97208759183647, 'textcat_multilabel': 0.09191292275275575}
Epoch 3: Losses: {'ner': 137.51958458608257, 'textcat_multilabel': 0.003470079510181634}
Epoch 4: Losses: {'ner': 91.53255220237193, 'textcat_multilabel': 0.0017052617983877028}
Epoch 5: Losses: {'ner': 67.49376039872207, 'textcat_multilabel': 0.0010146842869471762}
Epoch 6: Losses: {'ner': 96.82468198948, 'textcat_multilabel': 0.0010588860607567696}
Epoch 7: Losses: {'ner': 76.08149513440544, 'textcat_multilabel': 0.0007905326695947235}
Epoch 8: Losses: {'ner': 63.49648154255618, 'textcat_multilabel': 0.00048349265901971583}
Epoch 9: Losses: {'ner': 62.53243636484623, 'textcat_multilabel': 0.0002756125669575171}
Epoch 10: Losses: {'ner': 59.34816740617382, 'textcat_multilabel': 0.00048213797337115505}
Epoch 11: Losses: {'ner': 43.14812901160647, 'textcat_multilabel': 0.00021596597731210476}
Epoch 12: Losses: {'ner

In [48]:
nlp2 = spacy.load("spacy_model_v2")

test_text = "Please set up a task for testing the system."
doc = nlp2(test_text)

print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])
print("Categories:", doc.cats)

Entities: [('testing the system', 'TASK')]
Categories: {'PRIORITY_HIGH': 0.9998562335968018, 'STATUS_TO_DO': 0.9998817443847656, 'PRIORITY_MEDIUM': 0.9994654059410095, 'STATUS_IN_PROGRESS': 0.9997088313102722, 'PRIORITY_STAR': 0.9982501864433289, 'PRIORITY_LOW': 0.9989975094795227, 'STATUS_PENDING': 0.999580442905426, 'STATUS_DONE': 0.9763979911804199, 'PRIORITY_': 0.002389561152085662}


In [53]:
def load_and_predict(model_path, text):
    nlp = spacy.load(model_path)
    doc = nlp(text)
    
    entities = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    categories = {cat: score for cat, score in doc.cats.items()}
    
    return entities, categories

model_path = "spacy_model_v2" 
text = "Create a task to implement feature engineering techniques for Golde Model in Project Golde. This should be done within the month."

entities, categories = load_and_predict(model_path, text)

print("Entities:", entities)
print("Categories:", categories)

Entities: [('implement feature engineering techniques', 17, 57, 'TASK'), ('Golde', 85, 90, 'PROJECT')]
Categories: {'PRIORITY_HIGH': 0.9998356103897095, 'STATUS_TO_DO': 0.9999051094055176, 'PRIORITY_MEDIUM': 0.9998780488967896, 'STATUS_IN_PROGRESS': 0.9997349381446838, 'PRIORITY_STAR': 0.9960325360298157, 'PRIORITY_LOW': 0.9989074468612671, 'STATUS_PENDING': 0.9996355772018433, 'STATUS_DONE': 0.986890971660614, 'PRIORITY_': 0.004381465259939432}
