In [None]:
import spacy
from spacy.training import Example
from spacy.training import offsets_to_biluo_tags
import random
import pandas as pd
import json

In [None]:
# Load dataset
def process_csv_for_spacy(file_path):
    """Convert an annotated CSV file into spaCy-style training tuples.

    The CSV must provide a ``text`` column and an ``entities`` column. Each
    ``entities`` cell holds a JSON list of objects with ``start``, ``end`` and
    ``label`` keys. An entry whose ``start`` and ``end`` are both 0 is recorded
    as a text-classification label; every other entry is recorded as a
    ``(start, end, label)`` NER span.

    A row whose ``entities`` cell is empty (pandas reads it as NaN) produces a
    sample with no entities and no categories instead of raising ``TypeError``.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A tuple ``(spacy_data, count)``. ``spacy_data`` is a list, in file
        order, of ``(text, {"entities": [(start, end, label), ...],
        "cats": {label: 1, ...}})`` tuples, and ``count`` is the number of
        rows processed.
    """
    df = pd.read_csv(file_path)

    spacy_data = []

    # Iterating the two columns directly avoids building a Series per row,
    # which is what DataFrame.iterrows() does.
    for sentence, raw_entities in zip(df['text'], df['entities']):
        # json.loads() only accepts str/bytes; an empty CSV cell arrives as NaN.
        entities = [] if pd.isna(raw_entities) else json.loads(raw_entities)

        ner_entities = []
        textcat_labels = {}

        for entity in entities:
            start = int(entity['start'])
            end = int(entity['end'])
            label = entity['label']

            # A (0, 0) span is the dataset's marker for a document-level label.
            if start == 0 and end == 0:
                textcat_labels[label] = 1
            else:
                ner_entities.append((start, end, label))

        annotation = {
            "entities": ner_entities,
            "cats": textcat_labels
        }
        spacy_data.append((sentence, annotation))

    # One sample is emitted per row, so the row count is the list length.
    return spacy_data, len(spacy_data)

# Parse the annotated CSV into (text, annotation) tuples plus the row count.
train_data, count = process_csv_for_spacy('spacy_dataset.csv')

# Preview the first five samples to sanity-check the parsed annotations.
for data in train_data[:5]:
    print(data)

# Number of rows that were converted into training samples.
print(count)

In [None]:
# Build a blank English pipeline with an entity recognizer followed by a
# multi-label text classifier.
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
textcat = nlp.add_pipe("textcat_multilabel", last=True)

# Register every NER label and every text-classification label that occurs in
# the training data.
for _, annotations in train_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])
    for cat in annotations.get("cats"):
        textcat.add_label(cat)

# Build the Example objects once instead of once per epoch.
# train_data only lists the categories that apply to a text. For
# textcat_multilabel a label that is absent from "cats" counts as a missing
# value, not as a negative, so without explicit 0.0 entries the classifier
# would never be shown a negative example. Spell out every known label here.
train_examples = []
for text, annotations in train_data:
    positive_cats = annotations.get("cats")
    gold = {
        "entities": annotations.get("entities"),
        "cats": {
            label: float(positive_cats.get(label, 0)) for label in textcat.labels
        },
    }
    train_examples.append(Example.from_dict(nlp.make_doc(text), gold))

# Start training. nlp.initialize() is the spaCy v3 replacement for the
# deprecated begin_training(); passing the real examples lets the components
# set themselves up from actual data. It returns the optimizer.
optimizer = nlp.initialize(lambda: train_examples)

for i in range(20):  # Number of training epochs
    # Shuffle the private example list so train_data keeps its original order
    # for any other cell that reads it.
    random.shuffle(train_examples)
    losses = {}
    for example in train_examples:
        nlp.update([example], losses=losses, drop=0.5, sgd=optimizer)
    print(f"Epoch {i + 1}: Losses: {losses}")

# Save the trained pipeline.
nlp.to_disk("spacy_model")

# Reload the saved pipeline and check it on a sample sentence.
nlp2 = spacy.load("spacy_model")

test_text = "Please set up a task for testing the system."
doc = nlp2(test_text)

print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])
print("Categories:", doc.cats)

In [None]:
def load_and_predict(model_path, text):
    """Run a saved spaCy pipeline on a single text.

    Args:
        model_path: Path handed to ``spacy.load`` to restore the pipeline.
        text: The raw string to analyse.

    Returns:
        A tuple ``(entities, categories)``. ``entities`` is a list of
        ``(text, start_char, end_char, label)`` tuples, one per item in
        ``doc.ents``; ``categories`` is a new dict copied from ``doc.cats``.
    """
    pipeline = spacy.load(model_path)
    parsed = pipeline(text)

    # One tuple per recognised span, in the order the pipeline reports them.
    entities = []
    for span in parsed.ents:
        entities.append((span.text, span.start_char, span.end_char, span.label_))

    # Copy the scores so the caller does not hold a reference to doc.cats.
    categories = dict(parsed.cats.items())

    return entities, categories

# Example usage: run the saved pipeline on one sample sentence.
model_path = "spacy_model"  # Same path the training cell passes to nlp.to_disk
text = "Create task for building a new feature to monitor AI model performance in Project Gaia. This is medium priority."

# load_and_predict returns (entities, categories) for the given text.
entities, categories = load_and_predict(model_path, text)

# Show the extracted entity tuples and the per-label category scores.
print("Entities:", entities)
print("Categories:", categories)