In [51]:
import spacy
import random
import json
from spacy.training.example import Example

# Start from a blank English pipeline (nothing pretrained is loaded here)
nlp = spacy.blank("en")

# Attach a multi-label text classifier component to that pipeline
text_cat = nlp.add_pipe("textcat_multilabel")

In [52]:
# Load the annotated orders: one JSON object per line (JSONL)
with open("annotated_orders.jsonl", "r", encoding="utf-8") as file:
    # Skip blank lines (e.g. an extra empty line at the end of the file);
    # json.loads would otherwise raise a JSONDecodeError on them
    annotated_orders = [json.loads(line) for line in file if line.strip()]

# Load the label configuration from a regular JSON file
with open("label_config.json", "r", encoding="utf-8") as file:
    labels = json.load(file)

In [53]:
# Collect the text of every configured label, in config order
label_text = [entry['text'] for entry in labels]

In [54]:
# Replace each annotation's label id with the corresponding label text.
# The id -> text lookup is built once instead of re-scanning every label
# for every annotation; setdefault keeps the first entry for a repeated id.
label_text_by_id = {}
for label in labels:
    label_text_by_id.setdefault(label['id'], label['text'])

for order in annotated_orders:
    for annotation in order['annotations']:
        # A single lookup per annotation: a value with no matching id is
        # left untouched, and a replaced text is never matched against
        # another label's id.
        annotation['label'] = label_text_by_id.get(
            annotation['label'], annotation['label']
        )

In [55]:
# Register every label text as a category on the text classifier
for category in label_text:
    text_cat.add_label(category)

In [56]:
# Use a copy of the annotated orders as training data, so shuffling does
# not reorder annotated_orders itself
training_data = list(annotated_orders)

# Shuffle with a dedicated seeded generator. The global random seed is only
# set later, in the training cell, so an unseeded shuffle here would give a
# different example order (and different training results) on every run.
random.Random(1).shuffle(training_data)

In [57]:
# Template of category values: every known label text mapped to 0
temp = dict.fromkeys(label_text, 0)

In [58]:
# Build one spaCy Example per order in the training data
train_examples = []

for order in training_data:
    # Start from the all-zero template and mark each annotated label with 1.
    # This dict is deliberately not called `labels`: that name holds the
    # loaded label config, and rebinding it here would break re-running the
    # earlier cells that iterate over it.
    example_cats = temp.copy()
    for annotation in order['annotations']:
        example_cats[annotation['label']] = 1

    example = Example.from_dict(nlp.make_doc(order["text"]), {"cats": example_cats})
    train_examples.append(example)

In [59]:
# Train the text classification model
random.seed(1)  # for reproducibility
spacy.util.fix_random_seed(1)

# NOTE: only "n_iter" is read in this cell; "exclusive_classes" and
# "architecture" are not passed to spaCy anywhere here.
text_cat_cfg = {"exclusive_classes": False, "architecture": "simple_cnn"}
text_cat_cfg.update({"n_iter": 10})  # adjust the number of iterations as needed

# Language.initialize replaces the deprecated Language.begin_training in
# spaCy v3. Passing the real examples lets the component initialize its model
# from actual data; the returned optimizer is reused for every update.
optimizer = nlp.initialize(lambda: train_examples)

for epoch in range(text_cat_cfg["n_iter"]):
    random.shuffle(train_examples)
    losses = {}
    # Update the model one example at a time; losses accumulate over the epoch
    for example in train_examples:
        nlp.update([example], sgd=optimizer, drop=0.5, losses=losses)

    print(losses)


# Save the trained model
nlp.to_disk("text_cat_model")

{'textcat_multilabel': 5.8554859682917595}
{'textcat_multilabel': 3.3550589829683304}
{'textcat_multilabel': 3.5458345655351877}
{'textcat_multilabel': 3.179036063142121}
{'textcat_multilabel': 3.055956121534109}
{'textcat_multilabel': 2.707000511698425}
{'textcat_multilabel': 2.6853528963401914}
{'textcat_multilabel': 2.4442971888929605}
{'textcat_multilabel': 2.597523578442633}
{'textcat_multilabel': 2.374800975434482}


In [60]:
# Directory the trained pipeline was saved to
path_to_saved_model = 'text_cat_model'

# Load the saved pipeline back from disk
nlp = spacy.load(path_to_saved_model)

# Run the loaded pipeline on a sample sentence
text = "The landlord did unlawfully enter the property."
doc = nlp(text)

# Display the predicted category scores; without this the prediction is
# computed but never shown
doc.cats

In [62]:
# Show the names of the components in the loaded pipeline
nlp.pipe_names

['textcat_multilabel']