# In [None]:
import json
import os
import random

import spacy
from sklearn.model_selection import train_test_split
from spacy.training import Example

# Step 1: Generate a base config for an NER model as a string.
#
# Two details matter for the config to be parseable:
#   * comments sit on their own line -- an inline "# ..." after a value is not
#     stripped by configparser-style parsing and would become part of the value;
#   * the optimizer is declared only through the [training.optimizer] section --
#     a scalar `optimizer = ...` key inside [training] would conflict with it.
config = """
[paths]
train = null
dev = null

[system]
gpu_allocator = "pytorch"

[nlp]
# Replace with language code if available
lang = "xx"
pipeline = ["ner"]
batch_size = 1000

[components]

[components.ner]
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"

[components.ner.model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
width = 96
depth = 2
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true

[training]
dropout = 0.5

[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 1000

[initialize]
vectors = null
"""

# Save config to file (created inside the model directory, which is made on demand).
config_dir = "modelv3"
os.makedirs(config_dir, exist_ok=True)
config_path = os.path.join(config_dir, "config.cfg")
# Explicit encoding so the written file does not depend on the platform default.
with open(config_path, "w", encoding="utf-8") as f:
    f.write(config)

print(f"Configuration saved to {config_path}")

# Step 2: Start from a blank pipeline for the "xx" language code
nlp = spacy.blank("xx")

# Reuse the NER component when the pipeline already has one, otherwise create it
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner")

# Read the annotated dataset (JSON) that holds the labels and training samples
with open("annotated.json", "r", encoding="utf-8") as annotated_file:
    data = json.load(annotated_file)

# Register every entity class listed in the dataset with the NER component
for entity_label in data['classes']:
    ner.add_label(entity_label)

# Prepare training data (annotations) from the dataset, dropping empty/None entries
annotations = [item for item in data['annotations'] if item]

# Split the data into training and testing sets (80% training, 20% testing).
# The fixed random_state keeps the split reproducible between runs.
# NOTE: test_annotations is held out here but is not used by the code in this script.
train_annotations, test_annotations = train_test_split(annotations, test_size=0.2, random_state=42)

# Convert the training data to spaCy's Example format.
# Each item is expected to be a two-element list: [text, annotation dict]
# (presumably {"entities": [...]} -- verify against annotated.json).
train_examples = []
for item in train_annotations:
    # Anything that is not a two-element list is reported and skipped.
    if not (isinstance(item, list) and len(item) == 2):
        print(f"Skipping invalid item: {item}")
        continue
    # Use a distinct name for the per-item annotations so the module-level
    # `annotations` list above is not overwritten by a single item's dict.
    text, entity_annotations = item
    doc = nlp.make_doc(text)
    train_examples.append(Example.from_dict(doc, entity_annotations))

# Train the model.
# nlp.initialize() is the spaCy v3 name for the deprecated nlp.begin_training();
# it initializes the pipeline components and returns the optimizer.
optimizer = nlp.initialize()

# Dedicated, seeded RNG so the shuffling order is reproducible, using the same
# seed value as the train/test split's random_state.
shuffle_rng = random.Random(42)

# Number of training iterations (epochs)
n_iter = 30
for epoch in range(n_iter):
    # Losses are accumulated by nlp.update() over the whole epoch.
    losses = {}
    # Shuffle the examples before each iteration so the model does not see
    # them in the same order every epoch.
    shuffle_rng.shuffle(train_examples)
    for example in train_examples:
        # One example per update step (no minibatching).
        nlp.update([example], sgd=optimizer, losses=losses)
    print(f"Epoch {epoch+1}/{n_iter} Losses: {losses}")

# Save the trained pipeline to disk.
# NOTE(review): output_dir is config_dir, i.e. the same folder where config.cfg
# was written in Step 1. spaCy's to_disk also serializes a config.cfg, so that
# hand-written file is presumably replaced here -- confirm this is intended.
output_dir = config_dir
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")

# Reload the pipeline from disk so the check below runs against the saved model
nlp = spacy.load(output_dir)

# Finally, the model can also be trained from the command line with `spacy train`, using the config file and training data in spaCy's binary .spacy format, as shown below:
#! python -m spacy train config.cfg --output ./ --paths.train ./chichewa_training_data.spacy

# Smoke test: run the reloaded pipeline on one sample sentence
doc = nlp("Matenda a shuga ndi matenda omwe amadziwika ndi kuchuluka kwa shuga m'magazi, kapena kuchuluka kwa shuga m'magazi.")

# Report the outcome: a fallback message when nothing was tagged,
# otherwise one "text label" line per recognized entity
if not doc.ents:
    print("No entities recognized.")
else:
    for entity in doc.ents:
        print(entity.text, entity.label_)