In [None]:
!pip install spacy -q

In [11]:
from datasets import load_dataset, DatasetDict

# Stream the training split and materialise only the first 1000 records
# as a plain list of dicts.
dataset = load_dataset(
    "ai4privacy/pii-masking-43k",
    split='train',
    streaming=True,
)
dataset_list = [record for record in dataset.take(1000)]


In [12]:
# Display the first sampled record to inspect which fields it carries.
dataset_list[0]

{'Template': 'In our video conference, discuss the role of evidence in the arbitration process involving [FULLNAME_1] and [FULLNAME_2].',
 'Filled Template': 'In our video conference, discuss the role of evidence in the arbitration process involving Dr. Marvin Rolfson and Julius Daugherty.',
 'Tokenised Filled Template': "['in', 'our', 'video', 'conference', ',', 'discuss', 'the', 'role', 'of', 'evidence', 'in', 'the', 'arbitration', 'process', 'involving', 'dr', '.', 'marvin', 'rolf', '##son', 'and', 'julius', 'da', '##ugh', '##erty', '.']",
 'Tokens': "['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-FULLNAME', 'I-FULLNAME', 'I-FULLNAME', 'I-FULLNAME', 'I-FULLNAME', 'O', 'B-FULLNAME', 'I-FULLNAME', 'I-FULLNAME', 'I-FULLNAME', 'O']"}

In [None]:
import spacy
from datasets import load_dataset
from spacy.tokens import DocBin
from tqdm import tqdm

# Blank English pipeline and an empty DocBin container for the converted docs.
nlp = spacy.blank("en")
doc_bin = DocBin()

# Full (non-streaming) training split.
dataset = load_dataset("ai4privacy/pii-masking-43k", split='train')

# Integer tag id -> tag name lookup.
# NOTE(review): the sample record printed earlier in this notebook exposes the
# columns 'Template', 'Filled Template', 'Tokenised Filled Template' and
# 'Tokens' -- no 'ner_tags' feature -- so this lookup may raise KeyError.
# Confirm the dataset schema before running this cell.
ner_feature = dataset.features['ner_tags']
label_names = ner_feature.feature.names
def create_spacy_doc(example):
    """Build a spaCy ``Doc`` from one pre-tokenised example and attach its entities.

    Parameters
    ----------
    example : mapping
        Must provide ``example['tokens']`` (a sequence of word strings) and
        ``example['ner_tags']`` (one integer tag id per word). Tag id 0 is
        treated as "outside any entity"; other ids are resolved through the
        module-level ``label_names``.

    Returns
    -------
    spacy.tokens.Doc
        A doc whose i-th token is exactly ``example['tokens'][i]`` and whose
        ``ents`` hold one span per contiguous run of tagged tokens.

    Notes
    -----
    * The doc is built directly from the given words instead of re-tokenising
      ``" ".join(words)``: spaCy's tokenizer may split a word further (for
      example around punctuation), which would shift every later token index
      away from its tag.
    * Tag names of the form ``B-XXX`` / ``I-XXX`` are merged into a single
      ``XXX`` span; a ``B-`` tag always starts a new span. Names without such
      a prefix are merged while the same name repeats on adjacent tokens.
    """
    from spacy.tokens import Doc, Span

    words = example['tokens']
    labels = example['ner_tags']

    # Every word is followed by a space except the last one, so doc.text
    # equals " ".join(words).
    spaces = ([True] * (len(words) - 1) + [False]) if len(words) else []
    doc = Doc(nlp.vocab, words=list(words), spaces=spaces)

    # Never index past the doc if the two sequences disagree in length.
    n_tokens = min(len(labels), len(doc))

    ents = []
    run_start = None  # token index where the currently open entity begins
    run_type = None   # entity type of the currently open entity
    for index in range(n_tokens):
        tag_id = labels[index]
        if tag_id == 0:
            entity_type, begins = None, False
        else:
            tag_name = label_names[tag_id]
            prefix, sep, rest = tag_name.partition('-')
            if sep and prefix in ('B', 'I'):
                entity_type, begins = rest, prefix == 'B'
            else:
                entity_type, begins = tag_name, False

        # Close the open entity when the type changes, a new one begins,
        # or we step outside of any entity.
        if run_type is not None and (begins or entity_type != run_type):
            ents.append(Span(doc, run_start, index, label=run_type))
            run_type = None

        if entity_type is not None and run_type is None:
            run_start, run_type = index, entity_type

    if run_type is not None:
        ents.append(Span(doc, run_start, n_tokens, label=run_type))

    doc.ents = ents
    return doc

# Convert every example into a Doc and collect the results in the DocBin.
for example in tqdm(dataset):
    doc = create_spacy_doc(example)
    doc_bin.add(doc)

# Serialise the collected docs to disk.
doc_bin.to_disk("train.spacy")


In [7]:
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm

# Blank English pipeline; below it is only used to tokenise text via make_doc.
nlp = spacy.blank("en")

def _parse_list_column(value):
    """Return a list column as a real list (it is stored as a Python-repr string)."""
    import ast

    if isinstance(value, str):
        return ast.literal_eval(value)
    return list(value)


def _bio_to_char_spans(text, tokens, labels):
    """Align word-piece tokens to ``text`` and merge BIO tags into entity spans.

    Returns a list of ``(start_char, end_char, entity_type)`` tuples, one per
    entity, with the ``B-`` / ``I-`` prefix removed from the type.
    """
    # Tokens are lower-cased, so search a lower-cased copy of the text.
    haystack = text.lower()
    if len(haystack) != len(text):
        # Lower-casing changed the length (rare Unicode case) and would shift
        # offsets; fall back to case-sensitive matching.
        haystack = text

    spans = []
    open_span = None  # [start_char, end_char, entity_type] being built
    cursor = 0

    def close_open_span():
        nonlocal open_span
        if open_span is not None:
            spans.append(tuple(open_span))
            open_span = None

    for token, label in zip(tokens, labels):
        # '##' marks a word-piece continuation and is not part of the text.
        piece = token[2:] if token.startswith('##') else token
        found = haystack.find(piece, cursor) if piece else -1
        if found != -1:
            token_end = found + len(piece)
            cursor = token_end

        if label == 'O':
            close_open_span()
            continue
        if found == -1:
            # Token cannot be located in the text (e.g. it was normalised by
            # the tokenizer); skip it instead of corrupting the offsets.
            continue

        prefix, sep, entity_type = label.partition('-')
        if not sep:
            # Tag without a B-/I- prefix: use it as the entity type as-is.
            prefix, entity_type = 'I', label

        if prefix == 'I' and open_span is not None and open_span[2] == entity_type:
            open_span[1] = token_end
        else:
            close_open_span()
            open_span = [found, token_end, entity_type]

    close_open_span()
    return spans


def create_spacy_docs(dataset_list):
    """Convert PII-masking records into spaCy ``Doc`` objects with entities.

    Parameters
    ----------
    dataset_list : iterable of mapping
        Each record provides 'Filled Template' (the raw text),
        'Tokenised Filled Template' (lower-cased word-piece tokens) and
        'Tokens' (one BIO tag per word piece). The two list columns may be
        actual lists or their string representation.

    Returns
    -------
    list of spacy.tokens.Doc
        One doc per record. Entity spans that do not line up with spaCy token
        boundaries are dropped, and overlapping spans are resolved with
        ``filter_spans``.
    """
    docs = []
    for record in tqdm(dataset_list):
        text = record['Filled Template']
        # 'Tokenised Filled Template' holds the tokens, 'Tokens' holds the tags.
        tokens = _parse_list_column(record['Tokenised Filled Template'])
        labels = _parse_list_column(record['Tokens'])

        doc = nlp.make_doc(text)
        candidate_spans = [
            doc.char_span(start, end, label=entity_type)
            for start, end, entity_type in _bio_to_char_spans(text, tokens, labels)
        ]
        # char_span returns None when offsets fall inside a spaCy token.
        doc.ents = filter_spans([span for span in candidate_spans if span is not None])
        docs.append(doc)
    return docs

# Convert the sampled records and serialise the resulting docs to ./train.spacy.
docs = create_spacy_docs(dataset_list)
doc_bin = DocBin(docs=docs)
doc_bin.to_disk("./train.spacy")


100%|██████████| 1000/1000 [00:00<00:00, 8248.58it/s]


In [None]:
import random

# `dataset` is loaded with split='train', i.e. it is a single split rather
# than a mapping of splits, so dataset['train'] / dataset['validation'] cannot
# be used. Hold out a reproducible share of the sampled records for evaluation.
DEV_FRACTION = 0.1
SPLIT_SEED = 0

shuffled_records = list(dataset_list)
random.Random(SPLIT_SEED).shuffle(shuffled_records)
n_dev = max(1, int(len(shuffled_records) * DEV_FRACTION))

train_docs = create_spacy_docs(shuffled_records[n_dev:])
dev_docs = create_spacy_docs(shuffled_records[:n_dev])

# Save to disk
DocBin(docs=train_docs).to_disk("./train.spacy")
DocBin(docs=dev_docs).to_disk("./dev.spacy")

In [None]:
!python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy --output ./output


---

In [2]:
import pandas as pd
# Load the CSV export of the dataset and preview the first rows.
# A bare last expression uses the notebook's rich DataFrame display, which
# avoids the wrapped plain-text columns produced by print().
df = pd.read_csv('PII43k.csv')
df.head()


                                            Template  \
0  In our video conference, discuss the role of e...   
1  Could you draft a letter for [NAME_1] to send ...   
2  Discuss the options for [FULLNAME_1] who wants...   
3  13. Write a press release announcing [FULLNAME...   
4  9. Develop an inventory management plan for [F...   

                                     Filled Template  \
0  In our video conference, discuss the role of e...   
1  Could you draft a letter for Dietrich, Schulis...   
2  Discuss the options for Jeffery Pfeffer who wa...   
3  13. Write a press release announcing Gayle Wat...   
4  9. Develop an inventory management plan for Ev...   

                           Tokenised Filled Template  \
0  ['in', 'our', 'video', 'conference', ',', 'dis...   
1  ['could', 'you', 'draft', 'a', 'letter', 'for'...   
2  ['discuss', 'the', 'options', 'for', 'jeff', '...   
3  ['13', '.', 'write', 'a', 'press', 'release', ...   
4  ['9', '.', 'develop', 'an', 'inventory', 'm

In [3]:
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm
import ast

# Initialize a blank English model
nlp = spacy.blank("en")

# Create a DocBin object
doc_bin = DocBin()


def bio_tokens_to_char_spans(text, tokens, labels):
    """Locate word-piece tokens in ``text`` and merge BIO tags into entities.

    The tokens are lower-cased word pieces (continuations carry a '##'
    prefix), so they are searched case-insensitively and without the prefix.
    Consecutive ``B-XXX`` / ``I-XXX`` tags are merged into one span labelled
    ``XXX``.

    Returns a list of ``(start_char, end_char, entity_type)`` tuples.
    """
    haystack = text.lower()
    if len(haystack) != len(text):
        # Lower-casing changed the length (rare Unicode case) and would shift
        # offsets; fall back to case-sensitive matching.
        haystack = text

    spans = []
    current = None  # [start_char, end_char, entity_type] of the open entity
    position = 0
    for token, label in zip(tokens, labels):
        piece = token[2:] if token.startswith('##') else token
        token_start = haystack.find(piece, position) if piece else -1
        if token_start != -1:
            token_end = token_start + len(piece)
            position = token_end

        if label == 'O':  # 'O' means no entity
            if current is not None:
                spans.append(tuple(current))
                current = None
            continue
        if token_start == -1:
            # Token not found in the text: skip it rather than let a -1
            # offset corrupt the following positions.
            continue

        prefix, sep, entity_type = label.partition('-')
        if not sep:
            # Tag without a B-/I- prefix: use it as the entity type as-is.
            prefix, entity_type = 'I', label

        if prefix == 'I' and current is not None and current[2] == entity_type:
            current[1] = token_end
        else:
            if current is not None:
                spans.append(tuple(current))
            current = [token_start, token_end, entity_type]

    if current is not None:
        spans.append(tuple(current))
    return spans


# Iterate over the dataframe
for _, row in tqdm(df.iterrows(), total=len(df)):
    text = row['Filled Template']
    tokens = ast.literal_eval(row['Tokenised Filled Template'])
    labels = ast.literal_eval(row['Tokens'])

    # Create a Doc object
    doc = nlp.make_doc(text)

    ents = []
    for start_char, end_char, entity_type in bio_tokens_to_char_spans(text, tokens, labels):
        # char_span returns None when the offsets do not match spaCy's tokens.
        span = doc.char_span(start_char, end_char, label=entity_type)
        if span is not None:
            ents.append(span)

    # Filter out overlapping spans
    filtered_ents = filter_spans(ents)

    # Set entities for the doc
    doc.ents = filtered_ents

    # Add the doc to the DocBin
    doc_bin.add(doc)

# Save the DocBin to a file
doc_bin.to_disk("train.spacy")


100%|██████████| 42760/42760 [00:10<00:00, 4110.76it/s]


In [12]:
!python3 -m spacy init fill-config config.cfg  

[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 0

[nlp]
lang = "en"
pipeline = ["tok2vec","ner"]
batch_size = 1000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
vectors = {"@vectors":"spacy.Vectors.v1"}

[components]

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = 

In [10]:
!python3 -m spacy init fill-config base_config.cfg base_config.cfg 

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
base_config.cfg
You can now add your data and train your pipeline:
python -m spacy train base_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
