### Train a custom NER

#### 0.imports

In [45]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
from spacy.tokens import DocBin
import json
from tqdm import tqdm
import random

In [2]:
nlp = spacy.load("en_core_web_sm")

#### 1. load data

In [5]:
df_cf=pd.read_csv('care_labels_parsed.csv', sep=';', index_col=0)

In [7]:
df_cf.head(2)

Unnamed: 0,product_id,product_category,care_label,main_prod_cat,log_parse_cat,raw_data,parsed,clean_data,len_raw_clean_text,test_length,rebuild_text,len_rebuild,test_test_pc,test_missing_weight
0,#113,PANTS,"Main: 40% Cotton, 60% Polyester, 290 g/m².\nCo...",PANTS,True,"Main: 40% Cotton, 60% Polyester, 290 g/m².\nCo...",{'default': {'Main_0': {'Materials': [{'materi...,"main 40% cotton, 60% polyester, 290 g/m2.contr...",137,True,"main_0 40% cotton, 60% polyester, 290 g/m2. co...",141,0,0
1,#212,PANTS,"Main: DuraTwill, 52% Cotton 48% Polyamide, 240...",PANTS,True,"Main: DuraTwill, 52% Cotton 48% Polyamide, 240...",{'default': {'Main_0': {'Materials': [{'materi...,"main duratwill, 52% cotton 48% polyamide, 240 ...",88,True,"main_0 52% cotton, 48% polyamide, 240 g/m2. re...",83,0,1


#### 2. try generic model  
works ok for % and density

In [11]:
text=df_cf.care_label.iloc[100]
doc = nlp(text)
displacy.render(doc, style="ent", jupyter=True)

#### 3. train custom model

In [13]:
with open('annotations.json', 'r') as f:
    data = json.load(f)


In [20]:
data['annotations'][1]

['62% Polyamide, 16% Polyester, 14% Polyurethane, 8% Rubber.\r',
 {'entities': [[0, 3, 'PERCENT'],
   [4, 13, 'MATERIAL'],
   [15, 18, 'PERCENT'],
   [19, 28, 'MATERIAL'],
   [30, 33, 'PERCENT'],
   [34, 46, 'MATERIAL'],
   [48, 50, 'PERCENT'],
   [51, 57, 'MATERIAL']]}]

In [29]:
# Step 2: Load spaCy Model (can be a blank model or a pre-trained one)
nlp = spacy.blank("en")  # Load a blank English model

# Step 3: Create a DocBin to store Doc objects
doc_bin = DocBin()

# Step 4: Process Training Data
for text, annotations in tqdm(data['annotations']):
    doc = nlp.make_doc(text)  # Create a spaCy Doc object from the raw text
    entities = annotations.get("entities")  # Extract entities from annotations
    print(entities)
    ents = []
    if entities==[]:
        pass
    else:
        # Create entity spans (start_char, end_char, label)
        for start, end, label in entities:
            span = doc.char_span(start, end, label=label)
            if span:
                ents.append(span)
        
        # Set entities on the Doc object
        doc.ents = ents
        
        # Add the processed doc to the DocBin
        doc_bin.add(doc)

# Step 5: Serialize and Save the DocBin to disk
with open("train_data.spacy", "wb") as f:
    f.write(doc_bin.to_bytes())

print("DocBin file created and saved as 'train_data.spacy'.")

100%|████████████████████████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 3056.21it/s]

[[0, 4, 'GARMENTPART'], [6, 9, 'PERCENT'], [10, 16, 'MATERIAL'], [18, 21, 'PERCENT'], [22, 28, 'MATERIAL'], [30, 32, 'PERCENT'], [33, 43, 'MATERIAL'], [45, 54, 'DENSITY'], [55, 68, 'GARMENTPART'], [70, 73, 'PERCENT'], [74, 84, 'MATERIAL'], [86, 89, 'PERCENT'], [90, 98, 'MATERIAL'], [100, 103, 'PERCENT'], [104, 110, 'MATERIAL'], [112, 115, 'PERCENT'], [116, 122, 'MATERIAL'], [124, 126, 'PERCENT'], [127, 137, 'MATERIAL'], [139, 147, 'DENSITY']]
[[0, 3, 'PERCENT'], [4, 13, 'MATERIAL'], [15, 18, 'PERCENT'], [19, 28, 'MATERIAL'], [30, 33, 'PERCENT'], [34, 46, 'MATERIAL'], [48, 50, 'PERCENT'], [51, 57, 'MATERIAL']]
[[0, 4, 'GARMENTPART'], [6, 10, 'PERCENT'], [11, 20, 'MATERIAL'], [22, 30, 'DENSITY'], [31, 39, 'GARMENTPART'], [41, 44, 'PERCENT'], [59, 68, 'MATERIAL'], [70, 73, 'PERCENT'], [74, 81, 'MATERIAL'], [83, 85, 'PERCENT'], [86, 94, 'MATERIAL'], [96, 105, 'DENSITY'], [106, 116, 'GARMENTPART'], [118, 121, 'PERCENT'], [122, 149, 'MATERIAL'], [151, 155, 'PERCENT'], [156, 165, 'MATERIAL'],




In [36]:
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency --force

[38;5;3m[!] To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4m[i] Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m[+] Auto-filled config with all values[0m
[38;5;2m[+] Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [38]:
!python -m spacy train config.cfg --output ./output --paths.train ./train_data.spacy --paths.dev ./train_data.spacy

[38;5;4m[i] Saving to output directory: output[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     42.77   23.56   19.40   30.00    0.24
 37     200        345.88   2404.07   98.85   98.47   99.23    0.99
 82     400         49.78    127.27   99.23   99.23   99.23    0.99
137     600         48.20    113.08   99.23   99.23   99.23    0.99
204     800         77.89     57.05  100.00  100.00  100.00    1.00
292    1000         37.81     11.17  100.00  100.00  100.00    1.00
392    1200        124.76     27.86  100.00  100.00  100.00    1.00
492    1400        122.59     21.70  100.00  100.00  100.00    1.00
668    1600         11.47      2.34  100.00  100.00  100.00    1.00
868    1800         39.55      5.03 

In [48]:
nlp_ner_custom=spacy.load('./output/model-last')

#### 4.test custom model

In [49]:
for i in range(10):
    seed=random.randint(0,df_cf.shape[0])
    text=df_cf.care_label.iloc[seed]
    doc = nlp_ner_custom(text)
    displacy.render(doc, style="ent", jupyter=True)
    print('\n')







































