In [2]:
import pandas as pd
import json
import os
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin

# For training data

### Updating the json annotaion file

Making changes to the JSON annotation file as per spacy requirements

In [31]:
with open(r'D:\MACHINE LEARNING\Meeting Analysis\Data\annotations.json', 'r') as f:
    data = json.load(f)

In [32]:
data

{'classes': ['TEXT', 'ASSIGNEE'],
 'annotations': [['I have an important task that needs to be assigned to someone. It involves analyzing the market trends and preparing a report.\r',
   {'entities': [[75, 125, 'TEXT']]}],
  ['We need someone to take charge of the project management tasks for the upcoming product launch. Any volunteers?\r',
   {'entities': [[38, 56, 'TEXT']]}],
  ['I would like to assign the task of conducting user research and gathering feedback to John. Are you available, John?\r',
   {'entities': [[46, 82, 'TEXT']]}],
  ["As discussed earlier, we need to assign the coding tasks for the new feature implementation. Let's distribute them among the development team.\r",
   {'entities': [[44, 92, 'TEXT']]}],
  ['We have an urgent task of organizing the client meeting next week. Can someone from the sales team take care of it?\r',
   {'entities': [[26, 55, 'TEXT']]}],
  ["We have a task to prepare a detailed project plan. Let's assign this task to someone from the project

In [27]:
train_data = data['annotations']
train_data = [tuple(i) for i in train_data]

In [28]:
train_data

[('I have an important task that needs to be assigned to someone. It involves analyzing the market trends and preparing a report.\r',
  {'entities': [[75, 125, 'TEXT']]}),
 ('We need someone to take charge of the project management tasks for the upcoming product launch. Any volunteers?\r',
  {'entities': [[38, 56, 'TEXT']]}),
 ('I would like to assign the task of conducting user research and gathering feedback to John. Are you available, John?\r',
  {'entities': [[46, 82, 'TEXT']]}),
 ("As discussed earlier, we need to assign the coding tasks for the new feature implementation. Let's distribute them among the development team.\r",
  {'entities': [[44, 92, 'TEXT']]}),
 ('We have an urgent task of organizing the client meeting next week. Can someone from the sales team take care of it?\r',
  {'entities': [[26, 55, 'TEXT']]}),
 ("We have a task to prepare a detailed project plan. Let's assign this task to someone from the project management team.\r",
  {'entities': [[18, 50, 'TEXT']]}),
 

In [29]:
entity_name = ['TEXT', 'ASSIGNEE']
for i in train_data:
    if i[1]['entities'] == []:
        i[1]['entities'] = (0, 0, entity_name)
    else:
        i[1]['entities'][0] = tuple(i[1]['entities'][0])

### Building .spacy file

In [4]:
os.chdir(r'D:\MACHINE LEARNING\Meeting Analysis\model')


In [38]:
nlp = spacy.load("en_core_web_lg")
db = DocBin() 
for text, annot in tqdm(train_data):
    doc = nlp.make_doc(text)
    ents = []
    try:
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
    except TypeError as e:
        continue           
    doc.ents = ents 
    db.add(doc)

db.to_disk("./train.spacy")

100%|██████████████████████████████████████████████████████████████████████████████| 227/227 [00:00<00:00, 2002.22it/s]


# For validation data

### Updating the json annotaion file

Making changes to the JSON annotation file as per spacy requirements

In [39]:
with open(r'D:\MACHINE LEARNING\Meeting Analysis\Data\val_annotations.json', 'r') as f:
    data = json.load(f)

In [40]:
val_data = data['annotations']
val_data = [tuple(i) for i in train_data]

In [41]:
entity_name = ['TEXT', 'ASSIGNEE']
for i in val_data:
    if i[1]['entities'] == []:
        i[1]['entities'] = (0, 0, entity_name)
    else:
        try:
            i[1]['entities'][0] = tuple(i[1]['entities'][0])
        except TypeError as e:
            continue

### Building .spacy file

In [42]:
nlp = spacy.load("en_core_web_lg") # load other spacy model

db = DocBin() # create a DocBin object

for text, annot in tqdm(val_data): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    try:
        for start, end, label in annot["entities"]: # add character indexes
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
    except TypeError as e:
        continue           
    doc.ents = ents # label the text with the ents
    db.add(doc)

db.to_disk("./validation.spacy") # save the docbin object

100%|██████████████████████████████████████████████████████████████████████████████| 227/227 [00:00<00:00, 2550.31it/s]


In [43]:
!python -m spacy init fill-config base_config.cfg config.cfg


[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [44]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./validation.spacy 

[+] Created output directory: output

[2023-06-19 19:58:15,548] [INFO] Set up nlp object from config
[2023-06-19 19:58:15,573] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-06-19 19:58:15,584] [INFO] Created vocabulary
[2023-06-19 19:58:21,819] [INFO] Added vectors: en_core_web_lg
[2023-06-19 19:58:21,860] [INFO] Finished initializing nlp object
[2023-06-19 19:58:23,064] [INFO] Initialized pipeline components: ['tok2vec', 'ner']



[i] Saving to output directory: output
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     45.90    8.44    5.87   15.00    0.08
  5     200         20.51   1658.78   90.41   89.37   91.47    0.90
 11     400         22.32    468.73   96.66   95.42   97.94    0.97
 20     600        140.89    455.29   96.63   96.21   97.06    0.97
 30     800        274.83    286.39   98.39   97.96   98.82    0.98
 42    1000         60.83    169.34   99.12   99.12   99.12    0.99
 57    1200        278.99    129.79   99.56   99.71   99.41    1.00
 76    1400         25.94    100.63   99.56   99.71   99.41    1.00
 98    1600         18.92     92.89   99.56   99.41   99.71    1.00
125    1800         24.90    117.32   99.56   99.71   99.41    1.00
158    2000        149.92    178

In [11]:
nlp1 = spacy.load(r".\output\model-best") #load the best model
entity={}
doc = nlp1("John, please coordinate with the IT department to implement the necessary software updates and ensure system security.")
for ent in doc.ents:
    entity[ent.label_] = ent.text
    print(entity)

{'ASSIGNEE': 'John'}
{'ASSIGNEE': 'John', 'TEXT': 'coordinate with the IT department to implement the necessary software updates and ensure system security'}
