In [None]:
!git clone https://github.com/PacktPublishing/Python-Natural-Language-Processing-Cookbook.git
%cd Python-Natural-Language-Processing-Cookbook

Cloning into 'Python-Natural-Language-Processing-Cookbook'...
remote: Enumerating objects: 308, done.[K
remote: Counting objects: 100% (84/84), done.[K
remote: Compressing objects: 100% (71/71), done.[K
remote: Total 308 (delta 36), reused 39 (delta 12), pack-reused 224 (from 1)[K
Receiving objects: 100% (308/308), 658.34 MiB | 17.82 MiB/s, done.
Resolving deltas: 100% (128/128), done.
Updating files: 100% (93/93), done.
/content/Python-Natural-Language-Processing-Cookbook


In [None]:
!pip install -U spacy
import spacy
from spacy.util import minibatch, compounding
from spacy.language import Language
from spacy.tokens import Doc, DocBin
from spacy.training import Example
import warnings
import random
from pathlib import Path


Collecting spacy
  Downloading spacy-3.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Downloading thinc-8.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting blis<1.2.0,>=1.1.0 (from thinc<8.4.0,>=8.3.0->spacy)
  Downloading blis-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Downloading spacy-3.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.1/29.1 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading thinc-8.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading blis-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import spacy
from spacy.util import minibatch, compounding
from spacy.language import Language
import warnings
import random
from pathlib import Path

# Training examples as (text, annotations) pairs. Each entry in 'entities' is a
# (start_char, end_char, label) triple indexing into the text.
# NOTE(review): "India" in the first sentence (chars 22-27) carries no
# annotation here, while the second sentence labels "India" as LOC — confirm
# that this is intentional.
DATA = [
    ("A fakir from far-away India travels to Asterix's village and asks Cacofonix to save his land from drought since his singing can cause rain.",
        {'entities':[(39, 46, "PERSON"), (66, 75, "PERSON")]}),
    ("Cacofonix, accompanied by Asterix and Obelix, must travel to India aboard a magic carpet to save the life of the princess Orinjade, who is to be sacrificed to stop the drought.",
        {'entities':[(0, 9, "PERSON"), (26, 33, "PERSON"), (38, 44, "PERSON"), (61, 66, "LOC"), (122, 130, "PERSON")]})
]

# Custom entity label used by MODIFIED_DATA below.
NEW_LABEL = "GAULISH_WARRIOR"

# Same two sentences as DATA, with most PERSON annotations replaced by
# NEW_LABEL; "Orinjade" stays PERSON and "India" stays LOC.
MODIFIED_DATA = [
    ("A fakir from far-away India travels to Asterix's village and asks Cacofonix to save his land from drought since his singing can cause rain.",
        {'entities':[(39, 46, NEW_LABEL), (66, 75, NEW_LABEL)]}),
    ("Cacofonix, accompanied by Asterix and Obelix, must travel to India aboard a magic carpet to save the life of the princess Orinjade, who is to be sacrificed to stop the drought.",
        {'entities':[(0, 9, NEW_LABEL), (26, 33, NEW_LABEL), (38, 44, NEW_LABEL), (61, 66, "LOC"), (122, 130, "PERSON")]})
]

# Number of passes over the training data made by the train_* functions.
N_ITER=100
# Directory passed to save_model / load_and_test in the commented-out calls.
OUTPUT_DIR = "/content/model_output"

def load_model(input_dir):
    """Load a spaCy pipeline from ``input_dir`` and return it.

    ``input_dir`` is passed unchanged to ``spacy.load``.
    """
    return spacy.load(input_dir)

def save_model(nlp, output_dir):
    """Serialize the pipeline ``nlp`` into the directory ``output_dir``.

    Args:
        nlp: Object exposing ``to_disk(path)`` (a spaCy pipeline).
        output_dir: Target directory as a string or ``Path``.

    The directory is created when missing, including any missing parent
    directories; an already existing directory is reused.
    """
    output_dir = Path(output_dir)
    # parents=True lets nested targets such as "models/run1/ner" work on a
    # fresh checkout; exist_ok=True avoids a check-then-create race.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)

def create_model(model):
    """Return the spaCy pipeline to start training from.

    When ``model`` is None a blank English pipeline is created with
    ``spacy.blank("en")``; otherwise ``model`` is loaded with ``spacy.load``.
    """
    if model is None:
        return spacy.blank("en")
    return spacy.load(model)

# Ensure the pipeline has an "ner" component and hand it back.
def add_ner_to_model(nlp):
    """Return ``(nlp, ner)`` where ``ner`` is the pipeline's "ner" component.

    The component is appended at the end of the pipeline when "ner" is not
    yet listed in ``nlp.pipe_names``; an existing one is reused.
    """
    has_ner = "ner" in nlp.pipe_names
    if not has_ner:
        nlp.add_pipe("ner", last=True)
    # Whether freshly added or pre-existing, look the component up by name.
    return (nlp, nlp.get_pipe("ner"))

# Register every entity label found in the training data with the NER component.
def add_labels(ner, data):
    """Call ``ner.add_label`` for each entity label in ``data`` and return ``ner``.

    Args:
        ner: Object exposing ``add_label(label)``.
        data: Iterable of ``(text, annotations)`` pairs; ``annotations`` is a
            dict whose optional "entities" value is a sequence of entries
            with the label at index 2.

    Examples whose annotations have no "entities" key (or a None value) are
    skipped instead of raising ``TypeError``.
    """
    for _text, annotations in data:
        # .get() yields None for a missing key; fall back to an empty tuple
        # so examples without entity annotations are simply ignored.
        for ent in annotations.get("entities") or ():
            ner.add_label(ent[2])
    return ner


def train_model(model=None):
    """Train the NER component on ``DATA`` and return the pipeline.

    Args:
        model: Name or path of a pipeline to load via ``create_model``, or
            None to start from a blank English pipeline.

    Returns:
        The pipeline after ``N_ITER`` passes over the training data.

    The per-iteration losses dict is printed after every pass.
    """
    nlp = create_model(model)
    (nlp, ner) = add_ner_to_model(nlp)
    add_labels(ner, DATA)
    # Only the components named here stay enabled during training; every
    # other pipe is disabled so that it is not updated.
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # Shuffle a private copy: DATA is a module-level list that is also the
    # default argument of other helpers, so it must not be reordered in place.
    train_data = list(DATA)
    # select_pipes(disable=...) is the spaCy v3 name of disable_pipes(...).
    with nlp.select_pipes(disable=other_pipes), warnings.catch_warnings():
        warnings.filterwarnings("once", category=UserWarning, module='spacy')
        if model is None:
            # A blank pipeline needs its weights initialized before updating
            # (initialize() is the spaCy v3 name of begin_training()).
            nlp.initialize()
        for _ in range(N_ITER):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp.update(
                    examples,
                    drop=0.5,
                    losses=losses,
                )
            print("Losses", losses)
    return nlp

def train_model_new_entity_type(model=None):
    """Train the NER component on ``MODIFIED_DATA`` and return the pipeline.

    ``MODIFIED_DATA`` introduces the custom ``NEW_LABEL`` entity type.

    Args:
        model: Name or path of a pipeline to load via ``create_model``, or
            None to start from a blank English pipeline.

    Returns:
        The pipeline after ``N_ITER`` passes over the training data.

    The global ``random`` generator is seeded with 0 at the start, and the
    per-iteration losses dict is printed after every pass.
    """
    random.seed(0)
    nlp = create_model(model)
    (nlp, ner) = add_ner_to_model(nlp)
    add_labels(ner, MODIFIED_DATA)
    if model is None:
        # Blank pipeline: initialize weights (spaCy v3 name of begin_training()).
        optimizer = nlp.initialize()
    else:
        # Loaded pipeline: continue from its existing weights.
        optimizer = nlp.resume_training()
    # Only the components named here stay enabled during training.
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # Shuffle a private copy so the module-level MODIFIED_DATA keeps its order.
    train_data = list(MODIFIED_DATA)
    # select_pipes(disable=...) is the spaCy v3 name of disable_pipes(...).
    with nlp.select_pipes(disable=other_pipes), warnings.catch_warnings():
        warnings.filterwarnings("once", category=UserWarning, module='spacy')
        # Created once, outside the loop, so the same batch-size schedule
        # object is shared by every iteration.
        sizes = compounding(1.0, 4.0, 1.001)
        for _ in range(N_ITER):
            random.shuffle(train_data)
            batches = minibatch(train_data, size=sizes)
            losses = {}
            for batch in batches:
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)
    return nlp

def test_model(nlp, data):
    for text, annotations in data:
        doc = nlp(text)
        for ent in doc.ents:
            print(ent.text, ent.start_char, ent.end_char, ent.label_)

def without_training(data=DATA):
    """Print the entities the stock ``en_core_web_sm`` pipeline finds in ``data``."""
    baseline_nlp = spacy.load("en_core_web_sm")
    test_model(baseline_nlp, data)

def main():
    """Print baseline entities, fine-tune ``en_core_web_sm`` on DATA, print again."""
    # Baseline: entities from the untouched pretrained pipeline.
    without_training()
    base_model = "en_core_web_sm"
    trained_nlp = train_model(base_model)
    # Alternatives to the call above:
    #   trained_nlp = train_model()
    #   trained_nlp = train_model_new_entity_type(base_model)
    test_model(trained_nlp, DATA)
    # To persist the result: save_model(trained_nlp, OUTPUT_DIR)

def load_and_test(model_dir, data=DATA):
    """Load the pipeline stored in ``model_dir`` and print its entities for ``data``."""
    test_model(load_model(model_dir), data)

# Entry point: run the demo when this code is executed directly.
if __name__ == "__main__":
    main()
    # To evaluate a previously saved model instead: load_and_test(OUTPUT_DIR)

India 22 27 GPE
Asterix 39 46 GPE
Cacofonix 66 75 PERSON
Asterix 26 33 GPE
Obelix 38 44 GPE
India 61 66 GPE
Orinjade 122 130 PERSON
Losses {'ner': 10.816574644377251}
Losses {'ner': 14.02861751815492}
Losses {'ner': 12.235323267819375}
Losses {'ner': 7.8435221830413315}
Losses {'ner': 8.319076934369617}
Losses {'ner': 8.111958773576152}
Losses {'ner': 6.337524429130268}
Losses {'ner': 5.3972868012490025}
Losses {'ner': 4.819037349346705}
Losses {'ner': 2.8742034144165043}
Losses {'ner': 0.47116196821473816}
Losses {'ner': 0.6707830067132383}
Losses {'ner': 2.0519958023320037}
Losses {'ner': 1.133775472200166}
Losses {'ner': 0.07059065569825973}
Losses {'ner': 0.527322972920661}
Losses {'ner': 1.289750860491968}
Losses {'ner': 1.2620443491183813}
Losses {'ner': 0.49220948269126846}
Losses {'ner': 0.10186482700272176}
Losses {'ner': 0.18773815908408606}
Losses {'ner': 1.5721654283005198}
Losses {'ner': 0.0014163737110346947}
Losses {'ner': 0.0031895746917702085}
Losses {'ner': 0.69567859

# Training your own spaCy model (training_own_spacy_model) with the music NER data

In [None]:
!git clone https://github.com/PacktPublishing/Python-Natural-Language-Processing-Cookbook-Second-Edition.git

Cloning into 'Python-Natural-Language-Processing-Cookbook-Second-Edition'...
remote: Enumerating objects: 433, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 433 (delta 11), reused 6 (delta 2), pack-reused 409 (from 1)[K
Receiving objects: 100% (433/433), 18.28 MiB | 17.30 MiB/s, done.
Resolving deltas: 100% (235/235), done.


In [None]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
%run -i "/content/Python-Natural-Language-Processing-Cookbook-Second-Edition/util/lang_utils.ipynb"

In [None]:
import pandas as pd
from spacy.cli.train import train
from spacy.cli.evaluate import evaluate
from spacy.tokens import DocBin
from sklearn.model_selection import train_test_split

In [None]:
! git clone https://github.com/deezer/music-ner-eacl2023

Cloning into 'music-ner-eacl2023'...
remote: Enumerating objects: 107, done.[K
remote: Counting objects: 100% (107/107), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 107 (delta 37), reused 103 (delta 36), pack-reused 0 (from 0)[K
Receiving objects: 100% (107/107), 586.31 KiB | 8.26 MiB/s, done.
Resolving deltas: 100% (37/37), done.


In [None]:
music_ner_df = pd.read_csv('/content/Python-Natural-Language-Processing-Cookbook-Second-Edition/data/music_ner.csv')

In [None]:
music_ner_df.head()

Unnamed: 0,id,text,start_offset,end_offset,label
0,13434,i love radioheads kid a something similar | ki...,7,17,Artist_known
1,13434,i love radioheads kid a something similar | ki...,61,71,Artist_or_WoA_deduced
2,13435,anything similar to i fight dragons,20,35,WoA_deduced
3,13436,music similar to ccrs travelin band,17,30,Artist_deduced
4,13437,songs similar to blackout by boris,17,25,WoA_deduced


In [None]:
def change_label(input_label):
    """Return ``input_label`` with every "_deduced" occurrence removed."""
    return input_label.replace("_deduced", '')

# Rewrite the label column in place with "_deduced" removed from every value
# (e.g. "WoA_deduced" becomes "WoA"); labels without it, such as
# "Artist_known", are left unchanged.
music_ner_df['label'] = music_ner_df['label'].apply(change_label)

In [None]:
# Containers that will hold the annotated training and test documents.
train_db = DocBin()
test_db = DocBin()
# Unique text ids, sorted so the order does not depend on set iteration.
ids = sorted(set(music_ner_df["id"].values))
print(len(ids))
# Split ids (not rows) into training and test so that all entity rows of one
# text land in the same partition; a fixed random_state makes the split
# reproducible across kernel restarts.
train_ids, test_ids = train_test_split(ids, random_state=42)
print(len(train_ids))
print(len(test_ids))

226
169
57


In [None]:
%cd /content/Python-Natural-Language-Processing-Cookbook-Second-Edition/data

/content/Python-Natural-Language-Processing-Cookbook-Second-Edition/data


In [None]:
import spacy
small_model = spacy.load("en_core_web_sm")

In [None]:
train_db

<spacy.tokens._serialize.DocBin at 0x7c60bf409090>

...

In [None]:
# Build one annotated Doc per text id and file it under train or test.
train_id_set = set(train_ids)  # set membership instead of a list scan per id
for doc_id in ids:  # not named `id`, which would shadow the builtin
    entity_rows = music_ner_df.loc[music_ner_df['id'] == doc_id]
    # Every row of one id is expected to carry the same text; take the first.
    text = entity_rows.head(1)["text"].values[0]
    doc = small_model(text)
    ents = []
    for _, row in entity_rows.iterrows():
        span = doc.char_span(
            row["start_offset"],
            row["end_offset"],
            label=row["label"],
            alignment_mode="contract",
        )
        if span is None:
            # char_span returns None when the offsets cannot be mapped to
            # tokens; a None entry would make the doc.ents assignment fail.
            print(f"Skipping unalignable span {row['start_offset']}-{row['end_offset']} for id {doc_id}")
            continue
        ents.append(span)
    print(ents)
    doc.ents = ents
    if doc_id in train_id_set:
        train_db.add(doc)
    else:
        test_db.add(doc)
# Paths are relative to the current working directory.
train_db.to_disk('../data/music_ner_train.spacy')
test_db.to_disk('../data/music_ner_test.spacy')


[nghtmre street, anar]
[nujabes atlas]
[gsh, gaye]
[save yourself stabbing westward]
[zimmers, blade runner 2049]
[the llamas with hats, outro]
[bon iver, iron & wine]
[tally hall, miracle musical]
[system of a down]
[amon tobin & kid koala, untitled]
[code orange, dreams in inertia, code orange]
[the sounds of eden, blackbear and gnash]
[the muffs tilt, be your own pet, the soviettes tweens, dog party]
[ach so gern]
[kid rocks, greatest show on earth]
[airport bar, noah]
[sweet]
[smino sudan, archives, fjk, jessie reyez, tash sultana]
[an awesome wave, alt j]
[i love you like a alcoholic, tax payers]
[this song is not about a girl, flume & chet faker]
[little known game]
[guardians of the galaxy]
[the black keys, i got mine]
[solitude standing, suzanne vega]
[metro booming, no complaints, sneakin, drake]
[atlantis, bridgit mendler]
[a star is born]
[internetboi, bones, deadindesignerclothes]
[cindy, tammany hall nyc]
[the likes, rage against the machine, bring me the horizon, dz death

In [None]:
# Run spaCy training from the config file; the output is written under
# ../models/spacy_music_ner. Both paths are relative to the current working
# directory.
train("../data/spacy_config_ner.cfg", output_path="../models/spacy_music_ner")

[38;5;2m✔ Created output directory: ../models/spacy_music_ner[0m
[38;5;4mℹ Saving to output directory: ../models/spacy_music_ner[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'tagger', 'parser', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TAGGER  LOSS PARSER  LOSS NER  TAG_ACC  DEP_UAS  DEP_LAS  SENTS_F  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  -----------  -----------  --------  -------  -------  -------  -------  ------  ------  ------  ------
  0       0          0.00        83.55       269.21     65.14    38.59    22.87    10.34     3.55    0.00    0.00    0.00    0.18
  2     200        747.11      2971.34     10556.95   4218.85    78.08    70.05    63.70    62.77   22.10   24.69   20.00    0.56
  5     400        822.31       301.75      2418.65    857.20    79.53    68.24    64.07    86.67   24.08   25.27   23.00    0.57
  8     600        703.66       160.24

KeyboardInterrupt: 