# Custom NER (ML NER)

## #1. Setup development environment

### Update & import Python modules

In [1]:
# install and download spaCy related modules
!pip install --upgrade spacy
!python -m spacy download en_core_web_lg

# spaCy
import spacy
from spacy.tokens import DocBin

# Google Drive
from google.colab import drive

# Firebase/Firestore
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

# general Python modules
import json
import datetime
import requests
import csv
import random
import warnings
from collections import Counter
from pprint import pprint

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy
  Downloading spacy-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.4.4
    Uninstalling spacy-3.4.4:
      Successfully uninstalled spacy-3.4.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-sm 3.4.1 requires spacy<3.5.0,>=3.4.0, but you have spacy 3.5.0 which is incompatible.[0m[31m
[0mSuccessfully installed spacy-3.5.0
2023-01-23 20:58:41.574807: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected



### Get access to Firebase and Drive 

In [3]:
# remount drive, forced if needed
drive.mount("/content/gdrive/", force_remount = True)
print("Stablished access to Google Drive")

# initialize Drive path
DRIVE_PATH = "/content/gdrive/My Drive"

# open Firebase credentials
with open(DRIVE_PATH + "/IE/ie_course_2022_team03/credentials/firebase_credentials.json") as f:
  credential = json.load(f)
credential = credentials.Certificate(credential)

# create Firestore database instance
firebase_admin.initialize_app(credential)
db = firestore.client()
print("Stablished access to Firestore")

Mounted at /content/gdrive/
Stablished access to Google Drive
Stablished access to Firestore


# Training and Saving the Model

In [28]:
entity_slug_list = ["Berlin", "Munich", "Germany", "art_museum", "Bavaria"]

TRAIN_DATA, VALID_DATA = [], []

for entity_slug in entity_slug_list:

  with open(DRIVE_PATH + f"/IE/ie_course_2022_team03/output/{entity_slug.lower()}_ner_corpus.json") as f:
    DATA = json.load(f)
  TRAIN_DATA.extend(DATA[:int((len(DATA)*70)/100)])
  VALID_DATA.extend(DATA[int((len(DATA)*70)/100):])

print(len(TRAIN_DATA), TRAIN_DATA)
print(len(VALID_DATA), VALID_DATA)

def convert(TRAIN_DATA, output_name):
  nlp = spacy.blank("en") # create a blank NLP pipeline
  doc_bin = DocBin()
  for text, annot in TRAIN_DATA:
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
      span = doc.char_span(start, end, label=label)
      if not span:
        msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
        warnings.warn(msg)
      else:
        ents.append(span)
    doc.ents = ents
    doc_bin.add(doc)
  doc_bin.to_disk(DRIVE_PATH + f"/IE/ie_course_2022_team03/output/ml_custom_ner/{output_name}")

convert(TRAIN_DATA, "train.spacy")
print("Saved trained NER component in pipeline")
convert(VALID_DATA, "valid.spacy")
print("Saved trained (validated) NER component in pipeline")

595 [['Berlin (/bɜːrˈlɪn/ bur-LIN, German: [bɛɐ̯ˈliːn] (listen)) is the capital and largest city of Germany by both area and population.', {'entities': [[0, 6, 'ENTITY']]}], ["One of Germany's sixteen constituent states, Berlin is surrounded by the State of Brandenburg and contiguous with Potsdam, Brandenburg's capital.", {'entities': [[45, 51, 'ENTITY']]}], ["Berlin's urban area, which has a population of around 4.5 million, is the second most populous urban area in Germany after the Ruhr.", {'entities': [[0, 6, 'ENTITY']]}], ["The Berlin-Brandenburg capital region has around 6.2 million inhabitants and is Germany's third-largest metropolitan region after the Rhine-Ruhr and Rhine-Main regions.", {'entities': [[4, 10, 'ENTITY']]}], ['Berlin straddles the banks of the Spree, which flows into the Havel (a tributary of the Elbe) in the western borough of Spandau.', {'entities': [[0, 6, 'ENTITY']]}], ['Due to its location in the European Plain, Berlin is influenced by a temperate seasonal 

# Loading and Testing the Model

In [29]:
!python -m spacy init fill-config /content/gdrive/MyDrive/IE/ie_course_2022_team03/output/ml_custom_ner/base_config.cfg /content/gdrive/MyDrive/IE/ie_course_2022_team03/output/ml_custom_ner/config.cfg

2023-01-23 22:31:18.837112: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/gdrive/MyDrive/IE/ie_course_2022_team03/output/ml_custom_ner/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [30]:
!python -m spacy train /content/gdrive/MyDrive/IE/ie_course_2022_team03/output/ml_custom_ner/config.cfg --paths.train /content/gdrive/MyDrive/IE/ie_course_2022_team03/output/ml_custom_ner/train.spacy --paths.dev /content/gdrive/MyDrive/IE/ie_course_2022_team03/output/ml_custom_ner/valid.spacy --output /content/gdrive/MyDrive/IE/ie_course_2022_team03/output/ml_custom_ner/ 

2023-01-23 22:31:32.911974: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;4mℹ Saving to output directory:
/content/gdrive/MyDrive/IE/ie_course_2022_team03/output/ml_custom_ner[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-01-23 22:31:33,924] [INFO] Set up nlp object from config
INFO:spacy:Set up nlp object from config
[2023-01-23 22:31:33,940] [INFO] Pipeline: ['tok2vec', 'ner']
INFO:spacy:Pipeline: ['tok2vec', 'ner']
[2023-01-23 22:31:33,944] [INFO] Created vocabulary
INFO:spacy:Created vocabulary
[2023-01-23 22:31:33,945] [INFO] Finished initializing nlp object
INFO:spacy:Finished initializing nlp object
[2023-01-23 22:31:34,625] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
INFO:spacy:Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2V

In [40]:
trained_nlp = spacy.load("/content/gdrive/MyDrive/IE/ie_course_2022_team03/output/ml_custom_ner/model-best")

text = "Munich and Berlin are two of the most popular cities in Germany"
doc = trained_nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

Munich ENTITY
Berlin ENTITY
