In [None]:
# Install spaCy and upload your data files
!pip install -U spacy

# If you need a blank model template (en), or use multilingual:
!python -m spacy download en_core_web_sm

Collecting spacy
  Downloading spacy-3.8.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Downloading spacy-3.8.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.8.5
    Uninstalling spacy-3.8.5:
      Successfully uninstalled spacy-3.8.5
Successfully installed spacy-3.8.6


In [6]:
from google.colab import files
uploaded = files.upload()    # select resumes.jsonl

Saving ner-sample.jsonl to ner-sample.jsonl


In [2]:
from google.colab import files
uploaded = files.upload()    # select labels.jsonl

Saving labels.json to labels.json


In [8]:
import json, spacy
from spacy.tokens import DocBin
from pathlib import Path

nlp = spacy.blank("en")       # create a blank English pipeline
db = DocBin()                 # container for serialized docs

# load label list
# Note: While you load labels here, they are not used as indices in the char_span call below
labels = [obj["text"] for obj in json.loads(Path("labels.json").read_text())]

# register those labels with the NER pipe
ner = nlp.add_pipe("ner")
for label_text in labels: # Changed variable name to avoid confusion with the 'label' from the JSONL
    ner.add_label(label_text)

# read your JSONL
with open("ner-sample.jsonl","r",encoding="utf8") as f:
    for line in f:
        entry = json.loads(line)
        doc = nlp.make_doc(entry["text"])
        ents = []
        for start, end, label_text_from_json in entry["label"]: # Use a different variable name for clarity
            # Use the label string directly from the JSON data
            span = doc.char_span(start, end, label=label_text_from_json, alignment_mode="contract")
            if span is not None:
                ents.append(span)
        doc.ents = ents
        db.add(doc)

db.to_disk("./train.spacy")

In [9]:
!python -m spacy init config config.cfg \
    --lang en \
    --pipeline ner \
    --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [10]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy --gpu-id 0

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    351.64    0.00    0.00    0.00    0.00
 66     200        856.13   7182.56  100.00  100.00  100.00    1.00
133     400          4.48      1.89  100.00  100.00  100.00    1.00
200     600          0.00      0.00  100.00  100.00  100.00    1.00


In [11]:
import spacy
nlp2 = spacy.load("./output/model-best")
text = "Alice graduated with a BE Electronics from Pune University in August 2000."
doc = nlp2(text)
for ent in doc.ents:
    print(ent.text, ent.label_)


BE Electronics from DEGREE


In [12]:
# 1. Install the HF Hub client
!pip install huggingface_hub



In [13]:
# 2. Log in to HF (you’ll be prompted to paste your token)
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
# 3. Create the repo on the Hub (requires that you’re logged in)
from huggingface_hub import HfApi

api = HfApi()
# this will create https://huggingface.co/<your-username>/resume-ner
api.create_repo(repo_id="resume-ner", private=False, repo_type="model")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


RepoUrl('https://huggingface.co/ISFarzi/resume-ner', endpoint='https://huggingface.co', repo_type='model', repo_id='ISFarzi/resume-ner')

In [15]:
# 4. Push your spaCy model directory into that repo
from huggingface_hub import Repository
import shutil
import os

# Path to your best spaCy model
local_model_dir = "output/model-best"

# Clone your new HF repo locally (it will create a folder "resume-ner")
repo = Repository(local_dir="resume-ner", clone_from=f"{api.whoami()['name']}/resume-ner")

# Copy over all model files
shutil.copytree(local_model_dir, os.path.join("resume-ner", "model-best"), dirs_exist_ok=True)

# Commit & push
repo.push_to_hub(commit_message="Upload spaCy resume-NER model")


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/ISFarzi/resume-ner into local empty directory.
Adding files tracked by Git LFS: ['model-best/ner/model', 'model-best/ner/moves', 'model-best/tok2vec/model', 'model-best/tokenizer', 'model-best/vocab/vectors']. This may take a bit of time if the files are large.


Upload file model-best/tok2vec/model:   1%|          | 32.0k/5.73M [00:00<?, ?B/s]

Upload file model-best/vocab/vectors: 100%|##########| 128/128 [00:00<?, ?B/s]

Upload file model-best/tokenizer:  43%|####2     | 32.0k/75.3k [00:00<?, ?B/s]

Upload file model-best/ner/moves: 100%|##########| 344/344 [00:00<?, ?B/s]

Upload file model-best/ner/model:  25%|##4       | 32.0k/129k [00:00<?, ?B/s]

Upload file model-best/vocab/lookups.bin: 100%|##########| 1.00/1.00 [00:00<?, ?B/s]

To https://huggingface.co/ISFarzi/resume-ner
   8fb159a..4aaaebc  main -> main

   8fb159a..4aaaebc  main -> main



'https://huggingface.co/ISFarzi/resume-ner/commit/4aaaebc19f939dfe414654b13115e7c51cab1837'