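### Installing spaCy and spacy-transformers
spaCy is a free, open-source library for advanced Natural Language Processing (NLP) in Python. The `spacy-transformers` package adds transformer-based pipeline components to it.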


In [1]:
! pip install spacy_transformers
! pip install -U spacy

Collecting spacy_transformers
  Downloading spacy_transformers-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting transformers<4.37.0,>=3.4.0 (from spacy_transformers)
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy_transformers)
  Downloading spacy_alignments-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers<4.37.0,>=3.4.0->spacy_transformers)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading spacy_transformers-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (197 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.8/197.8 kB[0m [31m11.6 MB/s[0m eta [36m0:00

### Importing Libraries

In [2]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

In [3]:
# Show the installed spaCy version.
spacy.__version__

'3.8.3'

In [4]:
# Ask the NVIDIA driver which GPUs are visible.
# NOTE(review): "command not found" presumably means no GPU runtime is attached — confirm before training with --gpu-id 0.
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [5]:

# Load the annotated resume dataset. The context manager closes the file handle as
# soon as parsing finishes, and the explicit encoding keeps non-ASCII resume text
# independent of the platform's default locale.
with open('/content/drive/MyDrive/ResumeParser/dataset/dataset.json', 'r', encoding='utf-8') as dataset_file:
    cv_data = json.load(dataset_file)

In [6]:
# Number of top-level records in the loaded dataset.
len(cv_data)

1014

In [7]:
# Inspect the first record of the dataset.
# NOTE(review): the rendered output contains personal details from a resume — consider clearing it before sharing the notebook.
cv_data[0]

['\xa0 \xa0\nContact\nwww.linkedin.com/in/omjagri\n(LinkedIn)\nTop Skills\nphp\nMySQL\nJavaScript\nCertifications\nPhp & Js Om Prakash Jagri\nFull Stack Developer | PHP | Laravel | Vue Js\nKathmandu, Bāgmatī, Nepal\nSummary\nExperienced Developer with a demonstrated history of working in\nthe information technology and services industry. Skilled in Laravel,\nPHP, Cascading Style Sheets (CSS), JavaScript, vue js and MySQL.\nStrong engineering professional with a B.sc.CSIT(Bachelors of\nScience in Computer Science and Information Technology) focused\nin Computer Science from Tribhuvan University, Institute of Science\n& Tchnology. \nExperience\nSearchable Design LLC\nSoftware Developer\nJune 2021\xa0-\xa0Present\xa0 (1 year 7 months)\nNepal\nFull Stack Developer Laravel with Vue Js\nBenekiva\nTechnical Documentation\nSeptember 2021\xa0-\xa0Present\xa0 (1 year 4 months)\nUnited States\nBidhee\n3 years 9 months\nLaravel Developer\nMarch 2018\xa0-\xa0May 2021\xa0 (3 years 3 months)\nBaneswa

In [8]:
# Expand base_config.cfg into a complete training config (config.cfg) with all remaining values auto-filled.
!python -m spacy init fill-config /content/drive/MyDrive/ResumeParser/config/base_config.cfg /content/drive/MyDrive/ResumeParser/config/config.cfg

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/drive/MyDrive/ResumeParser/config/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


## Data Preprocessing

The function get_spacy_doc takes a file and data (assumed to be labeled text data) as input and processes the data into a format compatible with spaCy's DocBin for training. It uses a blank English spaCy model to create Doc objects for the input text and assigns annotated entities to these documents. If entity creation fails, errors are logged in the provided file. Finally, it returns the populated DocBin.

In [13]:
def get_spacy_doc(file, data):
    """
    Convert annotated examples into spaCy Doc objects collected in a DocBin.

    Each text is tokenized with a blank English pipeline. Annotations are
    visited in the order given. An annotation is dropped when it shares a
    character with an entity already accepted for the same text, and it is
    written to ``file`` when it cannot be turned into a span under strict
    token alignment. Only accepted entities reserve their character range,
    so a rejected annotation never blocks a later valid one.

    Args:
        file (file object): Writable text file that receives one line for
            every annotation or document that could not be used.
        data (iterable of (str, dict)): Pairs of text and an annotation dict
            whose 'entities' value holds (start, end, label) character offsets.

    Returns:
        DocBin: A SpaCy DocBin containing every document whose entities
        could be assigned.
    """
    nlp = spacy.blank('en')  # Tokenizer-only English pipeline
    db = DocBin()

    for text, annot in tqdm(data):
        doc = nlp.make_doc(text)

        ents = []  # Accepted entity spans for this document
        covered = set()  # Character offsets owned by accepted entities

        for start, end, label in annot['entities']:
            # Skip annotations that overlap an entity we already accepted
            if not covered.isdisjoint(range(start, end)):
                continue

            try:
                # Strict mode only yields a span when both offsets sit on token boundaries
                span = doc.char_span(start, end, label=label, alignment_mode='strict')
            except Exception as e:
                # Keep the run going, but leave a trace of what was skipped and why
                file.write(f"{[start, end]}   {type(e).__name__}: {e}\n")
                continue

            if span is None:
                # Misaligned annotation: log it and do NOT reserve its range
                err_data = f"{[start, end]}   {text}\n"
                file.write(err_data)
                continue

            covered.update(range(start, end))
            ents.append(span)

        try:
            doc.ents = ents  # Assign entities to the Doc object
            db.add(doc)  # Add the Doc to DocBin
        except Exception as e:
            # The document is left out of the DocBin; record that instead of hiding it
            file.write(f"document skipped   {type(e).__name__}: {e}\n")

    return db


## Data Splitting

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the records for evaluation. The fixed random_state makes the
# shuffle, and therefore both splits, identical on every run of the notebook.
train, test = train_test_split(cv_data, test_size=0.3, random_state=42)


In [15]:
# Sizes of the training split and the held-out split.
len(train), len(test)

(709, 305)

In [16]:

# Convert both splits to spaCy's binary DocBin format. The error log is opened in a
# `with` block so it is flushed and closed even if one of the conversions raises;
# UTF-8 is explicit because the logged resume text contains non-ASCII characters.
with open('/content/drive/MyDrive/ResumeParser/model/train_file_error.txt', 'w', encoding='utf-8') as file:
    # Training split -> train_data.spacy
    db = get_spacy_doc(file, train)
    db.to_disk('/content/drive/MyDrive/ResumeParser/model/train_data.spacy')

    # Held-out split -> test_data.spacy
    db = get_spacy_doc(file, test)
    db.to_disk('/content/drive/MyDrive/ResumeParser/model/test_data.spacy')








100%|██████████| 709/709 [00:08<00:00, 80.16it/s]
100%|██████████| 305/305 [00:03<00:00, 98.81it/s]


### Train the model

In [17]:
!python -m spacy train /content/drive/MyDrive/ResumeParser/config/config.cfg --output ./content/drive/MyDrive/ResumeParser/model/output --paths.train ./content/drive/MyDrive/ResumeParser/model/train_data.spacy --paths.dev ./content/drive/MyDrive/ResumeParser/model/test_data.spacy --gpu-id 0

[38;5;2m✔ Created output directory:
content/drive/MyDrive/ResumeParser/model/output[0m
[38;5;4mℹ Saving to output directory:
content/drive/MyDrive/ResumeParser/model/output[0m
[38;5;4mℹ Using GPU: 0[0m
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/spacy/__main__.py", line 4, in <module>
    setup_cli()
  File "/usr/local/lib/python3.10/dist-packages/spacy/cli/_util.py", line 87, in setup_cli
    command(prog_name=COMMAND)
  File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/typer/core.py", line 743, in main
    return _main(
  File "/usr/local/lib/python3.10/dist-packages/typer/core.py", line 198, in _main
    rv = self.i

### Model Test

In [None]:
# Load the best checkpoint from the project's training output directory on Drive
# (`spacy train` writes `model-best` and `model-last` inside its output directory).
nlp = spacy.load('/content/drive/MyDrive/ResumeParser/model/output/model-best')


In [None]:
# Smoke-test the loaded pipeline on a short hand-written sentence and list
# each recognised entity next to its label.
doc = nlp('my name is Ram. I worked as Microsoft. I have 2 years of experience in Machine Learning ')
for entity in doc.ents:
    print(entity.text, "     ", entity.label_)

In [None]:
# PyMuPDF is the package that provides the `fitz` module used below to read PDFs.
! pip install pyMuPDF

In [None]:
import sys, fitz

In [None]:
# Open the PDF whose text will be fed to the model; the `import sys, fitz` cell must run first.
# NOTE(review): 'kdbvkjdb.pdf' looks like a placeholder — point this at a real resume file.
# NOTE(review): `doc` is rebound to a spaCy Doc in a later cell, so re-run this cell before re-reading pages.
fname = 'kdbvkjdb.pdf'
doc = fitz.open(fname)

NameError: name 'fitz' is not defined

In [None]:
# Gather the text of every page in one pass, starting from a single space.
text = " " + "".join(str(page.get_text()) for page in doc)

In [None]:
# Remove leading and trailing whitespace.
text = text.strip()

In [None]:
# Collapse every run of whitespace (including newlines) into a single space.
text = ' '.join(text.split())

In [None]:
# Display the cleaned text that will be passed to the model.
text

In [None]:
# Run the pipeline on the extracted PDF text and list each recognised
# entity next to its label.
doc = nlp(text)
for entity in doc.ents:
    print(entity.text, "     ", entity.label_)