## Env setup

In [1]:
# Mount Google Drive at /content/drive; every data/model path below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# https://spacy.io/usage/
!pip install spacy_transformers
!pip install -U spacy

!pip install pdfminer-six

Collecting spacy_transformers
  Downloading spacy_transformers-1.3.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting transformers<4.50.0,>=3.4.0 (from spacy_transformers)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy_transformers)
  Downloading spacy_alignments-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->spacy_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->spacy_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from t

In [3]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

In [4]:
# Show which GPU is attached to this runtime (training below is launched with --gpu-id 0).
!nvidia-smi

Mon Apr  7 05:57:44 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
# Record the spaCy version that actually got imported.
spacy.__version__

'3.8.5'

## Loading data

In [6]:
# Use a context manager so the file handle is closed as soon as parsing finishes,
# and read as UTF-8 explicitly instead of relying on the runtime's default locale.
with open('/content/drive/MyDrive/resume-parser/data/dataset.json', 'r', encoding='utf-8') as dataset_file:
    data = json.load(dataset_file)

In [7]:
# Number of records loaded from dataset.json.
len(data)

1014

In [8]:
# Inspect the first record to see how a sample is laid out.
# NOTE(review): the echoed record is a real person's resume - clear or redact this
# output before sharing the notebook.
data[0]

['\xa0 \xa0\nContact\nwww.linkedin.com/in/omjagri\n(LinkedIn)\nTop Skills\nphp\nMySQL\nJavaScript\nCertifications\nPhp & Js Om Prakash Jagri\nFull Stack Developer | PHP | Laravel | Vue Js\nKathmandu, Bāgmatī, Nepal\nSummary\nExperienced Developer with a demonstrated history of working in\nthe information technology and services industry. Skilled in Laravel,\nPHP, Cascading Style Sheets (CSS), JavaScript, vue js and MySQL.\nStrong engineering professional with a B.sc.CSIT(Bachelors of\nScience in Computer Science and Information Technology) focused\nin Computer Science from Tribhuvan University, Institute of Science\n& Tchnology. \nExperience\nSearchable Design LLC\nSoftware Developer\nJune 2021\xa0-\xa0Present\xa0 (1 year 7 months)\nNepal\nFull Stack Developer Laravel with Vue Js\nBenekiva\nTechnical Documentation\nSeptember 2021\xa0-\xa0Present\xa0 (1 year 4 months)\nUnited States\nBidhee\n3 years 9 months\nLaravel Developer\nMarch 2018\xa0-\xa0May 2021\xa0 (3 years 3 months)\nBaneswa

In [9]:
# Expand base_config.cfg into a complete training config (config.cfg) with all values filled in.
!python -m spacy init fill-config /content/drive/MyDrive/resume-parser/data/base_config.cfg /content/drive/MyDrive/resume-parser/data/config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/drive/MyDrive/resume-parser/data/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


## Training the model

In [10]:
def get_spacy_doc(data):
    """Convert annotated records into a spaCy ``DocBin`` for NER training.

    Args:
        data: Iterable of ``(text, annotations)`` pairs, where
            ``annotations['entities']`` is an iterable of
            ``(start, end, label)`` character-offset triples.

    Returns:
        A ``DocBin`` holding one tokenized ``Doc`` per record, with the
        accepted entity spans assigned to ``doc.ents``.

    Conversion is best effort: an entity is dropped (never raised) when it
    overlaps character offsets already claimed by an earlier entity of the
    same record, or when its offsets do not line up exactly with token
    boundaries. A single warning at the end reports how much was dropped.
    """
    import warnings  # local import keeps this cell runnable on its own

    nlp = spacy.blank('en')
    db = DocBin()
    dropped_entities = 0
    dropped_docs = 0

    for text, annot in tqdm(data):
        doc = nlp.make_doc(text)

        ents = []
        # Character offsets claimed so far in this record. A set gives O(1)
        # membership tests; the previous list made every check scan all
        # offsets seen so far and re-copied the list for each entity.
        claimed_offsets = set()

        for start, end, label in annot['entities']:
            offsets = range(start, end)

            # First come, first served: a later entity touching any claimed
            # offset is discarded.
            if not claimed_offsets.isdisjoint(offsets):
                dropped_entities += 1
                continue

            # Offsets are claimed before alignment is attempted, so an entity
            # that fails to align still blocks later overlapping ones.
            claimed_offsets.update(offsets)

            try:
                span = doc.char_span(start, end, label=label, alignment_mode="strict")
            except Exception:
                # Stay best effort on malformed annotations, but unlike a bare
                # ``except`` this lets KeyboardInterrupt/SystemExit through.
                span = None

            if span is None:
                dropped_entities += 1
                continue
            ents.append(span)

        try:
            doc.ents = ents
            db.add(doc)
        except Exception:
            # Skip documents spaCy refuses, but count them instead of hiding them.
            dropped_docs += 1

    if dropped_entities or dropped_docs:
        warnings.warn(
            f"get_spacy_doc: dropped {dropped_entities} entity span(s) and "
            f"{dropped_docs} document(s) that could not be converted",
            stacklevel=2,
        )

    return db


In [11]:
from sklearn.model_selection import train_test_split

# Fixed seed: without it every run produces a different split, so the saved
# .spacy files and any reported scores cannot be reproduced.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [12]:
# Sizes of the two splits.
len(train), len(test)

(811, 203)

In [13]:
# Convert the training split and save it to the path later passed as --paths.train.
train_db = get_spacy_doc(train)
train_db.to_disk('/content/drive/MyDrive/resume-parser/data/train.spacy')

# Same for the held-out split; this file is later passed as --paths.dev.
test_db = get_spacy_doc(test)
test_db.to_disk('/content/drive/MyDrive/resume-parser/data/test.spacy')


100%|██████████| 811/811 [00:07<00:00, 114.81it/s]
100%|██████████| 203/203 [00:01<00:00, 123.93it/s]


In [14]:
# Echo the raw token arrays stored in each DocBin.
# NOTE(review): this prints very large arrays; a size check such as
# `len(train_db), len(test_db)` would presumably be more useful - confirm.
train_db.tokens, test_db.tokens

([array([[  682504032995010192,   682504032995010192,                    0,
          ...,                    0,                    0,
                             0],
         [ 7872030601856903690,   786451873915755474,                    0,
          ...,                    0,                    0,
                             0],
         [  962983613142996970,   962983613142996970,                    0,
          ...,                    0,                    0,
                             0],
         ...,
         [15180167692696242062, 15180167692696242062,                    0,
          ...,                    0,                    0,
                             0],
         [  886050111519832510,   886050111519832510,                    0,
          ...,                    0,                    0,
                             0],
         [15180167692696242062, 15180167692696242062,                    0,
          ...,                    0,                    0,
           

In [15]:
# Train from config.cfg on GPU 0 and write the results to the --output directory.
# NOTE(review): --paths.dev points at test.spacy, so the held-out split doubles as the
# dev set during training; scores measured on it are presumably optimistic - confirm
# whether a separate validation split is wanted.
!python -m spacy \
    train /content/drive/MyDrive/resume-parser/data/config.cfg \
    --output /content/drive/MyDrive/resume-parser/output \
    --paths.train /content/drive/MyDrive/resume-parser/data/train.spacy \
    --paths.dev /content/drive/MyDrive/resume-parser/data/test.spacy \
    --gpu-id 0

[38;5;2m✔ Created output directory:
content/drive/MyDrive/resume-parser/output[0m
[38;5;4mℹ Saving to output directory:
content/drive/MyDrive/resume-parser/output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
tokenizer_config.json: 100% 25.0/25.0 [00:00<00:00, 188kB/s]
config.json: 100% 481/481 [00:00<00:00, 4.80MB/s]
vocab.json: 100% 899k/899k [00:00<00:00, 13.1MB/s]
merges.txt: 100% 456k/456k [00:00<00:00, 14.3MB/s]
tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 2.10MB/s]
2025-04-07 06:00:48.394184: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744005648.644280    2133 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744005648.711638    2133 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS w

## Testing the trained model


In [35]:
# `model-best` directory inside the --output folder used by the training command.
model_path = "/content/drive/MyDrive/resume-parser/output/model-best"

In [36]:
import spacy
# Load the trained pipeline from disk; `nlp` is what the cells below use to parse the resume.
nlp = spacy.load(model_path)

In [37]:
# Sample inputs for the end-to-end check: one resume PDF and one plain-text job description.
resume_path = "/content/drive/MyDrive/resume-parser/test/resume/resume1.pdf"
jd_path = '/content/drive/MyDrive/resume-parser/test/job-description/jd1.txt'

In [38]:
import re
from pdfminer.high_level import extract_text

# Pull the plain text out of the resume PDF with pdfminer.
# NOTE(review): the echoed text includes a phone number and email address - clear this
# output before sharing the notebook.
text = extract_text(resume_path)
text



'Karnam Shyam\n\nStudent, Aspiring Full-Stack Developer Powered by AI/ML Insights\n9346872174 | karnam.shyam2004@gmail.com | @linkedin | @github | @leetcode | @hackerrank\n\nSummary\n\nExperienced and results-driven professional with a strong background in full-stack Java development and a\npassion for leveraging Python for AI, ML, and deep learning applications. Proven track record demonstrated\nthrough research papers showcasing expertise in these areas. I am constantly seeking opportunities to expand my\nknowledge and skills through internships, workshops, and personal projects. I possess strong logical thinking and\nproblem-solving abilities. I believe in continuous learning and strive to stay updated with the latest industry\ntrends.\n\nTechnical Skills\n\nLanguages: Python, Java, C, HTML, CSS, JavaScript, SQL(MySQL, SQLite, Oracle), MongoDB\nFrameworks: Flask, SpringBoot, Angular16, Bootstrap, Hibernate\nDeveloper Tools: Git, Firebase, Maven\nLibraries: Tensorflow, sklearn, numpy

In [39]:
# Run the trained pipeline on the resume and list each entity with its label.
doc = nlp(text)
for ent in doc.ents:
    print(f"{ent.text}  ->>>  {ent.label_}")

Karnam Shyam  ->>>  NAME
Python  ->>>  SKILLS
Java  ->>>  SKILLS
HTML  ->>>  SKILLS
CSS  ->>>  SKILLS
MongoDB  ->>>  SKILLS
Flask  ->>>  SKILLS
SpringBoot  ->>>  SKILLS
Angular16  ->>>  SKILLS
Bootstrap  ->>>  SKILLS
Bird Species Identification  ->>>  WORKED AS
Flask  ->>>  SKILLS
VIT-AP University  ->>>  UNIVERSITY
Vizag, AP  ->>>  WORKED AS
Vizag, AP  ->>>  WORKED AS


In [40]:
# Group entity texts by label, keeping document order and duplicates.
resp = {}
for ent in doc.ents:
    # setdefault creates the list the first time a label is seen.
    resp.setdefault(ent.label_, []).append(ent.text)

resp

{'NAME': ['Karnam Shyam'],
 'SKILLS': ['Python',
  'Java',
  'HTML',
  'CSS',
  'MongoDB',
  'Flask',
  'SpringBoot',
  'Angular16',
  'Bootstrap',
  'Flask'],
 'WORKED AS': ['Bird Species Identification', 'Vizag, AP', 'Vizag, AP'],
 'UNIVERSITY': ['VIT-AP University']}

In [41]:
import re

def extract_contact_number_from_resume(text):
    """Return the first phone-number-like substring in ``text``, or ``None``.

    Recognized shape: an optional country code (optional ``+`` and 1-3
    digits), followed by a 3-3-4 digit number whose groups may be separated
    by ``-``, ``.`` or whitespace. The first group may be wrapped in
    parentheses.

    Args:
        text: Resume text to scan. ``None`` or an empty string yields ``None``.

    Returns:
        The matched substring exactly as it appears in ``text``, or ``None``
        when nothing matches.
    """
    if not text:
        return None

    # Two fixes over a plain leading \b:
    #  * A word boundary cannot occur between whitespace/start-of-text and "+"
    #    or "(", so "(123) 456-7890" came back as "123) 456-7890" and "+1 ..."
    #    lost its "+". (?:\b|(?<!\w)) accepts every position \b accepted and
    #    additionally lets a match start on "+" or "(" after a non-word char.
    #  * An opening parenthesis is only consumed when it is closed again, so
    #    "(9346872174)" still yields "9346872174" rather than "(9346872174".
    pattern = (
        r"(?:\b|(?<!\w))"
        r"(?:\+?\d{1,3}[-.\s]?)?"
        r"(?:\(\d{3}\)|\d{3}\)?)"
        r"[-.\s]?\d{3}[-.\s]?\d{4}\b"
    )
    match = re.search(pattern, text)
    return match.group() if match else None

# Try the phone extractor on the resume text extracted earlier.
phone = extract_contact_number_from_resume(text)
phone

'9346872174'

In [42]:
import re

def extract_email_from_resume(text):
    """Find the first email-address-like token in ``text``.

    Returns the matched substring, or ``None`` when ``text`` contains no match.
    """
    # Local part, "@", domain, then a final dot-separated part of 2+ letters.
    email_regex = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    found = re.search(email_regex, text)
    if found is None:
        return None
    return found.group()

# Try the email extractor on the same resume text.
email = extract_email_from_resume(text)
email

'karnam.shyam2004@gmail.com'

In [43]:
# Read the job description; the context manager closes the handle right away
# instead of leaving it open until garbage collection.
with open(jd_path, 'r', encoding='utf-8') as jd_file:
    jd = jd_file.read()
jd

"Python LLM Data Scientist\n\nA US-based AI client looking for a Python LLM Data Scientist/Analyst.\n\nThey're looking for someone to take on the responsibility of guiding peer code reviews, nurturing an atmosphere where constructive feedback is exchanged to refine our code and foster professional growth.\n\nYour Role:\n\nBuild Python codes for big challenges that can grow with us.\nDig into free data out there and find cool insights.\nWork side-by-side with our researchers to hit our goals.\nUse data to crack tough work-related questions.\n\nYou Should Have:\n\nA degree in stuff like Engineering or Computer Science, or you're just really good at this.\nMust have worked in data science or analysis for 1 year.\nBeen coding with Python for as a professional.\nSolid skills in data science and analysis.\nReally good English, talking and writing.\n\nNice to have:\n\nIf you know SQL, that's a plus.\nSuper good at talking and teaming up with people.\nThe ability to think through data and make

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [45]:
def tfidf_matcher(job_description, resumes):
    """Score resumes against a job description with TF-IDF cosine similarity.

    Args:
        job_description: Job description text.
        resumes: Either one resume as a string, or a list/tuple of resume
            strings to score in a single call.

    Returns:
        1-D array with one cosine similarity per resume, in input order.
        The array is also printed before it is returned.

    Raises:
        ValueError: If ``resumes`` is an empty collection.

    Note:
        The TF-IDF weights are fitted on the job description together with
        every resume passed in, so a resume's score depends on the batch it
        is scored with.
    """
    # A bare string is a single resume. Wrapping it keeps
    # ``tfidf_matcher(jd, text)`` working, while a list is now spread into the
    # corpus instead of being nested inside it and rejected by the vectorizer.
    resume_texts = [resumes] if isinstance(resumes, str) else list(resumes)
    if not resume_texts:
        raise ValueError("resumes must contain at least one document")

    # Row 0 is the job description; rows 1.. are the resumes.
    tfidf = TfidfVectorizer().fit_transform([job_description] + resume_texts)

    # cosine_similarity accepts sparse input, so no dense copy is needed.
    similarities = cosine_similarity(tfidf[:1], tfidf[1:])[0]
    print(similarities)

    return similarities

In [46]:
def count_matcher(job_description, resumes):
    """Score resumes against a job description using raw term-count vectors.

    Args:
        job_description: Job description text.
        resumes: Either one resume as a string, or a list/tuple of resume
            strings to score in a single call.

    Returns:
        1-D array with one cosine similarity per resume, in input order.
        The array is also printed before it is returned.

    Raises:
        ValueError: If ``resumes`` is an empty collection.
    """
    # A bare string is a single resume. Wrapping it keeps
    # ``count_matcher(jd, text)`` working, while a list is now spread into the
    # corpus instead of being nested inside it and rejected by the vectorizer.
    resume_texts = [resumes] if isinstance(resumes, str) else list(resumes)
    if not resume_texts:
        raise ValueError("resumes must contain at least one document")

    # Row 0 is the job description; rows 1.. are the resumes.
    counts = CountVectorizer().fit_transform([job_description] + resume_texts)

    # cosine_similarity accepts sparse input, so no dense copy is needed.
    similarities = cosine_similarity(counts[:1], counts[1:])[0]
    print(similarities)

    return similarities

In [47]:
# TF-IDF similarity between the job description and the sample resume.
tfidf_matcher(jd, text)

[0.21611968]


array([0.21611968])

In [48]:
# Raw term-count similarity for the same pair, for comparison with the TF-IDF score.
count_matcher(jd, text)

[0.33672777]


array([0.33672777])