In [None]:
# Install spaCy into the notebook runtime (only needed the first time in a fresh Colab runtime).
# No version is pinned here, so the installed release depends on when this cell is run.
!pip install spacy

# Download spaCy's small English pipeline; it is loaded later with spacy.load("en_core_web_sm").
# The downloader may ask for a kernel/runtime restart before the package can be imported.
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m95.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Imports
import spacy
import nltk
import json
import glob, os

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Setup
# NLTK resources used by the helpers in this notebook:
#   - "punkt" / "punkt_tab": tokenizer data behind word_tokenize
#   - "stopwords": source of stop_words
#   - "wordnet": dictionary behind WordNetLemmatizer
# "punkt_tab" is fetched here, next to the other resources, so that
# preprocess_text works as soon as it is defined instead of depending on a
# download performed in a later cell. nltk.download skips data that is
# already up to date, so repeating a download is harmless.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

# Shared, module-level NLP objects reused by the helper functions.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
def preprocess_text(text):
    """Return the lemmas of the alphabetic, non-stopword tokens in ``text``.

    The text is lowercased and split with ``word_tokenize``. A token is kept
    only if it consists solely of letters and is not in ``stop_words``; each
    kept token is passed through ``lemmatizer.lemmatize``. Token order and
    duplicates are preserved.
    """
    lemmas = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation, numbers, mixed tokens such as "node.js", and stopwords.
        if not token.isalpha() or token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas


In [None]:
def extract_entities(text):
    """Extract the distinct named entities of interest from ``text`` using spaCy.

    The text is run through the module-level ``nlp`` pipeline and only entities
    labelled ORG, PRODUCT, SKILL or GPE are kept.

    Args:
        text: Raw text to analyse.

    Returns:
        list[str]: Unique entity strings (case-sensitive), in order of first
        appearance in the text.
    """
    # NOTE(review): SKILL does not look like a label the stock en_core_web_sm
    # NER emits, so it presumably only matters with a custom pipeline - confirm.
    wanted_labels = {"ORG", "PRODUCT", "SKILL", "GPE"}
    doc = nlp(text)
    # dict.fromkeys removes duplicates while keeping first-seen order; the
    # ordering of list(set(...)) can differ from one interpreter run to the
    # next because string hashing is randomized.
    return list(dict.fromkeys(ent.text for ent in doc.ents if ent.label_ in wanted_labels))


In [None]:
def match_resume_jd(resume_text, jd_text, resume_name="resume.txt", jd_name="job.txt",
                    tfidf_weight=0.7, keyword_weight=0.2, entity_weight=0.1):
    """Score how well a resume matches a job description.

    Three signals are combined:
      * TF-IDF cosine similarity of the two raw texts (vectorizer fitted on
        just this pair),
      * the number of preprocessed tokens the texts share,
      * the number of named entities the texts share (exact, case-sensitive).

    NOTE(review): the two overlap terms are raw counts, not ratios, so
    ``final_score`` is not bounded to [0, 1] and grows with the number of
    shared terms. That behaviour is kept as-is here.

    Args:
        resume_text: Full text of the resume.
        jd_text: Full text of the job description.
        resume_name: Label stored under ``"resume"`` in the result.
        jd_name: Label stored under ``"job"`` in the result.
        tfidf_weight: Multiplier applied to the TF-IDF similarity.
        keyword_weight: Multiplier applied to the shared-keyword count.
        entity_weight: Multiplier applied to the shared-entity count.

    Returns:
        dict: ``resume``, ``job``, ``tfidf_score`` (rounded to 3 places),
        ``keyword_overlap`` and ``entity_overlap`` (sorted lists), and
        ``final_score`` (rounded to 3 places).
    """
    # Keyword overlap on the preprocessed (lowercased, lemmatized) tokens.
    overlap = set(preprocess_text(resume_text)) & set(preprocess_text(jd_text))

    # TF-IDF similarity: fit and vectorize both documents in a single pass.
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([resume_text, jd_text])
    except ValueError:
        # The vectorizer raises when neither text yields a vocabulary term
        # (e.g. both are empty); treat that as "no similarity".
        tfidf_score = 0.0
    else:
        # Row slices keep the operands 2-D, as cosine_similarity expects.
        # float() yields a plain Python number for the result dict.
        tfidf_score = float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])

    # Named entities present in both texts.
    entity_overlap = set(extract_entities(resume_text)) & set(extract_entities(jd_text))

    # Weighted combination of the three signals.
    final_score = (
        tfidf_weight * tfidf_score
        + keyword_weight * len(overlap)
        + entity_weight * len(entity_overlap)
    )

    return {
        "resume": resume_name,
        "job": jd_name,
        "tfidf_score": round(tfidf_score, 3),
        # Sorted so the reported lists are identical from run to run.
        "keyword_overlap": sorted(overlap),
        "entity_overlap": sorted(entity_overlap),
        "final_score": round(final_score, 3),
    }


In [None]:
# Sample resume texts used as demo input. The matching cells label them
# resume_1.txt, resume_2.txt, ... by their position in this list.
resumes = [
    "I am a software engineer skilled in Python, JavaScript, and data analysis. I have experience with machine learning, AWS cloud, and Docker.",
    "Experienced front-end developer with expertise in React, HTML, CSS, and modern UI design. Worked on multiple web applications with Figma.",
    "Data scientist with knowledge in SQL, Python, TensorFlow, and natural language processing. Strong background in statistics and cloud deployment.",
    "Backend engineer experienced in Node.js, Express, and MongoDB. Worked with REST APIs and scalable systems.",
    "AI researcher with experience in computer vision, PyTorch, NLP, and GANs. Published research papers in IEEE conferences."
]

# Sample job descriptions, labelled job_1.txt, job_2.txt, ... by position.
# They appear to be written so that entry k pairs with resumes[k] - an
# assumption based on the wording, not something the code relies on.
job_descriptions = [
    "We are looking for a Python developer with experience in machine learning, cloud technologies like AWS, and Docker.",
    "Hiring a front-end engineer skilled in React, CSS, and Figma to build interactive web applications.",
    "Seeking a data scientist with SQL, NLP, TensorFlow, and strong statistics background.",
    "Backend developer required with Node.js, Express, and MongoDB expertise.",
    "AI research position requiring knowledge of PyTorch, computer vision, and GANs."
]


In [None]:
# Fetch the punkt_tab tokenizer data before any text is tokenized
# (presumably required by word_tokenize in the installed NLTK release).
import nltk
nltk.download('punkt_tab')

# Score every resume against every job description, resume-major order.
results = []
for i, resume in enumerate(resumes):
    results.extend(
        match_resume_jd(resume, jd, resume_name=f"resume_{i+1}.txt", jd_name=f"job_{j+1}.txt")
        for j, jd in enumerate(job_descriptions)
    )

print(json.dumps(results, indent=2))


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


[
  {
    "resume": "resume_1.txt",
    "job": "job_1.txt",
    "tfidf_score": 0.42,
    "keyword_overlap": [
      "python",
      "cloud",
      "machine",
      "experience",
      "learning",
      "aws",
      "docker"
    ],
    "entity_overlap": [
      "AWS"
    ],
    "final_score": 1.794
  },
  {
    "resume": "resume_1.txt",
    "job": "job_2.txt",
    "tfidf_score": 0.167,
    "keyword_overlap": [
      "engineer",
      "skilled"
    ],
    "entity_overlap": [],
    "final_score": 0.517
  },
  {
    "resume": "resume_1.txt",
    "job": "job_3.txt",
    "tfidf_score": 0.155,
    "keyword_overlap": [
      "data"
    ],
    "entity_overlap": [],
    "final_score": 0.308
  },
  {
    "resume": "resume_1.txt",
    "job": "job_4.txt",
    "tfidf_score": 0.118,
    "keyword_overlap": [],
    "entity_overlap": [],
    "final_score": 0.082
  },
  {
    "resume": "resume_1.txt",
    "job": "job_5.txt",
    "tfidf_score": 0.072,
    "keyword_overlap": [],
    "entity_overlap": [],
 

In [None]:
# For every job description, score all resumes and keep the top scorer.
# max() returns the first maximal element, so the earliest resume wins ties;
# default=None covers an empty resume list.
best_matches = []
for j, jd in enumerate(job_descriptions):
    candidates = [
        match_resume_jd(resume, jd, resume_name=f"resume_{i+1}.txt", jd_name=f"job_{j+1}.txt")
        for i, resume in enumerate(resumes)
    ]
    best_matches.append(max(candidates, key=lambda r: r["final_score"], default=None))

print("\nBest Resume for Each JD:\n")
print(json.dumps(best_matches, indent=2))



Best Resume for Each JD:

[
  {
    "resume": "resume_1.txt",
    "job": "job_1.txt",
    "tfidf_score": 0.42,
    "keyword_overlap": [
      "python",
      "cloud",
      "machine",
      "experience",
      "learning",
      "aws",
      "docker"
    ],
    "entity_overlap": [
      "AWS"
    ],
    "final_score": 1.794
  },
  {
    "resume": "resume_2.txt",
    "job": "job_2.txt",
    "tfidf_score": 0.326,
    "keyword_overlap": [
      "application",
      "react",
      "figma",
      "cs",
      "web"
    ],
    "entity_overlap": [
      "Figma",
      "React",
      "CSS"
    ],
    "final_score": 1.528
  },
  {
    "resume": "resume_3.txt",
    "job": "job_3.txt",
    "tfidf_score": 0.478,
    "keyword_overlap": [
      "tensorflow",
      "statistic",
      "strong",
      "background",
      "data",
      "scientist",
      "sql"
    ],
    "entity_overlap": [
      "TensorFlow",
      "SQL"
    ],
    "final_score": 1.935
  },
  {
    "resume": "resume_4.txt",
    "job": "