In [11]:
!pip install nltk==3.8.1


Collecting nltk==3.8.1
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.5 MB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/1.5 MB[0m [31m8.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.5/1.5 MB[0m [31m17.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.9.1
    Uninstalling nltk-3.9.1:
      Successfully uninstalled nltk-3.9.1
[31mERROR: pip's dependency resolver does not currently tak

In [1]:
# NLTK fallback
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# CELL 2 — imports and helper functions
import re
import json
from collections import Counter

# NLP libraries
import spacy
from spacy.matcher import PhraseMatcher

# sklearn for TF-IDF + similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Load spaCy model (we'll try/catch)
try:
    nlp = spacy.load("en_core_web_sm")
    SPACY_OK = True
except Exception as e:
    print("spaCy model not available.", e)
    nlp = None
    SPACY_OK = False

In [3]:
EN_STOPWORDS = set(stopwords.words('english'))
N_LEMMATIZER = WordNetLemmatizer()

In [4]:
# CELL 3 — cleaning + lemmatization (spaCy primary, NLTK fallback)

def clean_text(text: str) -> str:
    """Light cleaning: remove emails/urls, keep alphanumerics, allow + and # (for C++ / C#)."""
    if not isinstance(text, str):
        return ""
    text = re.sub(r'\S+@\S+', ' ', text)          # remove emails
    text = re.sub(r'http\S+', ' ', text)          # remove urls
    text = re.sub(r'[\r\n]+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9\+\#\.\- ]+', ' ', text)  # allow +,#,.,-
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def lemmatize_tokens(text: str, use_spacy: bool = True) -> list:
    """Return lowercased, lemmatized tokens with stopwords removed."""
    text = clean_text(text)
    if use_spacy and SPACY_OK:
        doc = nlp(text)
        tokens = [token.lemma_.lower() for token in doc
                  if token.is_alpha and not token.is_stop]
        return tokens
    # Fallback: NLTK
    tokens = word_tokenize(text)
    tokens = [t.lower() for t in tokens if t.isalpha() and t.lower() not in EN_STOPWORDS]
    lem = [N_LEMMATIZER.lemmatize(t) for t in tokens]
    return lem


In [5]:
# CELL 4 — extract named entities and noun-chunks (useful to detect company names, technologies, etc.)

def extract_entities(text: str):
    """Return spaCy entities as list of (text, label). If spaCy not available, returns empty list."""
    if not SPACY_OK:
        return []
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

def extract_noun_phrases(text: str):
    """Return noun-chunks / proper nouns (good for multi-word skills like 'machine learning')."""
    if SPACY_OK:
        doc = nlp(text)
        chunks = [chunk.text.lower().strip() for chunk in doc.noun_chunks]
        # also include PROPN tokens (single-word proper nouns)
        props = [token.text.lower() for token in doc if token.pos_ == "PROPN"]
        return list(set(chunks + props))
    # Fallback: simple heuristic - return lowercased bigrams/trigrams from tokens
    tokens = [t.lower() for t in word_tokenize(clean_text(text)) if t.isalpha()]
    bigrams = [" ".join(tokens[i:i+2]) for i in range(len(tokens)-1)]
    trigrams = [" ".join(tokens[i:i+3]) for i in range(len(tokens)-2)]
    return list(set(bigrams + trigrams))


In [6]:
# CELL 5 — prepare a sample skill list and a phrase matcher for robust skill extraction

# --------- SAMPLE skill list (expand this into a master file or DB later) ----------
skills_master = [
    "python", "java", "javascript", "react", "node.js", "node", "express",
    "flask", "django", "fastapi", "sql", "postgresql", "mysql",
    "mongodb", "aws", "azure", "google cloud", "docker", "kubernetes",
    "machine learning", "deep learning", "nlp", "tensorflow", "pytorch",
    "git", "rest api", "restful api", "html", "css", "typescript", "c++", "c#"
]
# normalize skills for phrase matcher
skills_master = list(dict.fromkeys(sk.lower() for sk in skills_master))  # dedupe & lower

# If spaCy is available make a PhraseMatcher for exact phrase matches (fast + accurate)
if SPACY_OK:
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    patterns = [nlp.make_doc(s) for s in skills_master]
    matcher.add("SKILLS", patterns)
else:
    matcher = None

def extract_skills(text: str, skills_list=skills_master) -> list:
    """Return matched skills from text using spaCy PhraseMatcher or substring fallback."""
    text_clean = clean_text(text).lower()
    found = set()
    if SPACY_OK and matcher is not None:
        doc = nlp(text_clean)
        matches = matcher(doc)
        for match_id, start, end in matches:
            found.add(doc[start:end].text.lower())
        return sorted(found)
    # Fallback: simple substring matching (order by longest match)
    for skill in sorted(skills_list, key=lambda s: -len(s)):
        if skill in text_clean:
            found.add(skill)
    return sorted(found)


In [7]:
# CELL 6 — TF-IDF similarity (same idea as Day 1)
def tfidf_similarity(text_a: str, text_b: str) -> float:
    vec = TfidfVectorizer().fit([text_a, text_b])
    tfidf = vec.transform([text_a, text_b])
    sim = cosine_similarity(tfidf[0], tfidf[1])[0,0]
    return float(sim)

# OPTIONAL: sentence-transformers embeddings (for Day 3 semantic matching)
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('all-MiniLM-L6-v2')
# def embed_similarity(a,b):
#     a_emb = model.encode([a])
#     b_emb = model.encode([b])
#     return cosine_similarity(a_emb, b_emb)[0,0]


In [8]:
# CELL 7 — main function to produce structured JSON result for a resume vs JD pair

def match_resume_jd(resume_text: str, jd_text: str, skills_list=skills_master, use_spacy_lemmatize=True):
    # Clean / lemmatize
    cleaned_resume = clean_text(resume_text)
    cleaned_jd = clean_text(jd_text)
    resume_tokens = lemmatize_tokens(cleaned_resume, use_spacy=use_spacy_lemmatize)
    jd_tokens = lemmatize_tokens(cleaned_jd, use_spacy=use_spacy_lemmatize)

    # Skill extraction
    skills_resume = set(extract_skills(resume_text, skills_list))
    skills_jd = set(extract_skills(jd_text, skills_list))
    matched_skills = sorted(list(skills_resume & skills_jd))
    missing_skills = sorted(list(skills_jd - skills_resume))  # skills JD expects but resume missing

    # Entity extraction
    entities_resume = extract_entities(resume_text)
    entities_jd = extract_entities(jd_text)

    # Similarities
    tfidf_score = tfidf_similarity(cleaned_resume, cleaned_jd)

    result = {
        "matched_skills": matched_skills,
        "missing_skills": missing_skills,
        "skills_found_in_resume": sorted(list(skills_resume)),
        "skills_found_in_jd": sorted(list(skills_jd)),
        "tfidf_score": round(tfidf_score, 4),
        "entities_resume": entities_resume,
        "entities_jd": entities_jd,
        "lemmatized_resume_tokens_preview": resume_tokens[:60],
        "lemmatized_jd_tokens_preview": jd_tokens[:60],
    }
    return result

# Save results to JSON utility
def save_result_json(result: dict, filename: str):
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
    print(f"Saved: {filename}")


In [9]:
# CELL 8 — Example usage
sample_resume = """
Experienced backend engineer. Built RESTful APIs with Flask and Django, worked with PostgreSQL and MongoDB.
Familiar with Docker and AWS. Good knowledge of Python, unit testing, and Git.
"""

sample_jd = """
Hiring Backend Developer: must have Python experience (Flask or FastAPI), PostgreSQL or MySQL,
containerization with Docker, cloud experience (AWS preferred). Experience in Kubernetes is a plus.
"""

res = match_resume_jd(sample_resume, sample_jd)
print(json.dumps(res, indent=2))

# Save result
save_result_json(res, "sample_resume1_vs_jd1.json")


{
  "matched_skills": [
    "aws",
    "docker",
    "flask",
    "postgresql",
    "python"
  ],
  "missing_skills": [
    "fastapi",
    "kubernetes",
    "mysql"
  ],
  "skills_found_in_resume": [
    "aws",
    "django",
    "docker",
    "flask",
    "git",
    "mongodb",
    "postgresql",
    "python"
  ],
  "skills_found_in_jd": [
    "aws",
    "docker",
    "fastapi",
    "flask",
    "kubernetes",
    "mysql",
    "postgresql",
    "python"
  ],
  "tfidf_score": 0.1349,
  "entities_resume": [
    [
      "Flask",
      "GPE"
    ],
    [
      "Django",
      "GPE"
    ],
    [
      "PostgreSQL",
      "GPE"
    ],
    [
      "Docker",
      "PERSON"
    ],
    [
      "AWS",
      "ORG"
    ],
    [
      "Python",
      "ORG"
    ],
    [
      "Git",
      "PERSON"
    ]
  ],
  "entities_jd": [
    [
      "Flask",
      "PRODUCT"
    ],
    [
      "PostgreSQL",
      "GPE"
    ],
    [
      "Docker",
      "PERSON"
    ],
    [
      "AWS",
      "ORG"
    ],
    [
  

In [11]:
# CELL 9 — batch processing (assumes .txt files in folders ./resumes and ./jds)
import os, glob

def batch_match(resume_dir="./resumes", jd_dir="./jds", out_dir="./results"):
    os.makedirs(out_dir, exist_ok=True)
    resume_files = sorted(glob.glob(os.path.join(resume_dir, "*.txt")))
    jd_files = sorted(glob.glob(os.path.join(jd_dir, "*.txt")))

    for rfile in resume_files:
        rtext = open(rfile, encoding="utf-8").read()
        for jfile in jd_files:
            jtext = open(jfile, encoding="utf-8").read()
            result = match_resume_jd(rtext, jtext)
            outname = os.path.join(out_dir, f"{os.path.basename(rfile)}__{os.path.basename(jfile)}.json")
            save_result_json(result, outname)

# Example: create ./resumes and ./jds and run:
# batch_match()


In [12]:
def match_resume_jd(resume_text, jd_text):
    resume_tokens = set(preprocess_text(resume_text))
    jd_tokens = set(preprocess_text(jd_text))
    overlap = resume_tokens.intersection(jd_tokens)
    return {
        "matched_skills": list(overlap),
        "score": len(overlap)
    }

def save_result_json(result, filepath, resume_name=None, jd_name=None):
    if resume_name: result["resume"] = resume_name
    if jd_name: result["job"] = jd_name
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=4)


In [13]:
def batch_match(resume_dir="./resumes", jd_dir="./jds", out_dir="./results"):
    os.makedirs(out_dir, exist_ok=True)
    resume_files = sorted(glob.glob(os.path.join(resume_dir, "*.txt")))
    jd_files = sorted(glob.glob(os.path.join(jd_dir, "*.txt")))

    for rfile in resume_files:
        rtext = open(rfile, encoding="utf-8").read()
        for jfile in jd_files:
            jtext = open(jfile, encoding="utf-8").read()
            result = match_resume_jd(rtext, jtext)
            outname = os.path.join(out_dir, f"{os.path.basename(rfile)}__{os.path.basename(jfile)}.json")
            save_result_json(result, outname, resume_name=os.path.basename(rfile), jd_name=os.path.basename(jfile))

# Example: make folders ./resumes and ./jds, drop .txt files inside, then run:
# batch_match()


In [15]:
# Sample resumes and job descriptions (test cases)
resumes = [
    "I am a software engineer skilled in Python, JavaScript, and data analysis. I have experience with machine learning and cloud computing.",
    "Experienced front-end developer with expertise in React, HTML, CSS, and modern UI design. Worked on multiple web applications.",
    "Data scientist with knowledge in SQL, Python, TensorFlow, and natural language processing. Strong background in statistics."
]

job_descriptions = [
    "We are looking for a Python developer with experience in machine learning and cloud technologies.",
    "Hiring a front-end engineer skilled in React, CSS, and JavaScript to build interactive web applications.",
    "Seeking a data scientist with SQL, NLP, and deep learning experience."
]


In [16]:
# Simulated batch matching with our Day 2 test cases
for i, resume in enumerate(resumes):
    for j, jd in enumerate(job_descriptions):
        result = match_resume_jd(resume, jd)
        result["resume"] = f"resume_{i+1}.txt"
        result["job"] = f"job_{j+1}.txt"
        print(json.dumps(result, indent=4))


NameError: name 'preprocess_text' is not defined