In [1]:
# Install dependencies
!pip install -q sentence-transformers datasets nltk spacy pandas
!python -m spacy download en_core_web_sm

# --- IMPORTS ---
import pandas as pd
import numpy as np
import nltk
import spacy
from sentence_transformers import SentenceTransformer, util
from nltk.corpus import stopwords
import re
import pickle

# --- DOWNLOAD NLTK DATA ---
nltk.download('stopwords')
nltk.download('punkt')

# --- INITIALIZATIONS ---
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))

# --- SKILL EXTRACTOR WITHOUT PREDEFINED SKILLS ---
class SkillExtractor:
    def __init__(self):
        pass

    def extract_skills(self, text):
        if not isinstance(text, str):
            return []

        text = text.lower()
        doc = nlp(text[:10000])  # Limit text length for faster processing

        # Extract nouns and proper nouns as potential skills
        skills = [token.text for token in doc if token.pos_ in ["NOUN", "PROPN"]
                  and token.text not in stop_words and len(token.text) > 1]

        # Filter duplicates and keep lowercase unique terms
        skills = list(set([s.lower() for s in skills]))

        return sorted(skills)

# --- MODEL ---
model = SentenceTransformer("all-MiniLM-L6-v2")
skill_extractor = SkillExtractor()

# --- EVALUATION FUNCTION ---
def evaluate_resume_and_job(resume_text, job_text):
    # --- Sentence embeddings ---
    resume_emb = model.encode(resume_text, convert_to_tensor=True)
    job_emb = model.encode(job_text, convert_to_tensor=True)
    sim_score = util.cos_sim(resume_emb, job_emb).item()

    # --- Match category ---
    if sim_score >= 0.7:
        match_category = "Good Match"
    elif sim_score >= 0.5:
        match_category = "Moderate Match"
    else:
        match_category = "Poor Match"

    # --- Skill extraction ---
    resume_skills = skill_extractor.extract_skills(resume_text)
    job_skills = skill_extractor.extract_skills(job_text)

    missing_skills = [s for s in job_skills if s not in resume_skills]

    skill_match_pct = (len([s for s in resume_skills if s in job_skills]) / len(job_skills)) * 100 if job_skills else 0

    # --- Output ---
    print("\n============== MATCH RESULTS ==============")
    print(f"🔎 Similarity Score: {sim_score:.2f}")
    print(f"📊 Match Category: {match_category}")
    print(f"✅ Skill Match %: {skill_match_pct:.2f}%")
    print(f"❌ Missing Skills: {missing_skills if missing_skills else 'None'}")
    print("\n💡 Recommendations:")
    if missing_skills:
        for skill in missing_skills[:5]:  # Suggest up to 5 missing
            print("-", f"Consider adding '{skill}' to your resume.")
    else:
        print("Your resume covers all required skills for this job!")

    # --- Print All Skills Clearly ---
    print("\n============== SKILLS EXTRACTED ==============")
    print(f"📝 All Resume Skills ({len(resume_skills)}):\n{resume_skills}")
    print(f"📌 All Job Description Skills ({len(job_skills)}):\n{job_skills}")

    # --- Return for optional saving ---
    return sim_score, skill_match_pct, missing_skills, resume_skills, job_skills

# --- SAMPLE DATA ---
sample_resume = """ Full-stack developer skilled in Mysql, TensorFlow, Java, Spring Boot, AWS, and React.
Experience with REST APIs and microservices. Led team projects integrating Docker and Kubernetes.
Proficient in agile development methodologies and cloud-based solutions.
Excellent communication skills and ability to work collaboratively.
Did projects using AWS bedrock, Sagemaker for pipeline and Kibana for monitoring through dashboards.
"""

sample_job = """ Looking for a Full-stack Engineer with experience in Java, Spring Boot, AWS Lambda, Docker, Kubernetes, and React.
Must be familiar with RESTful APIS, Agile workflows, and cloud platforms.
Need to have experience in SQL, like Mongodb and databricks, using PyTorch or Tensorflow.
"""

# --- RUN ---
sim_score, skill_match_pct, missing_skills, resume_skills, job_skills = evaluate_resume_and_job(sample_resume, sample_job)

# --- OPTIONAL: Save embeddings and skills ---
resume_emb = model.encode(sample_resume, convert_to_tensor=False)
job_emb = model.encode(sample_job, convert_to_tensor=False)

with open("resume_embedding.pkl", "wb") as f:
    pickle.dump(resume_emb, f)

with open("job_embedding.pkl", "wb") as f:
    pickle.dump(job_emb, f)

with open("resume_skills.pkl", "wb") as f:
    pickle.dump(resume_skills, f)

with open("job_skills.pkl", "wb") as f:
    pickle.dump(job_skills, f)

print("\n✅ Embeddings and skills saved for future use.")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m481.3/491.4 kB[0m [31m17.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


🔎 Similarity Score: 0.76
📊 Match Category: Good Match
✅ Skill Match %: 58.82%
❌ Missing Skills: ['databricks', 'engineer', 'lambda', 'mongodb', 'platforms', 'pytorch', 'workflows']

💡 Recommendations:
- Consider adding 'databricks' to your resume.
- Consider adding 'engineer' to your resume.
- Consider adding 'lambda' to your resume.
- Consider adding 'mongodb' to your resume.
- Consider adding 'platforms' to your resume.

📝 All Resume Skills (28):
['ability', 'apis', 'aws', 'bedrock', 'boot', 'cloud', 'communication', 'dashboards', 'developer', 'development', 'docker', 'experience', 'kibana', 'kubernetes', 'methodologies', 'microservices', 'mysql', 'pipeline', 'proficient', 'projects', 'rest', 'sagemaker', 'skills', 'solutions', 'spring', 'stack', 'team', 'tensorflow']
📌 All Job Description Skills (17):
['apis', 'aws', 'boot', 'cloud', 'databricks', 'docker', 'engineer', 'experience', 'kubernetes', 'lambda', 'mongodb', 'platforms', 'pytorch', 'spring', 'stack', 'tensorflow', 'workflo