In [None]:
!python -m spacy download en_core_web_sm

In [None]:
import re
import pymupdf
from sentence_transformers import SentenceTransformer, util
import os
import spacy
from spacy.matcher import PhraseMatcher
import dotenv
import google.generativeai as genai
import openpyxl
from openpyxl.styles import Font
import pandas as pd
# Load environment variables via python-dotenv before anything reads them.
# Presumably this is where GOOGLE_API_KEY (read with os.getenv in the next cell)
# comes from - confirm a .env file is expected next to the notebook.
dotenv.load_dotenv()

In [None]:
# Configure the Gemini client with the API key taken from the environment.
# NOTE(review): os.getenv returns None when GOOGLE_API_KEY is unset, so a missing
# key is not reported by this line itself - confirm where that failure surfaces.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

In [None]:
# Notebook-level sentence-embedding model. get_embedding() and
# semantic_skill_match() both call model.encode() on this global, so the name
# `model` must stay bound to this object.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Vocabulary of skill names searched for in job descriptions and resumes.
# Entries are lowercase (the text they are compared against is lowercased first),
# carry no surrounding whitespace and are unique, so the names can be used
# directly in set arithmetic and in user-facing output.
SKILLS = [
    'python', 'docker', 'machine learning', 'fastapi', 'aws', 'nlp', 'tensorflow',
    'pytorch', 'scikit-learn', 'pandas', 'numpy', 'matplotlib', 'seaborn', 'sql',
    'nosql', 'mongodb', 'postgresql', 'redis', 'git', 'github', 'gitlab',
    'ci/cd', 'jenkins', 'travis ci', 'github actions', 'aws s3', 'aws ec2',
    'aws rds', 'aws lambda', 'aws dynamodb', 'kubernetes', 'docker-compose',
    'api development', 'restful apis', 'graphql', 'microservices',
    'data analysis', 'data visualization', 'big data', 'spark', 'hadoop',
    'kafka', 'etl', 'data warehousing', 'cloud computing', 'azure', 'gcp',
    'deep learning', 'neural networks', 'computer vision', 'object detection',
    'image processing', 'natural language processing', 'sentiment analysis',
    'topic modeling', 'transformers', 'bert', 'gpt', 'time series analysis',
    'statistical modeling', 'a/b testing', 'model deployment', 'mlops',
    'airflow', 'prefect', 'dvc', 'mlflow', 'unit testing', 'integration testing',
    'system design', 'agile', 'scrum', 'linux', 'bash', 'shell scripting',
    'virtualization', 'vmware', 'virtualbox', 'containerization', 'terraform',
    'ansible', 'puppet', 'chef', 'javascript', 'html', 'css', 'react', 'angular',
    'vue.js', 'node.js', 'django', 'flask', 'ruby on rails', 'java', 'c++', 'c#',
    'go', 'scala', 'r', 'excel', 'google sheets', 'tableau', 'power bi',
    'spark streaming', 'hadoop hdfs', 'yarn', 'zookeeper', 'cassandra', 'neo4j',
    'rabbitmq', 'aws eks', 'aws ecr', 'azure aks', 'gcp gke', 'openshift',
    'helm', 'prometheus', 'grafana', 'elasticsearch', 'logstash', 'kibana',
    'serverless', 'aws step functions', 'azure functions', 'gcp cloud functions',
    'blockchain', 'smart contracts', 'solidity', 'web3', 'cybersecurity',
    'network security', 'penetration testing', 'ethical hacking', 'cryptography',
    'devops', 'site reliability engineering', 'sre', 'monitoring', 'logging',
    'incident response', 'disaster recovery', 'business intelligence',
    'data mining', 'feature engineering', 'model evaluation', 'model selection',
    'hyperparameter tuning', 'cross-validation', 'regularization', 'boosting',
    'bagging', 'random forests', 'support vector machines', 'k-means',
    'dbscan', 'hierarchical clustering', 'pca', 't-sne', 'umap', 'recommender systems',
    'reinforcement learning', 'gan', 'autoencoders', 'lstm', 'gru',
    'convolutional neural networks', 'cnn', 'recurrent neural networks', 'rnn',
    'transfer learning', 'fine-tuning', 'attention mechanisms',
    'federated learning', 'differential privacy', 'explainable ai', 'xai',
    'responsible ai', 'ai ethics', 'fairness', 'bias detection', 'bias mitigation',
    'ai governance', 'regulatory compliance', 'gdpr', 'ccpa',
    'project management', 'leadership', 'communication', 'collaboration',
    'problem solving', 'critical thinking', 'adaptability', 'continuous learning',
    'mentoring', 'technical writing', 'presentation skills'
]

In [None]:
def get_embedding(text):
    """Encode ``text`` with the notebook-level ``model`` and return it as a tensor.

    The encoding is requested with ``convert_to_tensor=True``; the argument is
    handed to ``model.encode`` unchanged.
    """
    embedding = model.encode(text, convert_to_tensor=True)
    return embedding

In [None]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Pages are read in iteration order and joined without any separator. The
    document is opened in a ``with`` block, so it is closed on exit.
    """
    with pymupdf.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)

def clean_text(text: str) -> str:
    """Normalise ``text`` for comparison.

    Lowercases the input, collapses every run of whitespace into a single space
    and removes leading/trailing whitespace.
    """
    lowered = text.lower()
    single_spaced = re.sub(r'\s+', ' ', lowered)
    return single_spaced.strip()

def get_resume_embeddings(pdf_path):
    """Read a resume PDF and return ``(embedding, cleaned_text)``.

    The PDF text is extracted, normalised with ``clean_text`` and embedded with
    ``get_embedding``; the cleaned text is returned alongside the embedding so
    callers can reuse it without re-reading the file.
    """
    resume_text = clean_text(extract_text_from_pdf(pdf_path))
    return get_embedding(resume_text), resume_text


def extract_skills(text, skill_list):
    """Return the skills from ``skill_list`` that occur in ``text`` as whole terms.

    Matching is case-insensitive and anchored on word boundaries, so a short
    skill is not reported just because it is a fragment of a longer word
    ('go' in "google", 'git' in "digital", 'java' in "javascript").

    Args:
        text: Text to search.
        skill_list: Iterable of skill names. Surrounding whitespace in a name is
            ignored; blank names are skipped.

    Returns:
        A set with the matched skill names, stripped of surrounding whitespace.
    """
    haystack = text.lower()
    found = set()
    for raw_skill in skill_list:
        skill = raw_skill.strip()
        if not skill:
            continue
        # Look-arounds rather than \b: names such as "c++" or "c#" end in a
        # non-word character, where \b would require a word character to follow.
        pattern = r'(?<!\w)' + re.escape(skill.lower()) + r'(?!\w)'
        if re.search(pattern, haystack):
            found.add(skill)
    return found

In [None]:
# spaCy matcher setup
nlp = spacy.load("en_core_web_sm")
# attr="LOWER" compares the lowercase form of each token, i.e. case-insensitively.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# Patterns only need tokenisation, so build them with nlp.make_doc instead of
# running the whole pipeline on every skill name. Names are stripped first so a
# stray leading space in SKILLS cannot become part of the pattern, and
# dict.fromkeys drops duplicates while keeping the original order.
patterns = [
    nlp.make_doc(skill)
    for skill in dict.fromkeys(raw_skill.strip() for raw_skill in SKILLS)
    if skill
]
matcher.add("SKILLS", patterns)

def extract_skills_with_matcher(text):
    """Return the set of skill phrases the notebook-level ``matcher`` finds in ``text``.

    The text is lowercased before tokenisation, so every returned phrase is
    lowercase as well.
    """
    # Tokenise only: the matcher compares token text (LOWER), so running the
    # tagger/parser/NER over the whole document adds cost but no information.
    doc = nlp.make_doc(text.lower())
    return {doc[start:end].text for _, start, end in matcher(doc)}

def semantic_skill_match(text, skills, threshold=0.6):
    """Return the skills whose embedding is close to the embedding of ``text``.

    Args:
        text: Text to compare against.
        skills: Iterable of skill names (any iterable, including a set).
        threshold: A skill is kept when its cosine similarity with ``text`` is
            strictly greater than this value.

    Returns:
        A set containing the matching skill names; empty when ``skills`` is empty.
    """
    candidates = list(skills)
    if not candidates:
        # Nothing to compare; also avoids asking the model to encode an empty batch.
        return set()

    text_embedding = model.encode(text, convert_to_tensor=True)
    # One batched call instead of one model.encode call per skill.
    skill_embeddings = model.encode(candidates, convert_to_tensor=True)
    # cos_sim yields one row per text embedding; row 0 holds a score per skill.
    scores = util.cos_sim(text_embedding, skill_embeddings)[0].tolist()
    return {skill for skill, score in zip(candidates, scores) if score > threshold}

def extract_skills_combined(text, all_skills=SKILLS, threshold=0.6):
    """Combine exact phrase matches with embedding-based matches.

    Exact matches come from ``extract_skills_with_matcher``. Every skill in
    ``all_skills`` that was not matched exactly is then checked with
    ``semantic_skill_match`` using ``threshold``.

    Returns:
        The union of both result sets.
    """
    exact_matches = extract_skills_with_matcher(text)
    # Strip names before the set difference: the matcher reports phrases without
    # surrounding whitespace, so an unstripped name would never be recognised as
    # already matched and could end up in the result a second time.
    candidates = {skill.strip() for skill in all_skills} - {""}
    unmatched_skills = candidates - exact_matches
    semantic_matches = semantic_skill_match(text, unmatched_skills, threshold)
    return exact_matches | semantic_matches

In [None]:
job_description_text = """  Some job description here"""
# Normalise the job description the same way get_resume_embeddings normalises
# resumes (clean_text), so both sides of the comparison are preprocessed alike.
# The raw text is kept unchanged for the LLM prompt further down.
job_description_clean = clean_text(job_description_text)
job_skills = extract_skills(job_description_clean, SKILLS)
job_description_embeddings = get_embedding(job_description_clean)

SyntaxError: invalid syntax (1768666876.py, line 1)

In [None]:
resume_folder = "resumes/"
# sorted(): os.listdir gives no ordering guarantee, and a fixed order keeps the
# notebook output reproducible between runs.
# lower(): also pick up files whose extension is written as ".PDF".
resume_files = sorted(
    f for f in os.listdir(resume_folder) if f.lower().endswith('.pdf')
)

# Quick rule-based pass: print similarity and missing skills for each resume.
for resume_file in resume_files:
    resume_path = os.path.join(resume_folder, resume_file)
    # get_resume_embeddings returns (embedding, cleaned_text); unpack instead of indexing.
    resume_embedding, resume_text = get_resume_embeddings(resume_path)
    resume_skills = extract_skills(resume_text, SKILLS)

    missing_skills = job_skills - resume_skills

    if missing_skills:
        # sorted(): sets have no defined order, so sort for reproducible output.
        feedback = f"Missing skills: {', '.join(sorted(missing_skills))}. Consider adding experience or projects related to them."
    else:
        feedback = "Great! Your resume covers all the key skills for this job."

    similarity_score = util.pytorch_cos_sim(
        resume_embedding, job_description_embeddings).item()

    print(
        f"Resume: {resume_file}, Similarity Score: {similarity_score:.4f}, Feedback: {feedback}")

In [None]:
def get_feedback_via_gemini(resume_text, job_text, model_name="gemini-1.5-flash-latest"):
    """Ask a Gemini model for brief feedback on how well a resume fits a job.

    Args:
        resume_text: Resume content, inserted verbatim into the prompt.
        job_text: Job description, inserted verbatim into the prompt.
        model_name: Model name passed to ``genai.GenerativeModel``. Defaults to
            the model this notebook used before the name became a parameter.

    Returns:
        The ``text`` attribute of the response from ``generate_content``.
    """
    prompt = f"""
You are an AI resume reviewer. The job description is:

{job_text}

The resume content is:

{resume_text}

Analyze how well the resume matches the job. What key skills or qualifications are missing? Suggest 2–3 improvements to make the resume a better match.

Keep it brief and useful."""

    # Deliberately not called `model`: that name is the notebook-level
    # SentenceTransformer, and reusing it here invites mix-ups.
    gemini_model = genai.GenerativeModel(model_name)
    response = gemini_model.generate_content(prompt)

    return response.text

In [None]:
# Build one record per resume: similarity score, skill overlap and LLM feedback.
results = []

for resume_file in resume_files:
    resume_path = os.path.join(resume_folder, resume_file)
    # get_resume_embeddings returns (embedding, cleaned_text); unpack instead of indexing.
    resume_embedding, resume_text = get_resume_embeddings(resume_path)
    resume_skills = extract_skills(resume_text, SKILLS)
    missing_skills = job_skills - resume_skills
    matched_skills = resume_skills & job_skills
    similarity_score = util.pytorch_cos_sim(
        resume_embedding, job_description_embeddings).item()
    gemini_feedback = get_feedback_via_gemini(resume_text, job_description_text)
    results.append({
        'candidate': resume_file,
        # Keep 4 decimals: the score is ranked on and later shown as a percentage
        # rounded to 2 decimals, which rounding to 2 decimals here would flatten
        # to whole percents (and create avoidable ties).
        'match_score': round(similarity_score, 4),
        # sorted(): sets have no defined order; sort so the cell is reproducible.
        'skills_matched': (", ".join(sorted(matched_skills)) if matched_skills else "None").title(),
        'skills_missing': (", ".join(sorted(missing_skills)) if missing_skills else "None").title(),
        # Drop the asterisks used for markdown emphasis in the model's answer.
        'feedback': gemini_feedback.replace('*', '')
    })

In [None]:
# Display the raw list of per-resume result dicts collected in the loop above.
results

In [None]:

# Convert results to DataFrame and sort by match_score descending
# Naming the columns keeps the frame well-formed when `results` is empty (e.g. no
# PDFs were found); without them the sort below fails on the missing column.
df_results = pd.DataFrame(
    results,
    columns=['candidate', 'match_score', 'skills_matched', 'skills_missing', 'feedback'],
)
# mergesort is a stable algorithm, so the ordering of equal scores does not
# depend on the internals of the default sort.
df_results_sorted = df_results.sort_values(
    by='match_score', ascending=False, kind='mergesort')

In [None]:
# Display the ranked table (highest match_score first).
df_results_sorted

In [None]:
def save_results_to_excel(results=None, filename="ranked_resumes.xlsx"):
    """Write resume-ranking records to an Excel workbook.

    Args:
        results: Iterable of dicts with the keys ``candidate``, ``match_score``,
            ``skills_matched``, ``skills_missing`` and ``feedback``. When
            omitted, the notebook-level ``results`` is looked up at call time.
        filename: Path of the workbook to write.

    Raises:
        ValueError: If ``results`` is omitted and no notebook-level ``results``
            exists.
    """
    if results is None:
        # Resolve the default when the function is called, not when it is defined:
        # a default of `results=results` freezes whichever list existed when this
        # cell ran and goes stale as soon as the scoring cell is re-run.
        results = globals().get("results")
        if results is None:
            raise ValueError(
                "No results to save: pass `results` explicitly or run the scoring cell first.")

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Resume Ranking"

    # Headers
    headers = [
        "Candidate", "Match Score (%)", "Skills Matched", "Missing Skills", "LLM Feedback"]
    ws.append(headers)

    # Bold headers
    for cell in ws[1]:
        cell.font = Font(bold=True)

    # Data rows
    for res in results:
        ws.append([
            res["candidate"],
            # match_score is stored as a fraction; convert it to a percentage.
            round(res["match_score"] * 100, 2),
            res["skills_matched"],
            res["skills_missing"],
            res["feedback"]
        ])

    wb.save(filename)

In [None]:
# Export the ranked table, one row dict per resume, to the default workbook.
save_results_to_excel(
    df_results_sorted.to_dict(orient='records')
)