In [3]:
import os
import PyPDF2
from docx import Document
import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# NLTK setup

nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

# -----------------------------
# Helper Functions
# -----------------------------
def extract_text_from_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() or ""
    return text

def extract_text_from_docx(file_path):
    doc = Document(file_path)
    text = ""
    for para in doc.paragraphs:
        text += para.text + " "
    return text

def clean_text(text):
    words = text.lower().split()
    words = [w for w in words if w.isalpha() and w not in STOPWORDS]
    return " ".join(words)

def extract_skills(text, skill_set):
    text_words = set(text.lower().split())
    return [skill for skill in skill_set if skill.lower() in text_words]

# -----------------------------
# Resume Screening Function
# -----------------------------
def screen_resumes(resume_folder, job_description, skill_set):
    resumes = []
    texts = []

    # Extract text from resumes
    for file in os.listdir(resume_folder):
        path = os.path.join(resume_folder, file)
        if file.lower().endswith('.pdf'):
            text = extract_text_from_pdf(path)
        elif file.lower().endswith('.docx'):
            text = extract_text_from_docx(path)
        else:
            continue
        resumes.append(file)
        texts.append(clean_text(text))

    # Clean job description
    job_text = clean_text(job_description)

    # Vectorize and compute cosine similarity
    vectorizer = CountVectorizer().fit([job_text] + texts)
    job_vec = vectorizer.transform([job_text])
    resume_vecs = vectorizer.transform(texts)
    similarity_scores = cosine_similarity(resume_vecs, job_vec).flatten()

    # Extract skills
    skills_in_resumes = [extract_skills(t, skill_set) for t in texts]
    skills_in_job = extract_skills(job_text, skill_set)

    # Prepare results
    results = []
    for i, resume in enumerate(resumes):
        matched_skills = list(set(skills_in_resumes[i]) & set(skills_in_job))
        score = similarity_scores[i]
        results.append({
            "Resume": resume,
            "Similarity Score": round(score, 2),
            "Matched Skills": ", ".join(matched_skills)
        })

    # Rank candidates
    results = sorted(results, key=lambda x: x["Similarity Score"], reverse=True)
    return results

# Export to Excel

def export_to_excel(results, output_file="resume_ranking.xlsx"):
    df = pd.DataFrame(results)
    df.to_excel(output_file, index=False)
    print(f"Results exported to {output_file}")

# Main Execution

if __name__ == "__main__":
    job_description = """
    We are looking for a Python developer with experience in data analysis, 
    machine learning, Flask, Django, and SQL.
    """

    skill_set = [
        "Python", "SQL", "Machine Learning", "Data Analysis",
        "Flask", "Django", "Pandas", "NumPy", "TensorFlow",
        "JavaScript", "HTML", "CSS", "Communication"
    ]

    # Provide the path to your folder containing resumes
    resume_folder = input("Enter the path to the folder containing resumes: ").strip()

    if not os.path.exists(resume_folder):
        print("Error: Folder does not exist!")
    else:
        results = screen_resumes(resume_folder, job_description, skill_set)
        
        print("\nRanked Candidates:")
        for idx, res in enumerate(results, 1):
            print(f"{idx}. {res['Resume']} | Score: {res['Similarity Score']} | Matched Skills: {res['Matched Skills']}")

        # Export results to Excel
        export_to_excel(results)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kusha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Ranked Candidates:
1. John Doe - Professional Resume.pdf | Score: 0.47 | Matched Skills: Python
Results exported to resume_ranking.xlsx
