In [1]:
import os
import csv
import PyPDF2
import re



In [2]:
# Read a PDF and return all of its text, lowercased
def extract_text_from_pdf(pdf_path):
    """Return the concatenated, lowercased text of every page in a PDF.

    Args:
        pdf_path: Value handed straight to ``PyPDF2.PdfReader``.

    Returns:
        One lowercase string holding the text of all pages, joined in page
        order with no separator. A page whose ``extract_text()`` result is
        falsy (``None`` or ``""``) contributes an empty string.
    """
    reader = PyPDF2.PdfReader(pdf_path)
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "".join(page_texts).lower()



In [3]:
# Preprocess text: collapse newlines and repeated whitespace into single spaces
def preprocess_text(text):
    """Replace every run of whitespace in ``text`` with one space.

    A single ``str.replace('  ', ' ')`` pass only halves each run of spaces
    (three consecutive newlines would still leave two spaces behind), which
    makes phrases such as "data structures" unmatchable downstream. Collapsing
    with a regex handles runs of any length as well as tabs and carriage
    returns.

    Args:
        text: The string to normalize.

    Returns:
        ``text`` with each whitespace run replaced by a single space. Leading
        and trailing whitespace is collapsed to one space, not stripped.
    """
    return re.sub(r'\s+', ' ', text)



In [4]:
# Identify the highest education level mentioned in the text
def find_education_level(text):
    """Return the highest degree level whose wording appears in ``text``.

    Levels are checked from highest to lowest, so a text mentioning both a
    master's and a bachelor's degree is reported as "Master's".

    Terms are delimited with ``(?<!\\w)`` / ``(?!\\w)`` rather than ``\\b``.
    ``\\b`` after a literal dot only succeeds when a word character follows,
    so abbreviations such as "b.s." or "ph.d." could never match when followed
    by a space, punctuation, or the end of the text. Matching is
    case-insensitive, and the common spellings "bachelors", "masters" and
    "phd" are accepted as well.

    Args:
        text: Resume text to search.

    Returns:
        One of "Doctorate", "Master's", "Bachelor's", or
        "No Bachelor's Degree" when no degree term is found.
    """
    # (label, alternatives) ordered from the highest degree to the lowest.
    degree_terms = (
        ("Doctorate", r'ph\.?d\.?|doctorate|dphil|doctoral'),
        ("Master's", r'masters?|m\.s\.|msc|m\.eng|m\.a\.'),
        ("Bachelor's", r'bachelors?|b\.s\.|bsc|b\.eng|b\.a\.'),
    )
    for label, alternatives in degree_terms:
        # Lookarounds require that the term is not glued to a neighbouring word
        # character, and unlike \b they also work next to punctuation.
        pattern = rf'(?<!\w)(?:{alternatives})(?!\w)'
        if re.search(pattern, text, flags=re.IGNORECASE):
            return label
    return "No Bachelor's Degree"



In [5]:
# Count whole-term occurrences of each keyword in the text
def extract_keywords(text, keywords):
    """Count how often each keyword occurs in ``text`` as a standalone term.

    Matching is case-insensitive. A keyword only counts when it is not
    directly attached to another word character, so "python" is not counted
    inside "pythonic". Lookarounds are used instead of ``\\b`` because ``\\b``
    fails next to punctuation, which made keywords such as "c++" or ".net"
    impossible to match. Words of a multi-word keyword may be separated by any
    amount of whitespace.

    Args:
        text: The text to search.
        keywords: Iterable of keyword strings.

    Returns:
        Dict mapping each keyword (as given) to its number of non-overlapping
        matches. A blank or whitespace-only keyword is reported as 0.
    """
    keyword_counts = {}
    for keyword in keywords:
        words = keyword.split()
        if not words:
            # An empty pattern would match at every gap in the text; report 0 instead.
            keyword_counts[keyword] = 0
            continue
        # Escape each word separately so the whitespace between them stays flexible.
        phrase = r'\s+'.join(re.escape(word) for word in words)
        matches = re.findall(rf'(?<!\w){phrase}(?!\w)', text, flags=re.IGNORECASE)
        keyword_counts[keyword] = len(matches)
    return keyword_counts



In [6]:
# Process resumes in a directory and save to CSV
def process_resumes(directory, keywords, required_education, output_csv):
    """Screen every PDF resume in ``directory`` and write a summary CSV.

    For each PDF the text is extracted with ``extract_text_from_pdf``,
    normalized with ``preprocess_text``, classified with
    ``find_education_level`` and scanned with ``extract_keywords``.

    Args:
        directory: Folder to scan. Only its direct entries are considered.
        keywords: Iterable of keyword strings; each becomes a CSV column.
        required_education: Minimum level, one of "No Bachelor's Degree",
            "Bachelor's", "Master's" or "Doctorate".
        output_csv: Path of the CSV file to (over)write.

    Raises:
        KeyError: If ``required_education`` is not a known level. This is
            checked up front, before any file is read.

    The CSV has the columns "Filename", "Education Level", "Education Status"
    followed by one column per keyword. Rows are ordered by filename.
    """
    education_hierarchy = {
        "No Bachelor's Degree": 0,
        "Bachelor's": 1,
        "Master's": 2,
        "Doctorate": 3
    }

    # Validate once, before doing any work, instead of failing on the first PDF.
    if required_education not in education_hierarchy:
        raise KeyError(
            f"Unknown required_education {required_education!r}; "
            f"expected one of {list(education_hierarchy)}"
        )
    required_rank = education_hierarchy[required_education]

    # Materialize so tuples and other iterables work and can be reused per file.
    keywords = list(keywords)

    results = []
    # os.listdir order is arbitrary; sort so the CSV is reproducible.
    for filename in sorted(os.listdir(directory)):
        # Case-insensitive check so "resume.PDF" is not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(directory, filename)
        raw_text = extract_text_from_pdf(pdf_path)
        processed_text = preprocess_text(raw_text)

        # Extract education level and keyword counts
        education_level = find_education_level(processed_text)
        keyword_counts = extract_keywords(processed_text, keywords)

        # Check if the candidate meets the education requirement
        education_status = (
            "Meets requirement"
            if education_hierarchy[education_level] >= required_rank
            else "Does not meet requirement"
        )

        # Add data to results list
        resume_data = {
            "Filename": filename,
            "Education Level": education_level,
            "Education Status": education_status,
        }
        resume_data.update(keyword_counts)
        results.append(resume_data)

    # Write results to CSV; explicit UTF-8 keeps non-ASCII filenames and
    # keywords from failing on platforms with a narrower default encoding.
    fieldnames = ["Filename", "Education Level", "Education Status"] + keywords
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)



In [7]:
# Run configuration
directory = "res"  # Folder containing the PDF resumes; replace with your own path
keywords = [
    'python',
    'data',
    'data structures',
    'machine learning',
    'data analysis',
    'neural networks',
]
required_education = "Bachelor's"  # Minimum education level a candidate must reach
output_csv = "resume_results.csv"

# Screen every resume in the folder and write the summary CSV
process_resumes(
    directory=directory,
    keywords=keywords,
    required_education=required_education,
    output_csv=output_csv,
)
print(f"Resume information has been saved to {output_csv}.")


Resume information has been saved to resume_results.csv.
