In [2]:
import os
import csv
import PyPDF2
import json
import re
from collections import defaultdict


In [3]:

def extract_text_from_pdf(pdf_path):
    """Return the lower-cased text of every page of a PDF, in page order.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        A single string made of each page's extracted text joined without any
        separator and then lower-cased. A page whose ``extract_text()`` result
        is falsy (e.g. ``None`` or ``""``) contributes nothing.
    """
    reader = PyPDF2.PdfReader(pdf_path)
    # `or ""` keeps the join safe when a page yields no text at all.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "".join(page_texts).lower()


In [4]:

def preprocess_text(text):
    """Normalize text so that multi-word keywords can be matched reliably.

    Every run of whitespace (newlines, tabs, any number of spaces) is collapsed
    to exactly one space, leading/trailing whitespace is dropped, and the
    result is lower-cased.

    A single ``str.replace('  ', ' ')`` pass only halves a run of spaces, so
    text such as ``"data \\n structures"`` would keep a double space and never
    match the phrase ``"data structures"``; splitting on whitespace and
    re-joining avoids that.

    Args:
        text: Raw text (``str``).

    Returns:
        The normalized, lower-cased string.
    """
    # str.split() with no argument splits on any whitespace run and discards
    # empty pieces, which is exactly the "collapse" we need.
    return " ".join(text.split()).lower()


In [5]:

def find_education_level(text):
    """Return the highest education level mentioned in ``text``.

    The text is expected to be lower-cased already (the patterns are
    lower-case and matching is case-sensitive).

    Boundaries are expressed with ``(?<!\\w)`` / ``(?!\\w)`` rather than
    ``\\b``: a trailing ``\\b`` after a literal ``.`` only matches when a word
    character follows, so abbreviations such as ``"ph.d. in physics"`` or
    ``"b.s. in math"`` were never detected. The final period of an
    abbreviation is optional, and plural/possessive spellings
    (``masters``, ``bachelor's``) as well as ``phd`` are accepted.

    Args:
        text: Lower-cased resume text.

    Returns:
        One of ``"Doctorate"``, ``"Master's"``, ``"Bachelor's"`` or
        ``"No Bachelor's Degree"``.
    """
    # Ordered from highest to lowest so the first hit is the top degree found.
    degree_patterns = (
        ("Doctorate",
         r"(?<!\w)(?:ph\.d\.?|phd|doctorate|dphil|doctoral)(?!\w)"),
        ("Master's",
         r"(?<!\w)(?:master(?:['\u2019]?s)?|m\.s\.?|msc|m\.eng\.?|m\.a\.?)(?!\w)"),
        ("Bachelor's",
         r"(?<!\w)(?:bachelor(?:['\u2019]?s)?|b\.s\.?|bsc|b\.eng\.?|b\.a\.?)(?!\w)"),
    )
    for level, pattern in degree_patterns:
        if re.search(pattern, text):
            return level
    return "No Bachelor's Degree"


In [6]:

def extract_keywords(text, keywords):
    """Count whole-term occurrences of each keyword in ``text``.

    Each keyword is lower-cased and regex-escaped before searching; ``text``
    is searched as given, so it should already be lower-cased.

    Term boundaries use ``(?<!\\w)`` / ``(?!\\w)`` instead of ``\\b``. ``\\b``
    needs a word character on one side, so keywords that start or end with
    punctuation (``"C++"``, ``"C#"``, ``".NET"``) could not match when
    surrounded by spaces or punctuation.

    Args:
        text: Text to search.
        keywords: Iterable of keyword strings.

    Returns:
        dict mapping each keyword (as passed in) to its number of
        non-overlapping matches. A blank keyword always counts as 0.
    """
    keyword_counts = {}
    for keyword in keywords:
        escaped = re.escape(keyword.lower())
        if not escaped:
            # An empty pattern would "match" at every boundary position.
            keyword_counts[keyword] = 0
            continue
        pattern = rf"(?<!\w){escaped}(?!\w)"
        keyword_counts[keyword] = len(re.findall(pattern, text))
    return keyword_counts


In [7]:

def match_industry(text, industry_keywords):
    """Pick the industry whose keywords occur most often in ``text``.

    Every keyword is lower-cased and regex-escaped, then counted as a whole
    term; an industry's score is the sum of its keyword counts. ``text`` is
    searched as given, so it should already be lower-cased.

    Term boundaries use ``(?<!\\w)`` / ``(?!\\w)`` instead of ``\\b`` so that
    keywords beginning or ending with punctuation (``"C++"``, ``"C#"``,
    ``".NET"``) can match when surrounded by spaces or punctuation.

    Args:
        text: Text to search.
        industry_keywords: Mapping of industry label -> iterable of keywords.

    Returns:
        The label with the highest score (the first such label in the
        mapping's iteration order on a tie), or ``"Unknown"`` when no keyword
        matches or ``industry_keywords`` is empty.
    """
    industry_scores = {}
    for industry, keywords in industry_keywords.items():
        score = 0
        for keyword in keywords:
            escaped = re.escape(keyword.lower())
            if not escaped:
                # A blank keyword would match at every boundary; ignore it.
                continue
            score += len(re.findall(rf"(?<!\w){escaped}(?!\w)", text))
        industry_scores[industry] = score

    # max() raises ValueError on an empty table, so handle that case up front.
    if not industry_scores:
        return "Unknown"

    probable_industry = max(industry_scores, key=industry_scores.get)
    return probable_industry if industry_scores[probable_industry] > 0 else "Unknown"


In [8]:

def process_resumes(directory, keywords, required_education, industry_keywords, output_csv):
    """Analyse every PDF resume in ``directory`` and write one CSV row per file.

    For each PDF the text is extracted and normalized, then the education
    level, per-keyword counts and most probable industry are computed.

    Args:
        directory: Folder to scan (not recursive).
        keywords: Iterable of keywords to count; each becomes a CSV column.
        required_education: Minimum level; one of ``"No Bachelor's Degree"``,
            ``"Bachelor's"``, ``"Master's"``, ``"Doctorate"``.
        industry_keywords: Mapping of industry label -> keywords, passed to
            ``match_industry``.
        output_csv: Path of the CSV file to (over)write.

    Raises:
        KeyError: If ``required_education`` is not one of the known levels.
            This is checked before any PDF is read.
    """
    education_hierarchy = {
        "No Bachelor's Degree": 0,
        "Bachelor's": 1,
        "Master's": 2,
        "Doctorate": 3
    }
    # Look the requirement up once, up front: an invalid value fails
    # immediately instead of after the first resume has been parsed.
    required_rank = education_hierarchy[required_education]

    # Materialise so any iterable (tuple, generator, ...) can be reused for
    # both counting and the CSV header.
    keywords = list(keywords)

    results = []
    # os.listdir returns entries in arbitrary order; sort for reproducible rows.
    for filename in sorted(os.listdir(directory)):
        # Compare case-insensitively so "CV.PDF" is not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(directory, filename)
        raw_text = extract_text_from_pdf(pdf_path)
        processed_text = preprocess_text(raw_text)

        # Extract education level and keyword counts
        education_level = find_education_level(processed_text)
        keyword_counts = extract_keywords(processed_text, keywords)

        # Determine probable industry
        probable_industry = match_industry(processed_text, industry_keywords)

        # Check if the candidate meets the education requirement
        education_status = (
            "Meets requirement"
            if education_hierarchy[education_level] >= required_rank
            else "Does not meet requirement"
        )

        resume_data = {
            "Filename": filename,
            "Education Level": education_level,
            "Education Status": education_status,
            "Industry": probable_industry,
        }
        resume_data.update(keyword_counts)
        results.append(resume_data)

    # Explicit UTF-8 so file names with non-ASCII characters are written the
    # same way on every platform; newline='' is required by the csv module.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ["Filename", "Education Level", "Education Status", "Industry"] + keywords
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)


In [9]:

# Run configuration: input folder, keywords to count, minimum degree, output file.
directory = "res"  # Replace with your directory path
keywords = [
    'python',
    'data',
    'data structures',
    'machine learning',
    'data analysis',
    'neural networks',
]
# Must be one of the education labels used by the processing functions.
required_education = "Bachelor's"
output_csv = "resume_results.csv"


In [10]:

# Industry keyword lexicon used for industry matching.
#
# Maps an industry label to the list of keyword phrases that "vote" for it.
# The dict is passed to match_industry(), and the winning label is what ends
# up in the "Industry" column of the results.
#
# Notes:
# - Entries are written in display case ("Python", "GAAP"); match_industry()
#   lower-cases each keyword before searching.
# - The same phrase can appear under several industries (e.g. "Python", "CAD",
#   "project management"); every occurrence in a resume then adds to the score
#   of each industry that lists it.
# - Some entries are very short or generic ("R", "HR", "law", "design") and may
#   therefore also match in unrelated contexts - treat the result as a hint.
industry_keywords = {
    "Software Development": [
        "software development", "programming", "application development", "coding", "debugging",
        "software engineering", "full-stack development", "front-end", "back-end", "API", 
        "web development", "mobile development", "Java", "Python", "JavaScript", "C++", "C#", 
        ".NET", "React", "Angular", "Node.js", "HTML", "CSS", "Agile", "Scrum", "version control"
    ],
    "Data Science and Analytics": [
        "data science", "machine learning", "data analysis", "statistical modeling", "data visualization", 
        "Python", "R", "SQL", "statistics", "predictive modeling", "data mining", "big data", 
        "data engineering", "deep learning", "natural language processing", "clustering", "classification", 
        "regression", "Tableau", "Power BI", "NumPy", "Pandas", "TensorFlow", "Keras", "scikit-learn", "PyTorch", 
        "data wrangling"
    ],
    "Finance": [
        "finance", "investment", "portfolio management", "financial analysis", "accounting", "budgeting", 
        "forecasting", "financial modeling", "risk management", "valuation", "corporate finance", "M&A", 
        "private equity", "venture capital", "hedge fund", "investment banking", "equity research", 
        "financial planning", "audit", "GAAP", "IFRS", "taxation", "wealth management", "credit analysis"
    ],
    "Marketing and Advertising": [
        "marketing", "advertising", "branding", "market research", "digital marketing", "SEO", 
        "content marketing", "social media", "campaign management", "brand strategy", "public relations", 
        "email marketing", "PPC", "Google Analytics", "Google Ads", "Facebook Ads", "influencer marketing", 
        "customer segmentation", "lead generation", "conversion rate optimization", "copywriting", "graphic design"
    ],
    "Human Resources": [
        "human resources", "HR", "recruitment", "employee engagement", "talent acquisition", "onboarding", 
        "compensation", "benefits", "performance management", "training", "development", "labor law", 
        "employee relations", "HRIS", "succession planning", "diversity and inclusion", "payroll", 
        "conflict resolution", "organizational development", "workforce planning"
    ],
    "Healthcare and Medicine": [
        "healthcare", "medicine", "patient care", "clinical", "medical research", "nursing", "pharmaceutical", 
        "public health", "healthcare administration", "EMR", "medical devices", "diagnostics", "pharmacy", 
        "clinical trials", "epidemiology", "biostatistics", "patient safety", "healthcare policy", 
        "medical coding", "radiology", "oncology", "surgery", "disease prevention"
    ],
    "Education and Training": [
        "education", "teaching", "curriculum development", "instructional design", "lesson planning", 
        "assessment", "K-12", "higher education", "e-learning", "training", "teacher", "professor", 
        "academic", "classroom management", "distance learning", "STEM education", "adult education", 
        "educational psychology", "school administration", "learning management systems", "special education", 
        "student engagement"
    ],
    "Manufacturing and Production": [
        "manufacturing", "production", "lean manufacturing", "Six Sigma", "supply chain", "logistics", 
        "inventory management", "quality control", "operations management", "assembly line", "product design", 
        "process improvement", "CAD", "ISO standards", "maintenance", "automation", "plant operations", 
        "warehouse management", "procurement", "factory", "scheduling", "safety regulations"
    ],
    "Sales": [
        "sales", "business development", "customer relationship management", "cold calling", "prospecting", 
        "lead generation", "account management", "B2B sales", "B2C sales", "sales forecasting", "pipeline management", 
        "sales strategy", "territory management", "closing deals", "negotiation", "quota attainment", "upselling", 
        "cross-selling", "retail", "client management"
    ],
    "Legal": [
        "law", "legal research", "contract law", "corporate law", "intellectual property", "litigation", 
        "compliance", "jurisprudence", "case management", "legal writing", "criminal law", "real estate law", 
        "tax law", "employment law", "regulations", "due diligence", "legal documentation", "court", "paralegal", 
        "attorney", "in-house counsel"
    ],
    "Engineering": [
        "engineering", "mechanical engineering", "electrical engineering", "civil engineering", "chemical engineering", 
        "aerospace", "automotive", "design", "CAD", "circuit design", "manufacturing", "construction", "robotics", 
        "automation", "R&D", "product development", "energy", "testing", "project management", "sustainability", 
        "renewable energy"
    ],
    "Project Management": [
        "project management", "PMO", "stakeholder management", "budgeting", "resource allocation", 
        "risk management", "scheduling", "Gantt charts", "project planning", "task management", "scrum", 
        "Agile", "waterfall", "project lifecycle", "milestones", "status reporting", "scope management", 
        "deliverables", "project execution", "PMP", "CAPM"
    ],
    "Hospitality and Tourism": [
        "hospitality", "tourism", "customer service", "event planning", "hotel management", "guest relations", 
        "travel", "reservation management", "housekeeping", "front desk", "catering", "food and beverage", 
        "tour operator", "hospitality management", "event coordination", "hospitality sales", "banquet", 
        "restaurant management", "cruise lines", "lodging"
    ],
    "Supply Chain and Logistics": [
        "supply chain", "logistics", "inventory management", "warehouse management", "procurement", "transportation", 
        "distribution", "supply chain management", "freight", "logistics coordination", "demand planning", 
        "order fulfillment", "shipping", "sourcing", "vendor management", "production planning", "global logistics", 
        "warehousing", "import/export", "supply chain optimization"
    ],
    "Real Estate": [
        "real estate", "property management", "residential", "commercial real estate", "appraisal", 
        "real estate development", "leasing", "mortgage", "brokerage", "valuation", "property sales", 
        "real estate investment", "landlord", "tenant", "property maintenance", "real estate law", 
        "zoning", "market analysis", "realtor"
    ],
    "Construction": [
        "construction", "project management", "architecture", "civil engineering", "building", 
        "construction management", "contractor", "safety regulations", "zoning", "blueprints", 
        "bidding", "project planning", "residential construction", "commercial construction", 
        "materials management", "labor management", "infrastructure", "sustainable construction", 
        "HVAC", "construction site"
    ]
}



In [11]:

# Run the CSV-only pipeline using the parameters and lexicon defined above,
# then confirm where the results were written.
process_resumes(
    directory=directory,
    keywords=keywords,
    required_education=required_education,
    industry_keywords=industry_keywords,
    output_csv=output_csv,
)
print(f"Resume information has been saved to {output_csv}.")


Resume information has been saved to resume_results.csv.


In [13]:
def process_resumes_with_json(directory, keywords, required_education, industry_keywords, output_csv, output_json):
    """Analyse every PDF resume in ``directory``; write the results as CSV and JSON.

    For each PDF the text is extracted and normalized, then the education
    level, per-keyword counts and most probable industry are computed. In the
    JSON output the counts are nested under ``"Keywords"``; in the CSV they
    are flattened into one column per keyword. A confirmation line is printed
    once both files are written.

    Args:
        directory: Folder to scan (not recursive).
        keywords: Iterable of keywords to count; each becomes a CSV column.
        required_education: Minimum level; one of ``"No Bachelor's Degree"``,
            ``"Bachelor's"``, ``"Master's"``, ``"Doctorate"``.
        industry_keywords: Mapping of industry label -> keywords, passed to
            ``match_industry``.
        output_csv: Path of the CSV file to (over)write.
        output_json: Path of the JSON file to (over)write.

    Raises:
        KeyError: If ``required_education`` is not one of the known levels.
            This is checked before any PDF is read.
    """
    education_hierarchy = {
        "No Bachelor's Degree": 0,
        "Bachelor's": 1,
        "Master's": 2,
        "Doctorate": 3
    }
    # Look the requirement up once, up front: an invalid value fails
    # immediately instead of after the first resume has been parsed.
    required_rank = education_hierarchy[required_education]

    # Materialise so any iterable (tuple, generator, ...) can be reused for
    # both counting and the CSV header.
    keywords = list(keywords)

    results = []
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(directory)):
        # Compare case-insensitively so "CV.PDF" is not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(directory, filename)
        raw_text = extract_text_from_pdf(pdf_path)
        processed_text = preprocess_text(raw_text)

        # Extract education level and keyword counts
        education_level = find_education_level(processed_text)
        keyword_counts = extract_keywords(processed_text, keywords)

        # Determine probable industry
        probable_industry = match_industry(processed_text, industry_keywords)

        # Check if the candidate meets the education requirement
        education_status = (
            "Meets requirement"
            if education_hierarchy[education_level] >= required_rank
            else "Does not meet requirement"
        )

        results.append({
            "Filename": filename,
            "Education Level": education_level,
            "Education Status": education_status,
            "Industry": probable_industry,
            "Keywords": keyword_counts
        })

    # Explicit UTF-8 so file names with non-ASCII characters are written the
    # same way on every platform; newline='' is required by the csv module.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ["Filename", "Education Level", "Education Status", "Industry"] + keywords
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in results:
            # Flatten the nested keyword counts into top-level CSV columns.
            csv_row = {
                "Filename": row["Filename"],
                "Education Level": row["Education Level"],
                "Education Status": row["Education Status"],
                "Industry": row["Industry"],
                **row["Keywords"]
            }
            writer.writerow(csv_row)

    # Write results to JSON (keyword counts stay nested under "Keywords").
    with open(output_json, 'w', encoding='utf-8') as jsonfile:
        json.dump(results, jsonfile, indent=4)

    print(f"Resume information has been saved to {output_csv} and {output_json}.")

# Run configuration for the CSV + JSON pipeline.
directory = "res"  # Replace with your directory path
keywords = [
    'python',
    'data',
    'data structures',
    'machine learning',
    'data analysis',
    'neural networks',
]
# Must be one of the education labels used by the processing functions.
required_education = "Bachelor's"
output_csv = "resume_results.csv"
output_json = "resume_results.json"

# Analyse the resumes and write both output files.
process_resumes_with_json(
    directory=directory,
    keywords=keywords,
    required_education=required_education,
    industry_keywords=industry_keywords,
    output_csv=output_csv,
    output_json=output_json,
)

Resume information has been saved to resume_results.csv and resume_results.json.
