In [3]:
import os,sys
import re
import spacy
from pdfminer.high_level import extract_text
from docx import Document
from dateutil import parser as dateparser
from pprint import pprint

# Initialize spaCy once
NLP = spacy.load("en_core_web_sm")

In [5]:
# You should expand these lists for production!
SKILLS_LIST = [
    'Python', 'SQL', 'PySpark', 'Shell', 'R', 'NLTK', 'TensorFlow', 'Pandas', 'Scikit-Learn', 'NumPy',
    'TFDV', 'PyTorch', 'Airflow', 'ML Flow', 'statsmodels', 'Dask', 'pydantic', 'DASH', 'AWS',
    'Azure', 'GCP', 'Snowflake', 'Apache Spark', 'Hadoop', 'dbt', 'Talend', 'Informatica', 'SSIS',
    'TIDAL', 'Oracle', 'SQL Server', 'PostgreSQL', 'MySQL', 'Teradata', 'MongoDB', 'Cosmos DB',
    'NoSQL', 'Apache Kafka', 'Apache Flink', 'Docker', 'Kubernetes', 'Terraform', 'GitHub Actions',
    'CI/CD', 'Power BI', 'Tableau', 'EDA', 'Statistical Modeling', 'Trend Analysis', 'matplotlib',
    'seaborn', 'Plotly', 'Agile-Scrum', 'Kanban', 'Data Modelling', 'Data Warehousing', 'GDPR/HIPAA compliance',
    'OpenAI embeddings', 'ChromaDB', 'RAG pipelines', 'Supervised & Unsupervised Learning', 'Feature Engineering', 'Model Evaluation metrics'
]
DEGREE_KEYWORDS = [
    'bachelor', 'master', 'doctor', 'phd', 'msc', 'bachelors', 'masters', 'engineering', 'm.tech', 'b.tech'
]
CERT_KEYWORDS = ['certification', 'certificate', 'certified', 'certifications', 'licenses']
SECTION_HEADERS = {
    'education': ['education', 'academic background', 'academics'],
    'experience': ['professional experience', 'work experience', 'employment', 'experience'],
    'skills': ['skills', 'technical skills', 'key skills'],
    'certifications': ['certifications', 'certificates', 'licenses'],
    'projects': ['projects', 'key projects', 'personal projects']
}


In [2]:
def pdf_to_text(pdf_path):
    return extract_text(pdf_path)

def docx_to_text(docx_path):
    doc = Document(docx_path)
    return '\n'.join([para.text for para in doc.paragraphs])

def extract_text_from_file(path):
    if path.endswith('.pdf'):
        return pdf_to_text(path)
    elif path.endswith('.docx'):
        return docx_to_text(path)
    else:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()

In [13]:
def extract_sections(text):
    lines = text.split('\n')
    section_map = {}
    current_section = None
    buffer = []

    def header_key(line):
        line_clean = line.strip().lower()
        for key, variants in SECTION_HEADERS.items():
            if any(line_clean.startswith(h) for h in variants):
                return key
        return None

    for line in lines:
        section = header_key(line)
        if section:
            if current_section and buffer:
                section_map[current_section] = '\n'.join(buffer).strip()
                buffer = []
            current_section = section
        elif current_section:
            buffer.append(line)
    # Capture last section
    if current_section and buffer:
        section_map[current_section] = '\n'.join(buffer).strip()
    return section_map

def extract_skills(skills_text):
    skills_found = set()
    text_lower = skills_text.lower()
    '''
    for skill in SKILLS_LIST:
        if re.search(r'\b' + re.escape(skill.lower()) + r'\b', text_lower):
            skills_found.add(skill)
    '''
    # Add anything in a comma/list line in skills section
    for line in skills_text.split('\n'):
        if ',' in line:
            for word in line.split(','):
                word_clean = word.strip()
                if word_clean and word_clean not in skills_found:
                    skills_found.add(word_clean)
    return list(skills_found)

In [9]:
txt=extract_text_from_file(r"C:\Users\hrith\Projects\Screening_Agent\data\resumes\Hrithik_Resume.pdf")
sections = extract_sections(txt)

In [15]:
sections

{'skills': '• \n• \n• \n• \n• \n\nProgramming & Scripting: Python, SQL,\u202fPySpark, Shell, R. \n\nPython Libraries: NLTK, TensorFlow, Pandas, Scikit-Learn, NumPy, TFDV, PyTorch, Airflow, ML Flow, statsmodels, Dask, pydantic, DASH \n\nCloud & Warehousing: AWS\u202f(S3, Glue, Redshift, Lambda,\u202fKinesis, EMR), Azure\u202f(Synapse, ADLS,\u202fDatabricks, Data\u202fFactory), GCP\u202f(BigQuery, VertexAI), Snowflake. \n\nData Processing & Orchestration: Apache\u202fSpark, Hadoop, dbt, Airflow, Talend, Informatica, SSIS, TIDAL. \n\nDatabases: Oracle, SQL\u202fServer, PostgreSQL, MySQL, Teradata, MongoDB, Cosmos\u202fDB, NoSQL. \n\n• \n\nStreaming & Real‑Time Analytics: Apache\u202fKafka, Apache\u202fFlink, AWS\u202fKinesis. \n\nContainerization & DevOps: Docker, Kubernetes, Terraform, GitHub\u202fActions, CI/CD. \n\nVisualization & BI: Power BI, Tableau, EDA, Statistical Modeling, Trend Analysis, matplotlib, seaborn, Plotly \n\n• \n• \n• \n•  Methodologies & Governance: Agile‑Scrum, Kan

In [14]:
extract_skills(sections['skills'])

['Lambda',
 'R.',
 'EMR)',
 'SQL\u202fServer',
 'EDA',
 'Apache\u202fFlink',
 'NumPy',
 'CI/CD.',
 'ML Flow',
 'seaborn',
 'Informatica',
 'MySQL',
 'DASH',
 'ChromaDB',
 'Python Libraries: NLTK',
 'TIDAL.',
 'Data Processing & Orchestration: Apache\u202fSpark',
 'PySpark',
 'statsmodels',
 'Statistical Modeling',
 'Cloud & Warehousing: AWS\u202f(S3',
 'Trend Analysis',
 'Dask',
 'Kanban',
 'matplotlib',
 'TensorFlow',
 'GitHub\u202fActions',
 'Visualization & BI: Power BI',
 'RAG pipelines',
 'VertexAI)',
 'Talend',
 'GCP\u202f(BigQuery',
 'Hadoop',
 'dbt',
 'Databases: Oracle',
 'Cosmos\u202fDB',
 'Kubernetes',
 'Shell',
 'Databricks',
 'Kinesis',
 'Teradata',
 'Airflow',
 'Azure\u202f(Synapse',
 'Supervised & Unsupervised Learning',
 'Plotly',
 'TFDV',
 'SQL',
 'Snowflake.',
 'Feature Engineering',
 'Scikit-Learn',
 'Terraform',
 'GenAI & Machine Learning: OpenAI embeddings',
 'pydantic',
 'Glue',
 'Data\u202fFactory)',
 'Programming & Scripting: Python',
 'AWS\u202fKinesis.',
 'Str

In [None]:
def nlp_resume_parse(resume_path):
    text = extract_text_from_file(resume_path)
    sections = extract_sections(text)

    name = extract_name(text)
    email = extract_email(text)
    phone = extract_phone(text)
    skills = extract_skills(sections.get('skills', '')) if 'skills' in sections else []
    education = extract_education(sections.get('education', '')) if 'education' in sections else []
    certifications = extract_certifications(sections.get('certifications', '')) if 'certifications' in sections else []
    experience = extract_experience(sections.get('experience', '')) if 'experience' in sections else []
    projects = []  # Implement if needed: extract_projects(sections.get('projects', ''))
    soft_skills = []  # Optional: can do keyword match or use spaCy's NER
    other_notes = []

    out = {
        'name': name,
        'email': email,
        'phone': phone,
        'skills': skills,
        'education': education,
        'certifications': certifications,
        'past_roles': experience,
        'projects': projects,
        'soft_skills': soft_skills,
        'other_notes': other_notes,
        'total_years_of_experience': calc_total_exp(experience)
    }
    return out

In [None]:
from pdfminer.high_level import extract_text
def pdf_to_text(pdf_path):
    return extract_text(pdf_path)
text = pdf_to_text("C:\Users\hrith\Projects\Screening_Agent\data\\resumes\Hrithik_Resume.pdf")



FileNotFoundError: [Errno 2] No such file or directory: 'data/resumes/Hrithik_Resume.pdf'

In [None]:
for iine in text.split('\n')