In [1]:
# Install all required packages
!apt-get update
!apt-get install -y tesseract-ocr
!pip install pdfminer.six pytesseract pdf2image PyMuPDF spacy fuzzywuzzy python-levenshtein
!python -m spacy download en_core_web_sm

# Import all libraries
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
from pdfminer.high_level import extract_text
import spacy
import re
import json
import warnings
from fuzzywuzzy import fuzz
from collections import Counter
import numpy as np

# Load the small English spaCy pipeline once; `nlp` is a module-level global
# that extract_name() later uses for PERSON entity recognition.
nlp = spacy.load("en_core_web_sm")
# Suppress every Python warning for the rest of the session (global filter).
warnings.filterwarnings('ignore')

# Banner so the cell output shows that setup reached the end.
print("‚úÖ All dependencies installed and imported successfully!")


0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Waiting for headers] [Connected to cloud.r-project.or                                                                               Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
                                                                               Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [2 InRelease 46.0 kB/128 kB 36%] [3 InRelease 60.5 kB/129 kB 47%] [Connected0% [2 InRelease 128 kB/128 kB 100%] [Connected to cloud.r-project.org (65.9.86.0% [Connected to cloud.r-project.org (65.9.86.109)] [Connecting to r2u.stat.ill                                                                               Get:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
0% [4 InRelease 47.5 kB/127 kB 37%] [Connected to cloud.r-project.org (65.9.86.0% [Waiting for headers] [Connecting to r2u.stat.illin

In [2]:
def extract_text_from_resume(file_path):
    """
    Extract the text of a PDF resume, choosing the extraction method automatically.

    The PDF is first read with PyMuPDF. When it holds (almost) no embedded text
    it is treated as scanned and run through Tesseract OCR; otherwise the
    embedded text is returned. If OCR yields fewer than 50 characters, the
    digital extractors (PyMuPDF, then pdfminer) are tried and the longer
    result is kept.

    Args:
        file_path: Path to the PDF file.

    Returns:
        str: The extracted text, or "" when every strategy fails. Errors are
        reported with print() and are never raised to the caller.
    """
    print(f"üìÑ Processing: {file_path}")

    def is_scanned_pdf(pdf_path):
        """Return (is_scanned, embedded_text) based on how much text the PDF carries."""
        try:
            total_text = ""
            total_images = 0

            # The context manager closes the document even if a page fails to parse.
            with fitz.open(pdf_path) as doc:
                for page in doc:
                    total_text += page.get_text()
                    total_images += len(page.get_images())

            # Very little text plus images (or almost no text at all) => scanned.
            text_length = len(total_text.strip())
            is_scanned = (text_length < 100 and total_images > 0) or text_length < 50

            print(f"   üìä Text length: {text_length}, Images: {total_images}")
            print(f"   üîç PDF Type: {'SCANNED' if is_scanned else 'DIGITAL'}")

            return is_scanned, total_text

        except Exception as e:
            print(f"   ‚ö†Ô∏è Error analyzing PDF: {str(e)}")
            return True, ""  # Default to OCR if uncertain

    def extract_with_ocr(pdf_path):
        """Rasterise every page at 300 dpi and OCR it with Tesseract."""
        print("   üîÑ Applying OCR (Tesseract)...")
        try:
            pages = convert_from_path(pdf_path, dpi=300)

            # pytesseract tokenises `config` with shlex. The previous character
            # whitelist ended in an unbalanced quote, so every call raised
            # "No closing quotation" and OCR always returned "". The whitelist
            # also discarded characters resumes need ('_', '#', '&', '%').
            custom_config = '--oem 3 --psm 6'

            page_texts = []
            for i, page in enumerate(pages):
                print(f"      Processing page {i+1}/{len(pages)}")
                page_texts.append(pytesseract.image_to_string(page, config=custom_config))

            return "".join(page_text + "\n" for page_text in page_texts)

        except Exception as e:
            print(f"   ‚ùå OCR failed: {str(e)}")
            return ""

    def extract_digital_text(pdf_path):
        """Read embedded text with PyMuPDF, falling back to pdfminer when it is sparse."""
        print("   üìñ Extracting digital text...")
        try:
            text = ""
            with fitz.open(pdf_path) as doc:
                for page in doc:
                    text += page.get_text()

            # Fallback to pdfminer if PyMuPDF gives little content
            if len(text.strip()) < 100:
                print("   üîÑ Fallback to pdfminer...")
                text = extract_text(pdf_path)

            return text

        except Exception as e:
            print(f"   ‚ùå Digital extraction failed: {str(e)}")
            return ""

    # Main extraction logic
    try:
        is_scanned, initial_text = is_scanned_pdf(file_path)

        if is_scanned:
            final_text = extract_with_ocr(file_path)

            # OCR already ran, so repeating it cannot help. Try the digital
            # extractors instead and keep whichever result carries more text.
            if len(final_text.strip()) < 50:
                print("   ‚ö†Ô∏è Low OCR yield, attempting digital extraction fallback...")
                digital_text = extract_digital_text(file_path)
                if len(digital_text.strip()) > len(final_text.strip()):
                    final_text = digital_text
        else:
            # A digital PDF always has >= 50 characters of embedded text here.
            final_text = initial_text or extract_digital_text(file_path)

        print(f"‚úÖ Extraction complete: {len(final_text)} characters")
        return final_text

    except Exception as e:
        print(f"‚ùå Extraction failed: {str(e)}")
        return ""

print("‚úÖ Text extraction function defined!")


‚úÖ Text extraction function defined!


In [3]:
def clean_and_normalize_text(raw_text):
    """
    Clean and normalize extracted resume text for downstream parsing.

    Steps: strip page header/footer boilerplate, collapse whitespace, remove
    characters outside a resume-friendly set, and drop lines of one character
    or less (blank lines included).

    Args:
        raw_text: Text as returned by the extractor; may be empty or None.

    Returns:
        str: The cleaned text with one non-empty line per row, or "" for
        empty input.
    """
    print("üßπ Cleaning and normalizing text...")

    if not raw_text:
        return ""

    text = raw_text

    # Header/footer boilerplate. Applied case-insensitively; '^'/'$' are per line.
    header_footer_patterns = [
        r'Page \d+ of \d+',
        # Bare footer site names only: a host followed by a path (profile links
        # such as www.linkedin.com/in/<id>) or by a further domain label is kept
        # so the link extractors can still see it.
        r'www\.[\w\-\.]+\.com(?![\w/]|\.\w)',
        # Copyright notice; the sign is written as an escape so the pattern
        # does not depend on the file's encoding.
        r'\u00a9.*?\d{4}',
        # Only lines that START with "Confidential" are boilerplate; a sentence
        # that merely mentions confidential data must not be truncated.
        r'^[ \t]*Confidential\b.*$',
        r'Resume.*Page.*\d+',
    ]

    for pattern in header_footer_patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.MULTILINE)

    # Normalize whitespace and line breaks
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple line breaks to double
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs to single space
    text = re.sub(r'\n ', '\n', text)  # Remove spaces at line starts

    # Remove excessive special characters but keep important ones
    text = re.sub(r'[^\w\s@.+\-():,/\\|&%#\n]', '', text)

    # Drop lines of one character or less; they are usually extraction artifacts.
    stripped_lines = (line.strip() for line in text.split('\n'))
    cleaned_text = '\n'.join(line for line in stripped_lines if len(line) > 1)

    print(f"   ‚úÖ Cleaned text length: {len(cleaned_text)} chars")
    return cleaned_text

print("‚úÖ Text cleaning function defined!")


‚úÖ Text cleaning function defined!


In [4]:
def extract_skills(text):
    """
    Extract known technical skills from resume text.

    Skill-like sections ("Skills:", "Languages:", "Tools:", ...) are located
    first and searched against a fixed skill list plus known aliases. Content
    in parentheses such as 'SQL(Oracle, MySQL)' is unpacked before matching.
    When fewer than three skills are found, the whole document is searched too.

    Args:
        text: Resume text.

    Returns:
        list[str]: Canonical skill names from TECHNICAL_SKILLS, sorted.
    """
    print("üõ†Ô∏è Extracting skills with ADVANCED parsing...")

    # COMPREHENSIVE Technical Skills Database
    TECHNICAL_SKILLS = [
        # Programming Languages
        'C++', 'C#', 'C', 'Python', 'Java', 'JavaScript', 'TypeScript', 'PHP', 'Ruby',
        'Swift', 'Kotlin', 'Go', 'Rust', 'Scala', 'R', 'MATLAB', 'Dart', 'Perl', 'SQL',

        # Web Technologies
        'HTML', 'CSS', 'React', 'Angular', 'Vue', 'Node.js', 'Express', 'Django', 'Flask',
        'Spring', 'Bootstrap', 'Tailwind CSS', 'jQuery', 'SASS', 'LESS', 'REST API', 'GraphQL',

        # Databases & Database Technologies
        'MySQL', 'PostgreSQL', 'MongoDB', 'Redis', 'Oracle', 'SQLite', 'NoSQL', 'Cassandra',
        'Firebase', 'DynamoDB', 'MariaDB', 'Neo4j', 'Elasticsearch', 'SQL Server',

        # Data Science & ML
        'Scikit-learn', 'Pandas', 'NumPy', 'Matplotlib', 'Seaborn', 'TensorFlow', 'PyTorch',
        'Keras', 'OpenCV', 'NLTK', 'Spark', 'Hadoop',

        # Development Tools & Version Control
        'Git', 'GitHub', 'GitLab', 'Bitbucket', 'SVN', 'VS Code', 'Visual Studio Code',
        'PyCharm', 'IntelliJ', 'Eclipse', 'Android Studio', 'Xcode',

        # IDEs & Editors
        'Jupyter', 'Jupyter Notebook', 'Google Colab', 'Anaconda', 'Atom', 'Sublime Text',

        # Cloud Platforms
        'AWS', 'Azure', 'Google Cloud', 'GCP', 'Docker', 'Kubernetes', 'Jenkins',

        # Mobile Development
        'Android', 'iOS', 'React Native', 'Flutter', 'Xamarin', 'Ionic',

        # Operating Systems
        'Linux', 'Windows', 'MacOS', 'Unix', 'Ubuntu',

        # Libraries & Frameworks
        'React.js', 'Angular.js', 'Vue.js', 'Express.js', 'Next.js', 'Nuxt.js'
    ]

    # Skill variations and aliases
    SKILL_VARIANTS = {
        'C++': ['C plus plus', 'Cpp', 'CPP', 'c++'],
        'C#': ['C sharp', 'CSharp', 'c#'],
        'JavaScript': ['JS', 'Javascript', 'ECMAScript', 'js'],
        'React': ['React.js', 'ReactJS'],
        'Node.js': ['NodeJS', 'Node'],
        'Tailwind CSS': ['Tailwind', 'TailwindCSS'],
        'Visual Studio Code': ['VS Code', 'VSCode'],
        'Jupyter Notebook': ['Jupyter'],
        'GitHub': ['Github'],
        'MongoDB': ['Mongo'],
        'PostgreSQL': ['Postgres'],
        'Scikit-learn': ['sklearn', 'scikit learn']
    }

    SYMBOL_CHARS = ('+', '#', '.', '-')

    def preprocess_skills_text(text):
        """
        Append the pieces of 'Word(content)' constructs to the text so that
        'SQL(Oracle, MySQL)' also exposes 'SQL', 'Oracle' and 'MySQL' as
        stand-alone tokens. Level words (Basic, Advanced, ...) are dropped.
        """
        processed_text = text

        # Pattern: Word(content) -> Word, content
        parentheses_pattern = r'(\w+)\s*\(([^)]+)\)'

        for main_item, inside_content in re.findall(parentheses_pattern, text):
            processed_text += f" {main_item} "

            for item in inside_content.split(','):
                clean_item = re.sub(
                    r'\b(Basic|Library|Framework|Advanced|Intermediate)\b',
                    '', item.strip(), flags=re.IGNORECASE
                ).strip()
                if clean_item and len(clean_item) > 1:
                    processed_text += f" {clean_item} "

        return processed_text

    def contains_term(term, haystack):
        """Return True when lower-cased `term` occurs in lower-cased `haystack`."""
        if any(char in term for char in SYMBOL_CHARS):
            # Names carrying symbols (c++, c#, node.js, scikit-learn) cannot use
            # \b reliably; a plain substring test is what the previous
            # three-pattern cascade reduced to.
            return term in haystack

        # Whole-word match that must not be the stem of a symbol-suffixed name:
        # the "c" in "c++" or "c#" is not the C language.
        pattern = r'\b' + re.escape(term) + r'\b(?!\+\+|#)'
        return re.search(pattern, haystack) is not None

    def extract_skills_with_advanced_matching(text):
        """Return the set of canonical skills whose name or alias occurs in `text`."""
        found_skills = set()

        # Preprocess text to handle parentheses
        processed_text = preprocess_skills_text(text)
        text_for_search = ' ' + processed_text.lower() + ' '

        for skill in TECHNICAL_SKILLS:
            # Canonical name first, then its aliases.
            candidates = [skill] + SKILL_VARIANTS.get(skill, [])
            if any(contains_term(candidate.lower(), text_for_search) for candidate in candidates):
                found_skills.add(skill)

        return found_skills

    def locate_skills_sections(text):
        """Concatenate the text of every skills-like section; fall back to `text`."""
        section_patterns = [
            r'(?i)skills?\s*:?',
            r'(?i)languages?\s*:',
            r'(?i)database\s*:',
            r'(?i)dev[-\s]?tools?\s*:',
            r'(?i)technologies?\s*:',
            r'(?i)libraries?/?frameworks?\s*:',
            r'(?i)frameworks?\s*:',
            r'(?i)tools?\s*:',
            r'(?i)programming\s+languages?\s*:',
            r'(?i)technical\s+skills?\s*:'
        ]

        # A section ends at the next section header or a paragraph break.
        end_patterns = [
            r'(?i)\n\s*(?:experience|education|projects?|certifications?|contact)',
            r'\n\s*[A-Z][A-Z\s]+:',  # Next all-caps header
            r'\n\n\n+',  # Multiple newlines
            r'\n\s*\n\s*[A-Z]'  # Double newline followed by capital letter
        ]

        skills_content = []

        for pattern in section_patterns:
            for match in re.finditer(pattern, text):
                start_pos = match.end()

                end_pos = len(text)
                for end_pattern in end_patterns:
                    end_match = re.search(end_pattern, text[start_pos:])
                    if end_match:
                        potential_end = start_pos + end_match.start()
                        if potential_end > start_pos + 20:  # Ensure reasonable section length
                            end_pos = potential_end
                            break

                section_text = text[start_pos:end_pos].strip()
                if len(section_text) > 5:
                    skills_content.append(section_text)
                    print(f"   üìç Found section: {section_text[:50]}...")

        return ' '.join(skills_content) if skills_content else text

    # Main extraction process
    print("   üéØ Locating all skills sections...")
    skills_text = locate_skills_sections(text)

    print("   üîç Applying advanced skills matching...")
    print(f"   üìù Processing text: {skills_text[:200]}...")

    found_skills = extract_skills_with_advanced_matching(skills_text)

    # If few skills found, try entire document
    if len(found_skills) < 3:
        print("   üîÑ Expanding search to full document...")
        found_skills.update(extract_skills_with_advanced_matching(text))

    final_skills = sorted(found_skills)

    print(f"   ‚úÖ Extracted {len(final_skills)} skills: {final_skills}")

    return final_skills


In [None]:
def extract_name(text):
    """
    Extract the candidate's full name from resume text.

    Strategy 1 scans the first five lines for a 2-4 word name whose words all
    start with a capital letter, skipping lines that look like headings or
    contact details. Strategy 2 falls back to spaCy PERSON entities found in
    the first 500 characters (uses the module-level `nlp` pipeline).

    Args:
        text: Resume text.

    Returns:
        str: The detected name, or "" when none is found.
    """
    print("üë§ Extracting name...")

    # Common name indicators
    name_indicators = [
        r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)',  # First line capitalized words
        r'Name\s*:?\s*([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)',
        r'([A-Z][A-Z\s]+)',  # All caps name
    ]

    # Lines containing these keywords are headings/contact details, not names.
    # 'cv' is matched as a whole word only: as a bare substring it also hit
    # surnames such as "McVey" and made the real name line get skipped.
    skip_line = re.compile(
        r'resume|\bcv\b|curriculum|vitae|phone|email|address|objective',
        re.IGNORECASE,
    )

    # Strategy 1: look for name patterns in the first few lines
    for line in text.split('\n')[:5]:
        line = line.strip()
        if len(line) < 3 or len(line) > 50:  # Skip very short or long lines
            continue

        if skip_line.search(line):
            continue

        for pattern in name_indicators:
            match = re.search(pattern, line, re.IGNORECASE)
            if not match:
                continue

            name = match.group(1).strip()
            # Validate name (2-4 words, each starting with capital)
            name_parts = name.split()
            if 2 <= len(name_parts) <= 4 and all(part[0].isupper() for part in name_parts):
                print(f"   ‚úÖ Found name: {name}")
                return name

    # Strategy 2: Use spaCy NER for person names
    try:
        doc = nlp(text[:500])  # Check first 500 chars
        for ent in doc.ents:
            if ent.label_ == "PERSON" and len(ent.text.split()) >= 2:
                name = ent.text.strip()
                print(f"   ‚úÖ Found name via NER: {name}")
                return name
    except Exception as e:
        # Best effort: NER problems must not abort parsing, but should be visible.
        print(f"   ‚ö†Ô∏è NER lookup failed: {str(e)}")

    print("   ‚ö†Ô∏è Name not found")
    return ""

def extract_email(text):
    """
    Extract the first email address found in resume text.

    Args:
        text: Resume text.

    Returns:
        str: The first email address, or "" when none is found.
    """
    print("üìß Extracting email...")

    # The TLD class is [A-Za-z]. The former [A-Z|a-z] treated '|' as a literal
    # member, so "a@b.com|555..." was returned as "a@b.com|" on header lines
    # that separate contact details with pipes.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

    match = re.search(email_pattern, text)
    if match:
        email = match.group(0)
        print(f"   ‚úÖ Found email: {email}")
        return email

    print("   ‚ö†Ô∏è Email not found")
    return ""

def extract_mobile(text):
    """
    Extract a mobile/phone number from resume text.

    Patterns are tried in order of specificity (international formats first).
    The first match of the first pattern that hits is returned with all
    separators removed; a leading '+' is preserved.

    Args:
        text: Resume text.

    Returns:
        str: Digits (optionally prefixed with '+'), or "" when no number with
        10-15 digits is found.
    """
    print("üì± Extracting mobile number...")

    # Digit guards ((?<!\d) / (?!\d)) stop a pattern from returning a slice of a
    # longer digit run such as an ID or account number.
    phone_patterns = [
        r'\+\d{1,3}[-.\s]?\d{10}(?!\d)',  # +91-9876543210
        r'\+\d{1,3}\s?\(\d{3}\)\s?\d{3}[-.\s]?\d{4}(?!\d)',  # +1 (555) 123-4567
        r'\+\d{1,3}[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)',  # +1 555-123-4567
        r'\+\d{1,3}[-.\s]?\d{5}[-.\s]?\d{5}(?!\d)',  # +91 98765 43210
        r'(?<!\d)\d{10}(?!\d)',  # 9876543210
        r'(?<!\d)\d{3}[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)',  # 987-654-3210
        r'\(\d{3}\)\s?\d{3}[-.\s]?\d{4}(?!\d)',  # (987) 654-3210
    ]

    for pattern in phone_patterns:
        match = re.search(pattern, text)
        if not match:
            continue

        # Remove special characters except +
        phone = re.sub(r'[^\d+]', '', match.group(0))

        # Validate length (should be 10-15 digits)
        digit_count = sum(ch.isdigit() for ch in phone)
        if 10 <= digit_count <= 15:
            print(f"   ‚úÖ Found mobile: {phone}")
            return phone

    print("   ‚ö†Ô∏è Mobile number not found")
    return ""

def extract_linkedin(text):
    """
    Extract the LinkedIn profile URL from resume text.

    The profile is returned in canonical form, "https://linkedin.com/in/<id>",
    whatever scheme or "www." prefix the resume used.

    Args:
        text: Resume text.

    Returns:
        str: The canonical profile URL, or "" when none is found.
    """
    print("üîó Extracting LinkedIn profile...")

    # One pattern is enough: the former "www." and "https://" variants could
    # never be reached because this bare pattern matches whenever they would,
    # and its match never starts with "http".
    match = re.search(r'linkedin\.com/in/[\w\-]+', text, re.IGNORECASE)
    if match:
        linkedin = 'https://' + match.group(0)
        print(f"   ‚úÖ Found LinkedIn: {linkedin}")
        return linkedin

    print("   ‚ö†Ô∏è LinkedIn not found")
    return ""

def extract_github(text):
    """
    Extract the GitHub profile URL from resume text.

    Only the account segment is kept, so a repository link such as
    "github.com/<user>/<repo>" yields "https://github.com/<user>".

    Args:
        text: Resume text.

    Returns:
        str: The profile URL in the form "https://github.com/<user>", or ""
        when none is found.
    """
    print("üêô Extracting GitHub profile...")

    # One pattern is enough: the former "www." and "https://" variants could
    # never be reached because this bare pattern matches whenever they would,
    # and its match never starts with "http".
    match = re.search(r'github\.com/[\w\-]+', text, re.IGNORECASE)
    if match:
        github = 'https://' + match.group(0)
        print(f"   ‚úÖ Found GitHub: {github}")
        return github

    print("   ‚ö†Ô∏è GitHub not found")
    return ""

def extract_portfolio(text):
    """
    Extract a portfolio/personal website URL from resume text.

    Returns the first http(s) URL that does not belong to a well-known
    platform (LinkedIn, GitHub, webmail providers).

    Args:
        text: Resume text.

    Returns:
        str: The URL, without a trailing sentence period, or "" when none
        is found.
    """
    print("üåê Extracting portfolio website...")

    # The host may have any number of labels (jane.github.io, site.co.uk); the
    # former pattern allowed exactly one dot and truncated such hosts. Path
    # segments may contain dots as well (e.g. /resume.pdf).
    url_pattern = r'https?://(?:www\.)?[\w\-]+(?:\.[\w\-]+)+(?:/[\w\-.~%]*)*'

    # Filter out common platforms
    excluded_domains = ['linkedin.com', 'github.com', 'gmail.com', 'yahoo.com', 'outlook.com']

    for match in re.finditer(url_pattern, text, re.IGNORECASE):
        # A period directly after the URL is sentence punctuation, not path.
        url = match.group(0).rstrip('.')
        if not any(domain in url.lower() for domain in excluded_domains):
            print(f"   ‚úÖ Found portfolio: {url}")
            return url

    print("   ‚ö†Ô∏è Portfolio not found")
    return ""

print("‚úÖ All extraction functions defined!")

In [None]:
def extract_education(text):
    """
    Extract education entries from resume text.

    The education section is located by its heading (the whole text is used
    when no heading is found). Every degree mention becomes one entry, in
    document order. Institution, year and grade are looked up in the text
    between this degree and the next one, so that several degrees do not all
    receive the first values of the section; when that span has no hit, the
    whole section is searched instead.

    Args:
        text: Resume text.

    Returns:
        list[dict]: One dict per degree with the keys "degree", "institution",
        "year" and "grade" (missing values are "").
    """
    print("üéì Extracting education...")

    education_list = []

    # Education section patterns
    education_patterns = [
        r'(?i)education\s*:?',
        r'(?i)academic\s+background\s*:?',
        r'(?i)qualifications?\s*:?',
        r'(?i)degrees?\s*:?'
    ]

    # The section ends at the next section heading or a paragraph break.
    end_patterns = [
        r'(?i)\n\s*(?:experience|work|employment|skills|projects?|certifications?)',
        r'\n\s*[A-Z][A-Z\s]+:',
        r'\n\n\n+'
    ]

    # Find education section
    education_section = ""
    for pattern in education_patterns:
        match = re.search(pattern, text)
        if match:
            start_pos = match.end()
            end_pos = len(text)
            for end_pattern in end_patterns:
                end_match = re.search(end_pattern, text[start_pos:])
                if end_match:
                    end_pos = start_pos + end_match.start()
                    break

            education_section = text[start_pos:end_pos].strip()
            break

    if not education_section:
        education_section = text  # Use full text if no section found

    # Degree patterns. The leading \b keeps abbreviations such as "MS" or "BE"
    # from matching at the end of ordinary words ("systems ", "maybe ").
    degree_patterns = [
        r'\b(B\.?Tech|Bachelor of Technology|BTech)\s+(?:in\s+)?([^,\n]+)',
        r'\b(B\.?E\.?|Bachelor of Engineering)\s+(?:in\s+)?([^,\n]+)',
        r'\b(M\.?Tech|Master of Technology|MTech)\s+(?:in\s+)?([^,\n]+)',
        r'\b(M\.?S\.?|Master of Science)\s+(?:in\s+)?([^,\n]+)',
        r'\b(MBA|Master of Business Administration)\b',
        r'\b(Ph\.?D\.?|Doctor of Philosophy)\s+(?:in\s+)?([^,\n]+)',
        r'\b(B\.?Sc\.?|Bachelor of Science)\s+(?:in\s+)?([^,\n]+)',
        r'\b(M\.?Sc\.?|Master of Science)\s+(?:in\s+)?([^,\n]+)',
    ]

    # Institution patterns
    inst_patterns = [
        r'(?:from\s+|at\s+)?([A-Z][^,\n]+(?:University|Institute|College|School))',
        r'([A-Z][^,\n]+(?:University|Institute|College|School))',
    ]

    # Year patterns
    year_patterns = [
        r'(20\d{2})',
        r'(19\d{2})',
        r'(\d{4})'
    ]

    # Grade patterns
    grade_patterns = [
        r'(\d+\.?\d*\s*(?:CGPA|GPA|cgpa|gpa))',
        r'(\d+\.?\d*%)',
        r'(\d+\.?\d*/10)',
        r'(\d+\.?\d*/4\.0)'
    ]

    def first_group(patterns, haystack, flags=0):
        """Return group 1 of the first pattern that matches `haystack`, else ""."""
        for candidate in patterns:
            found = re.search(candidate, haystack, flags)
            if found:
                return found.group(1).strip()
        return ""

    # Collect degree mentions keyed by start offset: a phrase matched by two
    # patterns (e.g. "Master of Science") is recorded once, and sorting the
    # offsets yields document order.
    degree_matches = {}
    for pattern in degree_patterns:
        for degree_match in re.finditer(pattern, education_section, re.IGNORECASE):
            degree_matches.setdefault(degree_match.start(), degree_match)

    starts = sorted(degree_matches)
    for index, start in enumerate(starts):
        degree_match = degree_matches[start]
        degree = degree_match.group(1)
        # The MBA pattern has no field-of-study group.
        has_field = degree_match.re.groups > 1 and degree_match.group(2)
        field = degree_match.group(2) if has_field else ""

        # Details of this degree live between it and the next degree mention.
        context_end = starts[index + 1] if index + 1 < len(starts) else len(education_section)
        degree_context = education_section[start:context_end]

        institution = (first_group(inst_patterns, degree_context, re.IGNORECASE)
                       or first_group(inst_patterns, education_section, re.IGNORECASE))
        year = (first_group(year_patterns, degree_context)
                or first_group(year_patterns, education_section))
        grade = (first_group(grade_patterns, degree_context)
                 or first_group(grade_patterns, education_section))

        education_list.append({
            "degree": f"{degree} {field}".strip(),
            "institution": institution,
            "year": year,
            "grade": grade
        })

    print(f"   ‚úÖ Found {len(education_list)} education entries")
    return education_list

def extract_experience(text):
    """
    Extract work-experience entries from resume text.

    The experience section is located by its heading (the whole text is used
    when no heading is found) and scanned line by line. A line that looks like
    a company name starts a new entry; role, date range and responsibility
    lines are attached to the entry that is currently open.

    Args:
        text: Resume text.

    Returns:
        list[dict]: One dict per entry. "responsibilities" (a list of str) is
        always present; "company", "role", "start_date" and "end_date" are
        present only when detected.
    """
    print("üíº Extracting work experience...")

    experience_list = []

    # Experience section patterns
    exp_patterns = [
        r'(?i)(?:work\s+)?experience\s*:?',
        r'(?i)employment\s+history\s*:?',
        r'(?i)professional\s+experience\s*:?',
        r'(?i)career\s+history\s*:?'
    ]

    # The section ends at the next section heading or a paragraph break.
    end_patterns = [
        r'(?i)\n\s*(?:education|skills|projects?|certifications?)',
        r'\n\s*[A-Z][A-Z\s]+:',
        r'\n\n\n+'
    ]

    # Find experience section
    experience_section = ""
    for pattern in exp_patterns:
        match = re.search(pattern, text)
        if match:
            start_pos = match.end()
            end_pos = len(text)
            for end_pattern in end_patterns:
                end_match = re.search(end_pattern, text[start_pos:])
                if end_match:
                    end_pos = start_pos + end_match.start()
                    break

            experience_section = text[start_pos:end_pos].strip()
            break

    if not experience_section:
        # Look for company patterns in full text
        experience_section = text

    # Company and role patterns
    company_patterns = [
        r'([A-Z][^,\n]+(?:Inc|Ltd|LLC|Corp|Company|Technologies|Systems|Solutions|Pvt))',
        r'([A-Z][a-zA-Z\s&]+)\s*(?:\||,|\n)',
    ]

    role_patterns = [
        r'(?:as\s+)?([A-Z][^,\n]+(?:Engineer|Developer|Analyst|Manager|Intern|Consultant))',
        r'Position\s*:\s*([^,\n]+)',
        r'Role\s*:\s*([^,\n]+)',
    ]

    # Range separators: hyphen, en dash, em dash. Written as escapes because
    # the literal characters in this file were mis-encoded and no longer
    # matched real dashes.
    dash = r'[-\u2013\u2014]'
    month_year = r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}'
    date_patterns = [
        r'(' + month_year + r')\s*' + dash + r'\s*(' + month_year + r'|Present)',
        r'(\d{4})\s*' + dash + r'\s*(\d{4}|Present)',
        r'(\d{1,2}/\d{4})\s*' + dash + r'\s*(\d{1,2}/\d{4}|Present)'
    ]

    # Leading list marker: bullet, middle dot, small black/white square, hyphen.
    bullet_prefix = r'^\s*[\u2022\u00b7\u25aa\u25ab-]\s*'
    action_keywords = ['developed', 'built', 'created', 'managed', 'led', 'implemented']

    def has_content(entry):
        """True when the entry holds more than an empty responsibilities list."""
        return any(key != "responsibilities" for key in entry) or bool(entry.get("responsibilities"))

    # Extract experience entries
    current_entry = {}

    for line in experience_section.split('\n'):
        line = line.strip()
        if len(line) < 3:
            continue

        # Check for company: a hit closes the open entry and starts a new one.
        for pattern in company_patterns:
            match = re.search(pattern, line)
            if match:
                # Lines before the first company leave only an empty placeholder
                # behind; that must not be emitted as an experience entry.
                if has_content(current_entry):
                    experience_list.append(current_entry)
                current_entry = {"company": match.group(1).strip()}
                break

        # Check for role
        for pattern in role_patterns:
            match = re.search(pattern, line)
            if match:
                current_entry["role"] = match.group(1).strip()
                break

        # Check for dates
        for pattern in date_patterns:
            match = re.search(pattern, line)
            if match:
                current_entry["start_date"] = match.group(1)
                current_entry["end_date"] = match.group(2)
                break

        # Collect responsibilities
        current_entry.setdefault("responsibilities", [])

        # Bullet lines and lines with an action verb count as responsibilities.
        lowered = line.lower()
        if re.match(bullet_prefix, line) or any(keyword in lowered for keyword in action_keywords):
            responsibility = re.sub(bullet_prefix, '', line).strip()
            if responsibility:
                current_entry["responsibilities"].append(responsibility)

    if has_content(current_entry):
        experience_list.append(current_entry)

    print(f"   ‚úÖ Found {len(experience_list)} experience entries")
    return experience_list

def extract_projects(text):
    """
    Extract project entries from resume text.

    The projects section is located by its heading and split into entries at
    numbered lines ("1. ...") or "Title: ..." lines. Within an entry, the first
    line is the name, technology lines ("Technologies: a, b", "... using a, b")
    feed the technology list, and all other lines form the description.

    Args:
        text: Resume text.

    Returns:
        list[dict]: One dict per project with the keys "name", "description",
        "technologies" (list of str) and "link" ("" when absent). Empty when
        no projects section is found.
    """
    print("üöÄ Extracting projects...")

    projects_list = []

    # Projects section patterns
    project_patterns = [
        r'(?i)projects?\s*:?',
        r'(?i)personal\s+projects?\s*:?',
        r'(?i)academic\s+projects?\s*:?',
        r'(?i)key\s+projects?\s*:?'
    ]

    # The section ends at the next section heading or a paragraph break.
    end_patterns = [
        r'(?i)\n\s*(?:experience|education|skills|certifications?)',
        r'\n\s*[A-Z][A-Z\s]+:',
        r'\n\n\n+'
    ]

    # Find projects section
    projects_section = ""
    for pattern in project_patterns:
        match = re.search(pattern, text)
        if match:
            start_pos = match.end()
            end_pos = len(text)
            for end_pattern in end_patterns:
                end_match = re.search(end_pattern, text[start_pos:])
                if end_match:
                    end_pos = start_pos + end_match.start()
                    break

            projects_section = text[start_pos:end_pos].strip()
            break

    if projects_section:
        # Phrases that introduce a technology list.
        tech_label = r'(?:technologies?(?:\s+used)?|tech\s+stack|built\s+with|using)'

        # A new entry starts at a numbered line or at a "Title:" line. The
        # title and its colon must share one line, and a technology label
        # ("Technologies: ...") is a detail of the current project, not a title.
        entry_boundary = (r'\n\s*(?=(?!(?i:' + tech_label + r')\b)[A-Z][^:\n]+:|\d+\.)')

        # Whole-word label followed by the list itself; word boundaries keep
        # "using" from matching inside words such as "focusing".
        tech_line = re.compile(r'\b' + tech_label + r'\b\s*:?\s*(.*)', re.IGNORECASE)

        link_pattern = r'https?://[\w\-\.]+(?:/[\w\-.~%]*)*'

        for entry in re.split(entry_boundary, projects_section):
            if len(entry.strip()) < 10:
                continue

            lines = entry.split('\n')

            # First line is usually project name
            project = {"name": re.sub(r'^\d+\.\s*', '', lines[0].strip())}

            description_lines = []
            technologies = []

            for line in lines[1:]:
                line = line.strip()
                if not line:
                    continue

                tech_match = tech_line.search(line)
                if tech_match is None:
                    description_lines.append(line)
                    continue

                # "Built a site using React, Node": the sentence stays part of
                # the description; only what follows the label is the list.
                if line[:tech_match.start()].strip():
                    description_lines.append(line)

                # Split by common separators; drop a sentence-final period.
                for tech in re.split(r'[,;|]', tech_match.group(1)):
                    tech = tech.strip().rstrip('.')
                    if tech:
                        technologies.append(tech)

            project["description"] = ' '.join(description_lines)
            project["technologies"] = technologies

            # Look for links
            link_match = re.search(link_pattern, entry)
            project["link"] = link_match.group(0).rstrip('.') if link_match else ""

            projects_list.append(project)

    print(f"   ‚úÖ Found {len(projects_list)} projects")
    return projects_list

def extract_certifications(text):
    """
    Extract certifications from resume text.

    The first header-like match (certifications / certificates / licenses /
    achievements, case-insensitive, anywhere in the text) marks the start of
    the section. The section ends at the EARLIEST following header-style
    boundary (a line starting with experience/education/skills/project(s), or
    an ALL-CAPS label ending in a colon). Only when no such header follows is
    a run of three or more newlines used as the boundary; otherwise the
    section runs to the end of the text.

    Each non-trivial line of the section is parsed with the shape
    "Certification Name - Issuer (Year)"; issuer and year are optional.

    Args:
        text: Resume text (newline-separated lines).

    Returns:
        list[dict]: One dict per entry with string keys "name", "issuer" and
        "year" ("issuer"/"year" are "" when absent). Empty list when no
        section is found.
    """
    print("üèÜ Extracting certifications...")

    certifications_list = []

    # Certifications section patterns
    cert_patterns = [
        r'(?i)certifications?\s*:?',
        r'(?i)certificates?\s*:?',
        r'(?i)licenses?\s*:?',
        r'(?i)achievements?\s*:?'
    ]

    # Header-style boundaries: whichever occurs first in the text ends the section.
    header_end_patterns = [
        r'(?i)\n\s*(?:experience|education|skills|projects?)',
        r'\n\s*[A-Z][A-Z\s]+:'
    ]
    # Weak boundary, used only when no header-style boundary follows.
    blank_run_pattern = r'\n\n\n+'

    # Find certifications section
    cert_section = ""
    for pattern in cert_patterns:
        match = re.search(pattern, text)
        if not match:
            continue

        start_pos = match.end()
        remainder = text[start_pos:]

        # Previously the first *pattern* to match won, so a distant
        # "Education" header could beat a nearer "LANGUAGES:" header and the
        # lines in between were reported as certifications.
        header_hits = []
        for end_pattern in header_end_patterns:
            end_match = re.search(end_pattern, remainder)
            if end_match:
                header_hits.append(end_match.start())

        if header_hits:
            end_pos = start_pos + min(header_hits)
        else:
            blank_match = re.search(blank_run_pattern, remainder)
            end_pos = start_pos + blank_match.start() if blank_match else len(text)

        cert_section = text[start_pos:end_pos].strip()
        # Only the first header pattern that matches is used, even if its
        # section turns out to be empty.
        break

    if cert_section:
        # Pattern: "Certification Name - Issuer (Year)"
        cert_pattern = r'([^-\(]+)(?:\s*-\s*([^(]+))?(?:\s*\((\d{4})\))?'

        for line in cert_section.split('\n'):
            line = line.strip()
            if len(line) < 5:
                continue

            # Remove bullet points
            line = re.sub(r'^\s*[‚Ä¢¬∑‚ñ™‚ñ´-]\s*', '', line)

            match = re.search(cert_pattern, line)
            if not match:
                continue

            name = match.group(1).strip()
            if not name:
                # Nothing but separators/whitespace before the first '-' or '('.
                continue

            certifications_list.append({
                "name": name,
                "issuer": match.group(2).strip() if match.group(2) else "",
                "year": match.group(3) if match.group(3) else ""
            })

    print(f"   ‚úÖ Found {len(certifications_list)} certifications")
    return certifications_list

def extract_summary(text):
    """
    Extract the summary/objective paragraph from resume text.

    Header patterns (summary, objective, career objective, profile, about me;
    case-insensitive, matched anywhere in the text) are tried in order. For
    the first occurrence of each, the section runs from the end of the header
    to the EARLIEST following header-style boundary (a line starting with
    experience/education/skills/project(s), or an ALL-CAPS label ending in a
    colon). Only when no such header follows is the first blank line used as
    the boundary; otherwise the section runs to the end of the text.

    Args:
        text: Resume text (newline-separated lines).

    Returns:
        str: The stripped section text of the first header whose section is
        longer than 20 characters, or "" when none qualifies.
    """
    print("üìù Extracting summary/objective...")

    # Summary section patterns
    summary_patterns = [
        r'(?i)(?:professional\s+)?summary\s*:?',
        r'(?i)objective\s*:?',
        r'(?i)career\s+objective\s*:?',
        r'(?i)profile\s*:?',
        r'(?i)about\s+me\s*:?'
    ]

    # Header-style boundaries: whichever occurs first in the text ends the section.
    header_end_patterns = [
        r'(?i)\n\s*(?:experience|education|skills|projects?)',
        r'\n\s*[A-Z][A-Z\s]+:'
    ]
    # Weak boundary, used only when no header-style boundary follows.
    blank_line_pattern = r'\n\n'

    for pattern in summary_patterns:
        match = re.search(pattern, text)
        if not match:
            continue

        start_pos = match.end()
        remainder = text[start_pos:]

        # Previously the first *pattern* to match won, so a distant
        # "Education" header could beat a nearer ALL-CAPS header and the
        # summary swallowed the section in between.
        header_hits = []
        for end_pattern in header_end_patterns:
            end_match = re.search(end_pattern, remainder)
            if end_match:
                header_hits.append(end_match.start())

        if header_hits:
            end_pos = start_pos + min(header_hits)
        else:
            blank_match = re.search(blank_line_pattern, remainder)
            end_pos = start_pos + blank_match.start() if blank_match else len(text)

        summary = text[start_pos:end_pos].strip()
        if len(summary) > 20:  # Ensure it's substantial
            print(f"   ‚úÖ Found summary: {summary[:50]}...")
            return summary

    print("   ‚ö†Ô∏è Summary not found")
    return ""

def extract_location(text):
    """
    Extract location/address from resume text.

    Patterns are tried in priority order:
      1. An explicit "address" / "location" label (whole word,
         case-insensitive, optional colon) followed by comma-separated parts.
      2. Three capitalised words shaped like "City, State, Country".
      3. Two capitalised words shaped like "City, State".

    Patterns 2 and 3 are purely lexical: any "Word, Word" pair in the text
    (not only place names) will match.

    Args:
        text: Resume text.

    Returns:
        str: The first non-empty match, stripped, or "" when nothing matches.
    """
    print("üìç Extracting location...")

    # Location patterns
    location_patterns = [
        # \b keeps words such as "relocation" or "locations" from being read
        # as a "location" label.
        r'(?i)\b(?:address|location)\b\s*:?\s*([^,\n]+(?:,\s*[^,\n]+)*)',
        r'([A-Z][a-z]+,\s*[A-Z][a-z]+,?\s*[A-Z][a-z]+)',  # City, State, Country
        r'([A-Z][a-z]+,\s*[A-Z][a-z]+)',  # City, State
    ]

    for pattern in location_patterns:
        # Every pattern has exactly one capture group, so findall yields strings.
        for candidate in re.findall(pattern, text):
            location = candidate.strip()
            if not location:
                # A whitespace-only capture is not a location; keep looking.
                continue
            print(f"   ‚úÖ Found location: {location}")
            return location

    print("   ‚ö†Ô∏è Location not found")
    return ""

# Cell-completion marker: printed once the extraction function definitions above have been executed.
print("‚úÖ Advanced extraction functions defined!")

In [None]:
def parse_resume_comprehensive(file_path):
    """
    Run the full resume parsing pipeline on one file.

    Stages: raw text extraction, cleaning/normalisation, then every field
    extractor in turn. Progress and a final summary are printed.

    Returns:
        dict: On success, the keys name, email, phone, linkedin, github,
        portfolio, education, skills, experience, projects, certifications,
        summary and location. On failure, a single "error" key with a message;
        exceptions are caught, their traceback is printed, and they are
        reported through that key rather than raised.
    """
    rule = "="*80
    print(rule)
    print("üéØ STARTING COMPREHENSIVE RESUME PARSING PIPELINE")
    print(rule)

    try:
        # Stage 1: pull the raw text out of the file
        raw_text = extract_text_from_resume(file_path)
        if not raw_text:
            return {"error": "Failed to extract text from resume"}

        # Stage 2: normalise it
        cleaned_text = clean_and_normalize_text(raw_text)
        if not cleaned_text:
            return {"error": "Text cleaning resulted in empty content"}

        # Stage 3: run every extractor (each one prints its own progress,
        # so the call order below is also the log order)
        print("\nüîç Extracting ALL structured information...")
        print("-" * 50)

        # Scalar fields
        full_name = extract_name(cleaned_text)
        email_addr = extract_email(cleaned_text)
        phone_number = extract_mobile(cleaned_text)
        linkedin_url = extract_linkedin(cleaned_text)
        github_url = extract_github(cleaned_text)
        portfolio_url = extract_portfolio(cleaned_text)
        location_text = extract_location(cleaned_text)
        summary_text = extract_summary(cleaned_text)

        # Multi-entry fields
        education_entries = extract_education(cleaned_text)
        skill_list = extract_skills(cleaned_text)
        experience_entries = extract_experience(cleaned_text)
        project_entries = extract_projects(cleaned_text)
        certification_entries = extract_certifications(cleaned_text)

        # Stage 4: assemble the payload (key order is the output order)
        result = dict(
            name=full_name,
            email=email_addr,
            phone=phone_number,
            linkedin=linkedin_url,
            github=github_url,
            portfolio=portfolio_url,
            education=education_entries,
            skills=skill_list,
            experience=experience_entries,
            projects=project_entries,
            certifications=certification_entries,
            summary=summary_text,
            location=location_text,
        )

        print("\n" + rule)
        print("‚úÖ COMPREHENSIVE PARSING COMPLETED SUCCESSFULLY")
        print(rule)
        print("üìä EXTRACTION SUMMARY:")
        print(f"   üë§ Name: {full_name}")
        print(f"   üìß Email: {email_addr}")
        print(f"   üì± Phone: {phone_number}")
        print(f"   üîó LinkedIn: {linkedin_url}")
        print(f"   üêô GitHub: {github_url}")
        print(f"   üåê Portfolio: {portfolio_url}")
        print(f"   üìç Location: {location_text}")
        print(f"   üéì Education: {len(education_entries)} entries")
        print(f"   üõ†Ô∏è Skills: {len(skill_list)} skills")
        print(f"   üíº Experience: {len(experience_entries)} entries")
        print(f"   üöÄ Projects: {len(project_entries)} entries")
        print(f"   üèÜ Certifications: {len(certification_entries)} entries")
        print(f"   üìù Summary: {'Yes' if summary_text else 'No'}")
        print(rule)

        return result

    except Exception as exc:
        failure_message = str(exc)
        print(f"\n‚ùå COMPREHENSIVE PARSING FAILED: {failure_message}")
        import traceback
        traceback.print_exc()
        return {"error": f"Parsing failed: {failure_message}"}

# Old entry-point name retained so existing callers keep working
def parse_resume(file_path):
    """
    Legacy entry point: forwards to parse_resume_comprehensive and hands back
    whatever it returns.
    """
    parsed = parse_resume_comprehensive(file_path)
    return parsed

# Cell-completion marker: printed once the parser definitions above have been executed.
print("‚úÖ Comprehensive resume parser defined!")

In [5]:
def parse_resume(file_path):
    """
    Main resume parsing pipeline that combines the basic extraction functions.

    NOTE(review): the cell containing parse_resume_comprehensive also defines
    a function named parse_resume (a thin wrapper around the comprehensive
    parser). Python keeps whichever definition was executed last, so the
    schema returned by `parse_resume` depends on cell execution order.

    Returns:
        dict: On success, the keys full_name, email, mobile_number and skills.
        On failure, a single "error" key with a message; exceptions are
        caught, their traceback is printed, and they are reported through
        that key rather than raised.
    """
    print("="*60)
    print("üéØ STARTING RESUME PARSING PIPELINE")
    print("="*60)

    try:
        # Step 1: Extract raw text
        raw_text = extract_text_from_resume(file_path)
        if not raw_text:
            return {"error": "Failed to extract text from resume"}

        # Step 2: Clean and normalize text
        cleaned_text = clean_and_normalize_text(raw_text)
        if not cleaned_text:
            return {"error": "Text cleaning resulted in empty content"}

        # Step 3: Extract entities
        print("\nüîç Extracting structured information...")

        full_name = extract_name(cleaned_text)
        email = extract_email(cleaned_text)
        mobile_number = extract_mobile(cleaned_text)
        skills = extract_skills(cleaned_text)

        # Step 4: Create structured output
        result = {
            "full_name": full_name,
            "email": email,
            "mobile_number": mobile_number,
            "skills": skills
        }

        print("\n‚úÖ PARSING COMPLETED SUCCESSFULLY")
        print("="*60)
        print(f"üìä SUMMARY:")
        print(f"   Name: {full_name}")
        print(f"   Email: {email}")
        print(f"   Mobile: {mobile_number}")
        print(f"   Skills Count: {len(skills)}")
        print("="*60)

        return result

    except Exception as e:
        print(f"\n‚ùå PARSING FAILED: {str(e)}")
        # Print the stack trace (as parse_resume_comprehensive does) so that
        # programming errors such as a helper that was never defined are not
        # reduced to a one-line message.
        import traceback
        traceback.print_exc()
        return {"error": f"Parsing failed: {str(e)}"}

# Cell-completion marker: printed once the parse_resume definition above has been executed.
print("‚úÖ Main parse_resume function defined!")


‚úÖ Main parse_resume function defined!


In [None]:
from google.colab import files
import json
from datetime import datetime

def format_resume_output(result):
    """
    Project a parsing result onto the fixed output schema.

    A result carrying an "error" key is handed back as-is. Otherwise a new
    dict is built containing exactly the schema keys, in schema order; keys
    missing from `result` get "" (text fields) or [] (list fields), and any
    extra keys in `result` are dropped.
    """
    # Error payloads pass straight through
    if "error" in result:
        return result

    # (key, factory for the default) in output order; factories give each
    # call its own fresh default object.
    schema = (
        ("name", str),
        ("email", str),
        ("phone", str),
        ("linkedin", str),
        ("github", str),
        ("portfolio", str),
        ("education", list),
        ("skills", list),
        ("experience", list),
        ("projects", list),
        ("certifications", list),
        ("summary", str),
        ("location", str),
    )

    formatted_result = {}
    for key, make_default in schema:
        formatted_result[key] = result.get(key, make_default())

    return formatted_result

# Interactive driver: prompts for an upload, parses every uploaded file with
# parse_resume_comprehensive, prints the formatted JSON and writes it to a
# timestamped "<name>_parsed_<timestamp>.json" file in the working directory.
print("üéØ COMPREHENSIVE RESUME PARSER")
print("="*60)
print("üì§ Upload your resume PDF file to extract ALL information:")
print("   ‚Ä¢ Personal Information (Name, Email, Phone, LinkedIn, GitHub)")
print("   ‚Ä¢ Education (Degree, Institution, Year, Grade)")
print("   ‚Ä¢ Work Experience (Company, Role, Dates, Responsibilities)")
print("   ‚Ä¢ Projects (Name, Description, Technologies, Links)")
print("   ‚Ä¢ Skills (Technical and Soft Skills)")
print("   ‚Ä¢ Certifications (Name, Issuer, Year)")
print("   ‚Ä¢ Summary/Objective")
print("   ‚Ä¢ Location/Address")
print("="*60)

# Upload files
uploaded = files.upload()

if uploaded:
    print("\nüöÄ Processing uploaded resume(s) with COMPREHENSIVE extraction...")
    print("="*60)

    for file_name in uploaded.keys():
        print(f"\nüìã Processing: {file_name}")

        # Parse resume with comprehensive extraction
        result = parse_resume_comprehensive(file_name)

        # Format the result (error payloads pass through unchanged)
        formatted_result = format_resume_output(result)

        print(f"\nüìÑ COMPREHENSIVE RESUME DATA (JSON FORMAT):")
        print("="*60)
        print(json.dumps(formatted_result, indent=2, ensure_ascii=False))
        print("="*60)

        # Save results to JSON file with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Drop only a trailing ".pdf" extension, in any letter case.
        # str.replace('.pdf', '') removed every occurrence anywhere in the
        # name and left an upper-case ".PDF" in place.
        if file_name.lower().endswith('.pdf'):
            base_name = file_name[:-len('.pdf')]
        else:
            base_name = file_name
        output_filename = f"{base_name}_parsed_{timestamp}.json"

        # Note: an error payload is written to disk as well.
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(formatted_result, f, indent=2, ensure_ascii=False)

        print(f"üíæ Results saved to: {output_filename}")

        # Display summary statistics
        if "error" not in formatted_result:
            print(f"\nüìä EXTRACTION STATISTICS:")
            print(f"   ‚úÖ Personal Info: {sum(1 for field in ['name', 'email', 'phone'] if formatted_result.get(field))}/3")
            print(f"   ‚úÖ Social Links: {sum(1 for field in ['linkedin', 'github', 'portfolio'] if formatted_result.get(field))}/3")
            print(f"   ‚úÖ Education Entries: {len(formatted_result.get('education', []))}")
            print(f"   ‚úÖ Skills Found: {len(formatted_result.get('skills', []))}")
            print(f"   ‚úÖ Experience Entries: {len(formatted_result.get('experience', []))}")
            print(f"   ‚úÖ Projects: {len(formatted_result.get('projects', []))}")
            print(f"   ‚úÖ Certifications: {len(formatted_result.get('certifications', []))}")
            print(f"   ‚úÖ Summary: {'Yes' if formatted_result.get('summary') else 'No'}")
            print(f"   ‚úÖ Location: {'Yes' if formatted_result.get('location') else 'No'}")

        # Optionally download the JSON file
        print(f"\nüíæ To download the JSON file, uncomment the line below:")
        print(f"# files.download('{output_filename}')")

else:
    print("‚ùå No files uploaded!")

print("\n‚úÖ COMPREHENSIVE RESUME PARSING COMPLETE!")
print("üéØ The parser now extracts ALL resume information as per specification!")

üì§ Upload your resume PDF file:


Saving Soham_2025.pdf to Soham_2025.pdf

üöÄ Processing uploaded resume(s)...

üìã Processing: Soham_2025.pdf
üéØ STARTING RESUME PARSING PIPELINE
üìÑ Processing: Soham_2025.pdf
   üìä Text length: 2474, Images: 0
   üîç PDF Type: DIGITAL
‚úÖ Extraction complete: 2476 characters
üßπ Cleaning and normalizing text...
   ‚úÖ Cleaned text length: 2388 chars

üîç Extracting structured information...

‚ùå PARSING FAILED: name 'extract_name' is not defined

üìÑ PARSED RESUME DATA:
{
  "error": "Parsing failed: name 'extract_name' is not defined"
}
üíæ Results saved to: Soham_2025_parsed.json

‚úÖ Processing complete!
