<a href="https://colab.research.google.com/github/KunalGaurav90/pinnacle_02/blob/main/Untitled32.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""
Final, Robust 3D Resume Parser for Google Colab

This definitive version uses intelligent parsing, summarization, and data
cleaning, with robust name detection to provide a highly accurate and
cleanly formatted output in the 3D card.
"""
# Step 1: Install required libraries silently
!pip install PyPDF2 python-docx spacy -q
!python -m spacy download en_core_web_sm -q


# Step 2: Import all necessary modules
import os
import re
import docx
import PyPDF2
import spacy
from IPython.display import display, HTML
from google.colab import files


# Load the spaCy model for NLP tasks
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # The model package could not be loaded. Fall back to a blank English
    # pipeline so the rest of the notebook can still run.
    print("spaCy model not found. Re-run the cell or check installation.")
    nlp = spacy.blank("en")
    # A blank pipeline sets no sentence boundaries, so iterating `doc.sents`
    # (as summarize_experience does) would raise. The rule-based sentencizer
    # supplies them. The blank pipeline has no entity recogniser, so
    # parse_name_final relies on its non-NER fallbacks in this mode.
    nlp.add_pipe("sentencizer")


# Step 3: Define text extraction and cleaning functions
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*.

    A page for which ``extract_text()`` yields nothing contributes an empty
    string, so the result is always a ``str``.
    """
    with open(file_path, 'rb') as pdf_handle:
        pdf_reader = PyPDF2.PdfReader(pdf_handle)
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "".join(page_texts)


def extract_text_from_docx(file_path):
    """Return the text of the DOCX at *file_path*, one paragraph per line.

    Only the paragraphs exposed by ``Document.paragraphs`` are included.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)


def clean_text(text):
    """Normalise *text* for display.

    Each run of whitespace is collapsed to one space, a space is inserted
    wherever a lowercase ASCII letter is directly followed by an uppercase
    one, and leading/trailing whitespace is removed.
    """
    whitespace_run = re.compile(r'\s+')
    # Zero-width position between a lowercase and an uppercase letter.
    case_boundary = re.compile(r'(?<=[a-z])(?=[A-Z])')

    collapsed = whitespace_run.sub(' ', text)
    separated = case_boundary.sub(' ', collapsed)
    return separated.strip()


# Step 4: Define advanced parsing and formatting functions

# --- NEW: ROBUST NAME PARSING LOGIC ---
def get_name_from_email(email):
    """Derive a plausible personal name from an email address.

    The local part (before ``@``) is split into words at camelCase
    boundaries and at every non-letter character (dots, underscores,
    hyphens, digits, ...). A ``+tag`` sub-address suffix is ignored.

    Args:
        email: The address to inspect, or the "Not Found" sentinel.

    Returns:
        The title-cased words joined by single spaces when the local part
        yields two or three words; otherwise ``None``.
    """
    if not isinstance(email, str) or email == "Not Found" or "@" not in email:
        return None

    local_part = email.split('@')[0]
    # "jane.doe+jobs" -> "jane.doe": the tag is routing info, not a name.
    local_part = local_part.split('+', 1)[0]
    # Split camelCase first; title-casing later would erase the boundary.
    local_part = re.sub(r'([a-z])([A-Z])', r'\1 \2', local_part)
    # Keep runs of letters only, so digits and punctuation never end up in
    # the name and repeated separators cannot produce empty words.
    words = re.findall(r'[^\W\d_]+', local_part)

    if 1 < len(words) < 4:
        return " ".join(word.title() for word in words)
    return None

def parse_name_final(doc, email, all_headings):
    """Find the candidate's name using three strategies in order.

    1. The first PERSON entity in ``doc.ents`` with fewer than four words
       that contains none of the non-name keywords.
    2. The first of the opening five lines of ``doc.text`` that consists of
       two or three title-cased words, is not a section heading and contains
       none of the non-name keywords.
    3. A name derived from *email* via ``get_name_from_email``.

    Args:
        doc: Object exposing ``ents`` (items with ``label_`` and ``text``)
            and ``text``, as a spaCy ``Doc`` does.
        email: Email address found in the resume, or "Not Found".
        all_headings: Lower-case section headings to reject as names.

    Returns:
        The detected name, or "Not Found" when every strategy fails.
    """
    non_name_keywords = frozenset(
        ['cloud', 'university', 'technologies', 'solutions', 'inc', 'llc', 'resume', 'cv']
    )

    def _has_non_name_word(candidate):
        # Compare whole words: a substring test would reject real names
        # such as "Vincent" because they happen to contain "inc".
        words = re.findall(r'[a-z]+', candidate.lower())
        return any(word in non_name_keywords for word in words)

    # 1. Try spaCy's PERSON entity recognition
    for ent in doc.ents:
        if ent.label_ != "PERSON" or len(ent.text.split()) >= 4:
            continue
        if not _has_non_name_word(ent.text):
            return ent.text.strip()

    # 2. Fallback to the first few lines of text
    for line in doc.text.split('\n')[:5]:
        line = line.strip()
        words = line.split()
        if not 1 < len(words) < 4:
            continue
        if not all(word.istitle() for word in words):
            continue
        if line.lower() in all_headings or _has_non_name_word(line):
            continue
        return line

    # 3. Final fallback: derive name from email address
    name_from_email = get_name_from_email(email)
    if name_from_email:
        return name_from_email

    return "Not Found"


def parse_contact_smart(text):
    """Extract the first email address and phone number found in *text*.

    Args:
        text: Raw resume text.

    Returns:
        A dict with keys "email" and "phone"; each value is the matched
        string or "Not Found".
    """
    # The TLD class is [A-Za-z]: a "|" inside a character class is a literal
    # pipe, not alternation, and would let the match run past the address.
    email = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    # Optional country code, then 3-3-4 digits with optional parentheses
    # around the first group and "-", "." or whitespace separators.
    phone = re.search(r'(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', text)
    return {
        "email": email.group(0) if email else "Not Found",
        "phone": phone.group(0).strip() if phone else "Not Found",
    }


def parse_section_smart(text, start_keywords, end_keywords):
    """Extract the text between a section heading and the next heading.

    The section starts right after the first case-insensitive, whole-word
    occurrence of any of *start_keywords* and ends just before the earliest
    subsequent whole-word occurrence of any of *end_keywords* (or at the end
    of *text* when none occurs).

    Args:
        text: Full resume text.
        start_keywords: Literal heading strings that open the section.
        end_keywords: Literal heading strings that close the section.

    Returns:
        The stripped section text, or "Not Found" when *text* is empty or
        not a string, *start_keywords* is empty, or no start keyword occurs.
    """
    if not isinstance(text, str) or not text or not start_keywords:
        return "Not Found"

    def _whole_word_alternation(keywords):
        # Keywords are literals, so escape them: "Node.js" must not match
        # "Nodexjs", and a stray "(" must not break the pattern.
        return r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')\b'

    start_match = re.search(_whole_word_alternation(start_keywords), text, re.IGNORECASE)
    if not start_match:
        return "Not Found"

    start_index = start_match.end()
    end_index = len(text)
    if end_keywords:
        # A search is leftmost-first, so one alternation finds the nearest
        # closing heading without scanning once per keyword.
        end_pattern = re.compile(_whole_word_alternation(end_keywords), re.IGNORECASE)
        end_match = end_pattern.search(text, start_index)
        if end_match:
            end_index = end_match.start()

    return text[start_index:end_index].strip()


def summarize_experience(text, num_sentences=2):
    """Summarise an experience section as HTML bullet points.

    The text is run through the module-level ``nlp`` pipeline; the first
    *num_sentences* sentences longer than 20 characters (after stripping)
    are cleaned with ``clean_text`` and joined with ``<br>``.

    Args:
        text: Raw experience section text, or "Not Found".
        num_sentences: Maximum number of bullet points to return.

    Returns:
        A ``<br>``-separated string of "• ..." bullets, or "Not Found" when
        there is no input or no sentence is long enough to keep.
    """
    if not text or text == "Not Found":
        return "Not Found"

    doc = nlp(text)
    stripped_sentences = (sent.text.strip() for sent in doc.sents)
    sentences = [sentence for sentence in stripped_sentences if len(sentence) > 20]
    summary_points = sentences[:num_sentences]

    # Use the sentinel instead of an empty string so a section made only of
    # short fragments is reported like any other missing section.
    if not summary_points:
        return "Not Found"
    return "<br>".join(f"• {clean_text(point)}" for point in summary_points)


def parse_skills_cleanly(text):
    """Build a sorted, comma-separated, de-duplicated list of skills.

    Two sources are combined:

    * every entry of a built-in list of known skills that occurs in *text*
      as a whole term (case-insensitive), reported in its canonical
      spelling;
    * every fragment obtained by splitting *text* on commas, semicolons and
      newlines that has fewer than five words, title-cased.

    Entries are de-duplicated case-insensitively; a known skill's canonical
    spelling wins over the title-cased fragment (so "SQL", not "Sql").

    Args:
        text: Raw skills section text, or "Not Found".

    Returns:
        The skills joined by ", ", or "Not Found" when there is no input or
        nothing was recognised.
    """
    if not text or text == "Not Found":
        return "Not Found"

    known_skills = [
        'Python', 'Java', 'C++', 'C#', 'JavaScript', 'SQL', 'Tableau', 'Power BI',
        'Machine Learning', 'TensorFlow', 'PyTorch', 'Scikit-learn', 'NLP', 'Informatica',
        'Data Wrangling', 'Business Analysis', 'Predictive Modelling', 'AWS', 'Azure',
        'Docker', 'Git', 'React', 'Angular', 'Vue', 'Node.js', 'Excel', 'Einstein Analytics'
    ]

    # Maps the case-folded skill to the spelling that will be displayed.
    found_skills = {}

    for skill in known_skills:
        # Look-arounds instead of \b: \b needs a word character on one side,
        # so it can never match right after the "+" or "#" of "C++" / "C#".
        pattern = r'(?<!\w)' + re.escape(skill) + r'(?!\w)'
        if re.search(pattern, text, re.IGNORECASE):
            found_skills[skill.casefold()] = skill

    # Split by common separators like comma, semicolon, newline. No flags
    # argument here: the third positional parameter of re.split is maxsplit.
    for fragment in re.split(r'[,;\n]', text):
        candidate = " ".join(fragment.split())
        if candidate and len(candidate.split()) < 5:  # keep short phrases only
            found_skills.setdefault(candidate.casefold(), candidate.title())

    if not found_skills:
        return "Not Found"
    return ", ".join(sorted(found_skills.values()))

# --- NEW: Project and Certification Parsing (simplified) ---
def parse_list_section(text):
    """Format a list-like section (Projects, Certifications) as HTML bullets.

    The text is split into items at line breaks and bullet glyphs. A leading
    list marker ("*", "-", "1.", "2)") is removed from each item, the item is
    normalised with ``clean_text``, and items of ten characters or fewer
    are dropped.

    Args:
        text: Raw section text, or "Not Found".

    Returns:
        A ``<br>``-separated string of "• ..." bullets, or "Not Found" when
        there is no input or no item survives filtering.
    """
    if not text or text == "Not Found":
        return "Not Found"

    # Split only at real item boundaries. Splitting on every digit, hyphen or
    # period would cut through content such as "50K+", "72%" or
    # "scikit-learn".
    raw_items = re.split(r'[\r\n\u2022\u25aa\u25cf\u25e6\u2023]+', text)
    # Leading "*" / "-" bullets, or a one/two-digit number followed by "." or
    # ")" and whitespace (the whitespace keeps "3.5 GPA" intact).
    leading_marker = re.compile(r'^\s*(?:[*\-]+\s*|\d{1,2}[.)]\s+)')

    bullet_lines = []
    for raw_item in raw_items:
        unmarked = leading_marker.sub('', raw_item).strip()
        if not unmarked:
            continue
        item = clean_text(unmarked)
        if len(item) > 10:  # Filter out short items
            bullet_lines.append(f"• {item}")

    if not bullet_lines:
        return "Not Found"
    return "<br>".join(bullet_lines)

# Step 5: Orchestrate parsing and structure output
def parse_resume(file_path):
    """Parse a PDF or DOCX resume into a dict of display-ready fields.

    On success the dict has the keys "Name", "Email", "Phone", "Education",
    "Experience", "Skills", "Projects" and "Certifications". For an
    unsupported extension, empty extracted text, or any exception raised
    along the way, a dict with the single key "Error" is returned instead.
    """
    try:
        lowered_path = file_path.lower()
        if lowered_path.endswith('.pdf'):
            text = extract_text_from_pdf(file_path)
        elif lowered_path.endswith('.docx'):
            text = extract_text_from_docx(file_path)
        else:
            return {"Error": "Unsupported file type. Please upload PDF or DOCX."}

        if not text:
            return {"Error": "Could not extract text from the document."}

        doc = nlp(text)

        # Lower-case headings that must never be mistaken for the name.
        all_headings = [
            'education', 'experience', 'work experience', 'skills', 'technical skills',
            'projects', 'certifications', 'awards', 'summary', 'about', 'contact'
        ]

        # Every heading that can close a section. A section's own start
        # keywords are removed from this list when it is extracted.
        section_boundaries = [
            'Education', 'Experience', 'Skills', 'Projects',
            'Certifications', 'Awards', 'Summary', 'About'
        ]

        def extract_section(start_keywords):
            stop_keywords = [
                heading for heading in section_boundaries
                if heading not in start_keywords
            ]
            return parse_section_smart(text, start_keywords, stop_keywords)

        contact_info = parse_contact_smart(text)
        email = contact_info.get("email", "Not Found")
        name = parse_name_final(doc, email, all_headings)

        education = extract_section(['Education'])
        if education != "Not Found":
            education = clean_text(education)

        experience_summary = summarize_experience(
            extract_section(['Experience', 'Work Experience'])
        )
        skills_cleaned = parse_skills_cleanly(
            extract_section(['Skills', 'Technical Skills'])
        )
        projects_cleaned = parse_list_section(extract_section(['Projects']))
        certifications_cleaned = parse_list_section(
            extract_section(['Certifications', 'Awards'])
        )

        return {
            "Name": name,
            "Email": email,
            "Phone": contact_info.get("phone", "Not Found"),
            "Education": education,
            "Experience": experience_summary,
            "Skills": skills_cleaned,
            "Projects": projects_cleaned,
            "Certifications": certifications_cleaned,
        }

    except Exception as e:
        return {"Error": f"An error occurred during parsing: {e}"}


# Step 6: Generate 3D Card HTML
def create_3d_card(data):
    """Generate the HTML/CSS for a 3D flip card showing parsed resume data.

    The front face shows the name; the back face (revealed on hover) lists
    contact details, education, experience, skills, projects and
    certifications. Missing keys are rendered as "Not Found".

    All values come from an uploaded document and are therefore untrusted:
    each one is HTML-escaped before being placed in the markup. The only
    markup preserved inside a value is the literal ``<br>`` separator that
    the parsing helpers insert between bullet points.

    Args:
        data: Dict as returned by ``parse_resume``; either the field dict or
            a dict containing an "Error" key.

    Returns:
        An HTML string: the card, or a red error ``<div>`` when *data*
        contains "Error".
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    def _safe(value):
        # Escape each bullet separately so the "<br>" joins survive.
        return "<br>".join(escape(part) for part in str(value).split("<br>"))

    def _field(key):
        return _safe(data.get(key, 'Not Found'))

    if "Error" in data:
        return f"<div style='color: red; font-weight: bold;'>{_safe(data['Error'])}</div>"

    # Basic styling for the card
    css = """
    .card-container {
        width: 300px;
        height: 450px;
        perspective: 1000px;
        margin: 20px auto;
    }

    .card {
        width: 100%;
        height: 100%;
        position: relative;
        transform-style: preserve-3d;
        transition: transform 0.8s cubic-bezier(0.175, 0.885, 0.32, 1.275);
    }

    .card:hover {
        transform: rotateY(180deg);
    }

    .card-face {
        position: absolute;
        width: 100%;
        height: 100%;
        backface-visibility: hidden;
        border-radius: 10px;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
        padding: 20px;
        box-sizing: border-box;
        display: flex;
        flex-direction: column;
        justify-content: space-around;
        align-items: center;
        text-align: center;
    }

    .card-front {
        background-color: #f0f0f0;
        color: #333;
    }

    .card-back {
        background-color: #333;
        color: #f0f0f0;
        transform: rotateY(180deg);
        overflow-y: auto; /* Add scroll for long content */
        text-align: left;
        padding: 20px;
    }

    .card-back::-webkit-scrollbar {
        width: 8px;
    }

    .card-back::-webkit-scrollbar-track {
        background: #555;
        border-radius: 10px;
    }

    .card-back::-webkit-scrollbar-thumb {
        background: #888;
        border-radius: 10px;
    }

    .card-back::-webkit-scrollbar-thumb:hover {
        background: #bbb;
    }


    .card-title {
        font-size: 24px;
        font-weight: bold;
        margin-bottom: 10px;
    }

     .card-section-title {
        font-size: 16px;
        font-weight: bold;
        margin-top: 10px;
        margin-bottom: 5px;
        border-bottom: 1px solid #f0f0f0;
        padding-bottom: 2px;
    }

    .card-text {
        font-size: 14px;
        margin-bottom: 5px;
    }
     .card-list-item {
         font-size: 14px;
         margin-bottom: 5px;
         list-style-type: disc;
         margin-left: 20px;
         text-align: left;
     }
    """

    # Format data for the back of the card
    back_content = f"""
    <div class="card-section-title">Contact</div>
    <div class="card-text">Email: {_field('Email')}</div>
    <div class="card-text">Phone: {_field('Phone')}</div>

    <div class="card-section-title">Education</div>
    <div class="card-text">{_field('Education')}</div>

    <div class="card-section-title">Experience Summary</div>
    <div class="card-text">{_field('Experience')}</div>

    <div class="card-section-title">Skills</div>
    <div class="card-text">{_field('Skills')}</div>

    <div class="card-section-title">Projects</div>
    <div class="card-text">{_field('Projects')}</div>

    <div class="card-section-title">Certifications</div>
    <div class="card-text">{_field('Certifications')}</div>
    """


    card_markup = f"""
    <style>{css}</style>
    <div class="card-container">
        <div class="card">
            <div class="card-face card-front">
                <div class="card-title">{_field('Name')}</div>
                 <!-- You can add an image or a brief summary on the front -->
                 <img src="https://via.placeholder.com/150" alt="Profile Image" style="border-radius: 50%; margin-bottom: 10px;">
                <div class="card-text">Hover to see details</div>
            </div>
            <div class="card-face card-back">
                {back_content}
            </div>
        </div>
    </div>
    """
    return card_markup

# Step 7: Main Execution Block
def main():
    """Ask for a resume upload, then parse and display each uploaded file.

    For every uploaded file the parsed fields are printed and the 3D card
    is displayed; if parsing reports an error, only the error is printed.
    """
    print("Please upload your resume (PDF or DOCX).")
    uploaded = files.upload()

    for file_name in uploaded:
        print(f'Uploaded file: {file_name}')
        parsed_extended_data = parse_resume(file_name)

        # Report the failure and move on to the next file.
        if "Error" in parsed_extended_data:
            print(parsed_extended_data["Error"])
            continue

        print("\n--- Parsed Resume Data ---")
        for field_name, field_value in parsed_extended_data.items():
            print(f"{field_name}: {field_value}")

        # Create and display the 3D card
        display(HTML(create_3d_card(parsed_extended_data)))


# Execute the main function
if __name__ == "__main__":
    main()

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/12.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/12.8 MB[0m [31m8.5 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/12.8 MB[0m [31m40.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m8.4/12.8 MB[0m [31m80.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m12.8/12.8 MB[0m [31m171.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restar

Saving fank_resume.pdf to fank_resume (2).pdf
Uploaded file: fank_resume (2).pdf

--- Parsed Resume Data ---
Name: Missy Frank
Email: MissyFrank@email.com
Phone: (703) 555-3334
Education: Not Found
Experience: • by training model to predict duplicate questions with 72% accuracy using natural language processing (NLP), advanced feature engineering.<br>• and sklearn machine learning (ML) pipelines.
Skills: Data Wrangling, Einstein, Informatica, Machine Learning, Predictive Modelling, Python, SQL, Sql Tableau, Tableau
Projects: • " Used SQL and SOQL to cxtract, analysc, and interpret<br>• M data points: defincd<br>• key metrics and visualised data using Excel and Tableau dashboards<br>• "Enhanced demand forecasting and streamlined data flow for<br>• + interfaces using<br>• Informatica Cloud, cleaning and correlating<br>• K+ customer data records with $<br>• in sales using python and sales analytics<br>• " Created predictive models to prioritise campaigns and project customer lifetime<br>•