In [1]:
!pip install Flask




In [2]:
import os
import re
import tempfile
from threading import Thread

import PyPDF2
import spacy
from flask import Flask, render_template, request

# Initialize Flask app
# Module-level application object; the `index` view is registered on it via
# @app.route and `run_app` starts its development server.
app = Flask(__name__)

# Load NLP model for entity extraction
# Loaded once at import time and reused for every call to extract_information,
# which reads PERSON entities from the resulting Doc.
# NOTE(review): presumably requires the "en_core_web_sm" model package to be
# installed separately — confirm in the deployment environment.
nlp = spacy.load("en_core_web_sm")

# Keyword vocabularies used by extract_information: skills are looked up in
# the resume text, education keywords flag lines that mention a qualification.
SKILL_KEYWORDS = {
    "Python",
    "Java",
    "C++",
    "SQL",
    "Machine Learning",
    "NLP",
    "React",
    "Node.js",
    "Docker",
    "AWS",
    "JavaScript",
}
EDUCATION_KEYWORDS = {
    "BSc",
    "MSc",
    "PhD",
    "Bachelor",
    "Master",
    "Degree",
    "University",
    "College",
    "Engineering",
    "Computer Science",
}

# Clean up text extracted from a PDF: drop bullets and carriage returns, unify dashes
def normalize_text(text):
    """Return *text* with PDF artefacts removed and outer whitespace trimmed.

    Bullet characters and carriage returns are deleted, en dashes become
    plain hyphens, and leading/trailing whitespace is stripped.
    """
    # One translation pass instead of chained replace() calls; the three
    # substitutions are independent, so the result is the same.
    substitutions = {
        ord('\u2022'): None,
        ord('–'): '-',
        ord('\r'): None,
    }
    cleaned = text.translate(substitutions)
    return cleaned.strip()

# Read every page of a PDF and return its normalized text
def extract_text_from_pdf(pdf_path):
    """Return the normalized text of the PDF at *pdf_path*.

    Pages that yield no text are skipped; each remaining page contributes its
    text followed by a newline. Any error while opening or parsing the file is
    printed and an empty string is returned instead of raising.
    """
    try:
        with open(pdf_path, "rb") as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # Extraction is lazy, so the join must run while the file is open.
            page_texts = (page.extract_text() for page in pdf_reader.pages)
            combined = "".join(f"{chunk}\n" for chunk in page_texts if chunk)
            return normalize_text(combined)
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

# Extract demographic information (Sex and Age)
def extract_demographics(text):
    """Return the candidate's sex and age found in *text*.

    Sex is taken from a labelled field first ("Sex: F", "Gender - male"), where
    the single-letter codes M/F are accepted and expanded. Without a label only
    the whole words "Male"/"Female" count, because a bare "M" or "F" occurs in
    unrelated text such as "M.Sc".

    Age supports patterns like "Age=38", "Age: 38" and "age 38".

    Returns:
        dict with keys "Sex" ("Male", "Female" or "Not Found") and "Age"
        (the digits as a string, or "Not Found").
    """
    sex = "Not Found"
    labelled = re.search(
        r'\b(?:Sex|Gender)\s*[:=\-]?\s*(Male|Female|M|F)\b', text, re.IGNORECASE
    )
    if labelled:
        # Both "M" and "Male" start with m; everything else here is female.
        sex = "Male" if labelled.group(1)[0].lower() == "m" else "Female"
    else:
        unlabelled = re.search(r'\b(Male|Female)\b', text, re.IGNORECASE)
        if unlabelled:
            sex = unlabelled.group(1).capitalize()

    age_match = re.search(r'\bAge\s*[:=]?\s*(\d{1,3})\b', text, re.IGNORECASE)
    age = age_match.group(1) if age_match else "Not Found"

    return {"Sex": sex, "Age": age}

# Extract contact information (Phone and Email)
def extract_contact_info(text):
    """Return the first plausible phone number and e-mail address in *text*.

    Phone candidates are runs of digits, hyphens and same-line whitespace
    (optionally starting with "+"). A candidate is accepted only if it holds
    9 to 15 digits, so year ranges such as "2015 - 2019" are not reported as
    phone numbers. Matches never span a line break.

    The e-mail pattern ends the domain on a word character, so a sentence
    period after the address is not captured.

    Returns:
        dict with keys "Phone" and "Email"; each is the matched text or
        "Not Found".
    """
    # [^\S\r\n] is "whitespace except line breaks".
    candidates = re.findall(r'\+?\d(?:[\d\-]|[^\S\r\n]){8,}\d', text)
    phone = next(
        (
            candidate
            for candidate in candidates
            if 9 <= sum(ch.isdigit() for ch in candidate) <= 15
        ),
        None,
    )

    email_match = re.search(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)*', text)

    return {
        "Phone": phone if phone else "Not Found",
        "Email": email_match.group(0) if email_match else "Not Found"
    }

# Extract work experience (years)
def extract_experience(text):
    """Return the largest number of years of experience mentioned in *text*.

    Recognises "3 years", "3 yrs", "1 year" and the common "5+ years" form.
    Phrases of the form "30 years old" are ignored because they state an age,
    not experience.

    Returns:
        dict with the single key "Experience (years)" whose value is the
        largest match as an int, or "Not Mentioned" when nothing (or only 0)
        is found.
    """
    mentions = re.findall(
        r'(\d+)\s*\+?\s*(?:years?|yrs?)\b(?!\s+old\b)', text, re.IGNORECASE
    )
    years = max(map(int, mentions), default=0)
    return {"Experience (years)": years if years else "Not Mentioned"}

# Pull a CGPA value out of the text (e.g. "CGPA=3.3", "CGPA: 3.3", "CGPA 3.3")
def extract_cgpa(text):
    """Return the CGPA found in *text* as a string, or "Not Found".

    Only values written as one digit from 0 to 4, a decimal point and one or
    two decimals are recognised; the label match is case-insensitive.
    """
    cgpa_pattern = r'\bCGPA\s*[:=]?\s*([0-4]\.\d{1,2})'
    found = re.search(cgpa_pattern, text, re.IGNORECASE)
    if found is None:
        return "Not Found"
    return found.group(1)

# Extract languages (Supports Languages: Oromic, English, etc.)
def extract_languages(text):
    """Return the spoken languages listed after a "Language(s)" label.

    The list is read from a single line: the rest of the label's line, or the
    next line when the label stands alone as a heading. The capture stops at
    the line break, so following sections do not leak into the result. Each
    comma-separated entry is trimmed and capitalised.

    Returns:
        list of language names, or ["Not Found"] when no label or no entries
        are found.
    """
    # The capture class allows only letters, commas, spaces and tabs - no
    # newline - while the \s* before it may still step over one line break.
    match = re.search(r'\bLanguages?\s*[:=]?\s*([a-zA-Z, \t]+)', text, re.IGNORECASE)
    if not match:
        return ["Not Found"]

    languages = [
        entry.strip().capitalize()
        for entry in match.group(1).split(',')
        if entry.strip()
    ]
    return languages or ["Not Found"]

# Main information extractor (Combines all info)
def extract_information(text):
    """Build the summary dictionary shown to the user from resume *text*.

    Name: the first PERSON entity reported by the spaCy pipeline; otherwise
    the first line of the text when it has two to four words.

    Skills: every entry of SKILL_KEYWORDS that occurs in the text as a whole
    term (case-sensitive), listed once each in order of first appearance.
    Searching the raw text lets multi-word keywords such as
    "Machine Learning" match, which a per-token lookup cannot do.

    Education: the first line, in document order, containing any entry of
    EDUCATION_KEYWORDS.

    The remaining fields come from the dedicated extract_* helpers.

    Returns:
        dict with keys "Name", "Sex", "Age", "Email", "Phone", "Skills",
        "Education", "Experience (years)", "CGPA" and "Languages". "Skills" is
        a list when any were found, otherwise the string "Not Mentioned".
    """
    doc = nlp(text)

    name = next((ent.text for ent in doc.ents if ent.label_ == "PERSON"), None)
    if not name:
        first_line = text.strip().split("\n")[0]
        if 1 < len(first_line.split()) <= 4:
            name = first_line.strip()

    # (?<!\w) / (?!\w) act as word boundaries that also work for keywords
    # ending in punctuation ("C++"), and keep "Java" out of "JavaScript".
    skill_hits = []
    for keyword in SKILL_KEYWORDS:
        found = re.search(rf'(?<!\w){re.escape(keyword)}(?!\w)', text)
        if found:
            skill_hits.append((found.start(), keyword))
    skills = [keyword for _, keyword in sorted(skill_hits)]

    # Take the first matching line directly; going through a set would make
    # the chosen line depend on hash ordering.
    education = next(
        (
            line.strip()
            for line in text.split('\n')
            if any(keyword in line for keyword in EDUCATION_KEYWORDS)
        ),
        None,
    )

    demographics = extract_demographics(text)
    contact = extract_contact_info(text)
    experience = extract_experience(text)
    cgpa = extract_cgpa(text)
    languages = extract_languages(text)

    return {
        "Name": name or "Not Found",
        "Sex": demographics["Sex"],
        "Age": demographics["Age"],
        "Email": contact["Email"],
        "Phone": contact["Phone"],
        "Skills": skills or "Not Mentioned",
        "Education": education or "Not Mentioned",
        "Experience (years)": experience["Experience (years)"],
        "CGPA": cgpa,
        "Languages": ", ".join(languages)
    }

# Flask route to handle file upload and extract information
@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the upload page and, on POST, the information extracted from the resume.

    A GET request (or a POST without a "resume" file) renders index.html with
    info=None. A POST with a file saves the upload, extracts its text and
    passes the resulting dictionary to the template as `info`.
    """
    info = None
    if request.method == 'POST':
        upload = request.files.get('resume')
        if upload:
            # Each request gets its own temporary directory, so simultaneous
            # uploads cannot overwrite one another and the file is removed
            # as soon as its text has been read.
            with tempfile.TemporaryDirectory() as upload_dir:
                pdf_path = os.path.join(upload_dir, "uploaded_resume.pdf")
                upload.save(pdf_path)
                text = extract_text_from_pdf(pdf_path)
            info = extract_information(text)
    return render_template('index.html', info=info)

# Entry point for the server thread
def run_app():
    """Start the Flask development server with debug mode on and the reloader off."""
    # NOTE(review): the reloader is presumably disabled because this function
    # is started from a background Thread rather than the main thread — confirm.
    app.run(use_reloader=False, debug=True)

if __name__ == '__main__':
    # Serve from a separate thread so the code that launches it (here, a
    # notebook cell) is not blocked by the server loop.
    server_thread = Thread(target=run_app)
    server_thread.start()


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [23/Apr/2025 00:07:32] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [23/Apr/2025 00:07:32] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [23/Apr/2025 00:08:18] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [23/Apr/2025 00:28:23] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [23/Apr/2025 00:28:51] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [23/Apr/2025 00:31:06] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [23/Apr/2025 00:36:11] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [23/Apr/2025 00:37:40] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [23/Apr/2025 00:37:55] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [23/Apr/2025 00:41:10] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [23/Apr/2025 00:41:10] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [23/Apr/2025 00:42:48] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [23/Apr/2025 00:43:12] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [23/Apr/2025 00:44:51] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [23/Apr/2025 00:46:14] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [23/Ap