In [1]:
!pip install spacy PyPDF2 python-docx textract
!python -m spacy download en_core_web_sm

Collecting python-docx
  Using cached python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting textract
  Using cached textract-1.6.5-py3-none-any.whl.metadata (2.5 kB)
Requested textract from https://files.pythonhosted.org/packages/6b/3e/ac16b6bf28edf78296aea7d0cb416b49ed30282ac8c711662541015ee6f3/textract-1.6.5-py3-none-any.whl has invalid metadata: .* suffix can only be used with `==` or `!=` operators
    extract-msg (<=0.29.*)
                 ~~~~~~~^
Please use pip<24.1 if you need to use this version.[0m[33m
[0m  Using cached textract-1.6.4.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[3 lines of output][0m
  [31m   [0m error in textract setup command: 'install_requires' must be a string or list of strings containing valid project/version requirement specifiers

In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
!pip install PyPDF2 ipywidgets 



In [4]:
#all imports 
import re
import spacy
import os
from PyPDF2 import PdfReader
import ipywidgets as widgets
from IPython.display import display
import tempfile
import os


try:
    import docx
except ImportError:
    docx = None

try:
    import textract
except ImportError:
    textract = None

nlp = spacy.load("en_core_web_sm")

In [5]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, "rb") as f:
        reader = PdfReader(f)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text.lower()

def extract_text_from_docx(docx_path):
    if docx is None:
        raise ImportError("python-docx not installed. Install with `pip install python-docx`")
    doc = docx.Document(docx_path)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    return "\n".join(full_text).lower()

def extract_text_from_doc(doc_path):
    if textract is None:
        raise ImportError("textract not installed. Install with `pip install textract`")
    text = textract.process(doc_path).decode('utf-8')
    return text.lower()

def extract_text_from_file(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        return extract_text_from_pdf(file_path)
    elif ext == ".docx":
        return extract_text_from_docx(file_path)
    elif ext == ".doc":
        return extract_text_from_doc(file_path)
    else:
        raise ValueError(f"Unsupported file extension: {ext}. Supported: pdf, docx, doc")


### Dynamic skill extraction from JD text

In [6]:
def extract_skills_from_jd(jd_text):
    doc = nlp(jd_text)

    candidates = set()
    for chunk in doc.noun_chunks:
        chunk_text = chunk.text.lower().strip()
        # Filter unwanted tokens early
        if (len(chunk_text) > 2 and
            not any(word in chunk_text for word in ["team", "role", "experience", "knowledge", "security engineer", "years", "year", "remote"]) and
            not chunk_text.startswith(("*", "#")) and
            not re.search(r'\d', chunk_text) and
            chunk_text.isalpha() or ' ' in chunk_text):
            candidates.add(chunk_text)

    for ent in doc.ents:
        ent_text = ent.text.lower().strip()
        if (len(ent_text) > 2 and
            not ent_text.startswith(("*", "#")) and
            not re.search(r'\d', ent_text) and
            ent_text.isalpha() or ' ' in ent_text):
            candidates.add(ent_text)

    proficiency_matches = re.findall(r'(?:proficiency|experience|knowledge) (?:in|with) ([\w\s,/.-]+)', jd_text.lower())
    for match in proficiency_matches:
        for skill in re.split(r'[,\s/.-]+', match):
            skill = skill.strip()
            if (skill and len(skill) > 2 and
                not skill.startswith(("*", "#")) and
                not re.search(r'\d', skill) and
                (skill.isalpha() or ' ' in skill)):
                candidates.add(skill)

    stopwords = set([
        "team", "role", "experience", "knowledge", "security", "application",
        "system", "systems", "product", "products", "and", "or", "the", "in",
        "on", "with", "for", "of", "to", "be", "have"
    ])

    filtered_skills = sorted([c for c in candidates if c not in stopwords])

    return filtered_skills


### Extract criteria (skills, education, experience) from JD

In [7]:
def extract_criteria_from_jd(jd_text):
    jd_lower = jd_text.lower()

    education_keywords = ["bachelor", "b.s.", "bsc", "computer science", "information security"]
    education_required = [e for e in education_keywords if e in jd_lower]

    exp_match = re.findall(r'(\d+)\s*-\s*(\d+)\s+years', jd_lower)
    if exp_match:
        min_exp = int(exp_match[0][0])
        max_exp = int(exp_match[0][1])
    else:
        single_exp = re.findall(r'(\d+)\s+years', jd_lower)
        min_exp = int(single_exp[0]) if single_exp else 0
        max_exp = 50

    required_skills = extract_skills_from_jd(jd_text)

    print("\n JD Criteria Extracted:")
    print(f"- Skills (top 15): {required_skills[:15]}")
    print(f"- Education: {education_required}")
    print(f"- Experience: {min_exp}–{max_exp} years\n")

    return {
        "required_skills": required_skills,
        "education_required": education_required,
        "min_years_experience": min_exp,
        "max_years_experience": max_exp
    }


### Evaluate the CV

In [8]:
def evaluate_cv(cv_text, criteria):
    exp_years = extract_experience_years(cv_text)

    education_matched = any(edu in cv_text for edu in criteria["education_required"])

    matched_skills = [skill for skill in criteria["required_skills"] if skill in cv_text]

    skill_score = len(matched_skills) / max(len(criteria["required_skills"]), 1)
    exp_score = 0
    if exp_years >= criteria["min_years_experience"]:
        exp_score = 1 if exp_years <= criteria["max_years_experience"] else 0.8

    education_score = 1 if education_matched else 0

    final_score = (skill_score * 0.6) + (exp_score * 0.3) + (education_score * 0.1)

    print(" CV Evaluation Results:")
    print(f"Skills matched: {matched_skills}")
    print(f"Skill score: {skill_score:.2f}")
    print(f"Experience (years): {exp_years}")
    print(f"Experience score: {exp_score:.2f}")
    print(f"Education matched: {education_matched}")
    print(f"Education score: {education_score}")
    print(f"Final Score (%): {final_score * 100:.1f}")

    return final_score


### Main interactive flow to input JD and CV file path, then run evaluation

In [10]:
import re

def extract_experience_years(text):
    """
    Extracts the maximum years of experience mentioned in the CV text.
    Handles patterns like:
    - '10+ years'
    - 'over 10 years'
    - '5 to 10 years'
    - '7 yrs'
    - '3 years of experience'
    """
    patterns = [
        r'(\d+)\s*\+\s*years?',            # 10+ years
        r'over\s+(\d+)\s*years?',          # over 10 years
        r'(\d+)\s*[-to]+\s*(\d+)\s*years?',# 5-10 years or 5 to 10 years
        r'(\d+)\s*yrs?\.?',                # 7 yrs, 7 yr, 7 yrs.
        r'(\d+)\s*years?',                 # 3 years, 4 year
    ]

    text = text.lower()
    years_found = []

    for pattern in patterns:
        matches = re.findall(pattern, text)
        for match in matches:
            if isinstance(match, tuple):
                # For range matches like 5-10 years
                nums = [int(num) for num in match if num.isdigit()]
                if nums:
                    years_found.append(max(nums))  # Use max of range
            else:
                if match.isdigit():
                    years_found.append(int(match))

    return max(years_found) if years_found else 0


print("Paste the Job Description text (finish input with empty line):")
jd_lines = []
while True:
    line = input()
    if line.strip() == "":
        break
    jd_lines.append(line)
jd_text = "\n".join(jd_lines)

criteria = extract_criteria_from_jd(jd_text)

print("\nPlease upload the CV file (pdf, docx, doc):")

upload_widget = widgets.FileUpload(accept='.pdf,.docx,.doc', multiple=False)
process_button = widgets.Button(description="Process CV")
output = widgets.Output()

display(upload_widget, process_button, output)

def on_button_clicked(b):
    with output:
        output.clear_output()
        if len(upload_widget.value) == 0:
            print("No file uploaded yet. Please upload a CV file first.")
            return

        try:
            # Newer ipywidgets: dict with metadata key
            uploaded_file = list(upload_widget.value.values())[0]
            filename = uploaded_file.get('metadata', {}).get('name')
            content = uploaded_file['content']
            if filename is None:
                # fallback if metadata missing
                filename = uploaded_file.get('name', 'uploaded_file')
        except Exception:
            # Older ipywidgets: might be a tuple/list of dicts
            uploaded_file = upload_widget.value[0]
            filename = uploaded_file.get('name', 'uploaded_file')
            content = uploaded_file['content']

        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as tmp_file:
            tmp_file.write(content)
            tmp_file_path = tmp_file.name

        print(f"\nExtracting text from uploaded file: {filename} ...")
        try:
            cv_text = extract_text_from_file(tmp_file_path)
        except Exception as e:
            print(f"Error extracting CV text: {e}")
            return

        evaluate_cv(cv_text, criteria)


process_button.on_click(on_button_clicked)


Paste the Job Description text (finish input with empty line):


 Job description • Lead and manage the end-to-end functioning of the SOC team, ensuring 24x7 coverage. • Oversee daily operations, incident response quality, and compliance with SLAs.Work closely with the SOC Manager to prepare periodic threat reports, executive dashboards, and compliance status updates. • Develop and maintain SOC playbooks, SOPs, and reporting dashboards. • Conduct regular review meetings, threat landscape briefings, and knowledge sharing sessions. • Act as point of contact for critical escalations, audits, and client reporting. • Coordinate with OEMs, security architects, and client IT teams for policy updates and improvements. • Train and mentor L1 and L2 SOC Analysts to build internal capabilities. • Ensure policies for endpoints, DLP, firewall rules, and SOAR playbooks are appropriately enabled and updated as per approved change management. • Assist in root cause analysis (RCA) for incidents and recommend corrective actions. • Document incidents, resolutions, and 


✅ JD Criteria Extracted:
- Skills (top 15): ['24x7 coverage', '7–10 years', 'a lead/managerial role', 'approved change management', 'assist', 'at least 2–3 years', 'audits', 'ccsp', 'cism', 'cissp', 'client', 'client reporting', 'communication', 'compliance', 'compliance frameworks']
- Education: ['bachelor', 'computer science']
- Experience: 10–50 years


Please upload the CV file (pdf, docx, doc):


FileUpload(value=(), accept='.pdf,.docx,.doc', description='Upload')

Button(description='Process CV', style=ButtonStyle())

Output()