In [2]:
import sys
import os
from pathlib import Path

# Add the parent directory to the Python path so we can import from parsers
# Get current working directory and go up one level to reach project root
current_dir = Path.cwd()
if current_dir.name == 'notebooks':
    parent_dir = current_dir.parent
else:
    parent_dir = current_dir

sys.path.insert(0, str(parent_dir))
print(f"Added to Python path: {parent_dir}")
print(f"Looking for parsers module in: {parent_dir / 'parsers'}")
print(f"Parsers directory exists: {(parent_dir / 'parsers').exists()}")

import json
import openai
import config
from parsers.pdf_parser import pdf_to_text
from parsers.base_parser import ResumeParser
from parsers.gpt_parser import GPTResumeParser

# Prefer config-driven key; fallback to openai.api_key if set elsewhere
if config.OPENAI_API_KEY:
    openai.api_key = config.OPENAI_API_KEY

resumes_dir = parent_dir / "data" / "resumes"
job_desc_path = parent_dir / "data" / "job_description.json"

# Collect PDF texts once
pdf_texts = []
for pdf_file in sorted(resumes_dir.glob("*.pdf")):
    resume_text = pdf_to_text(str(pdf_file))
    if resume_text:
        pdf_texts.append({"resume_id": pdf_file.stem, "text": resume_text})

with open(job_desc_path, "r") as f:
    job_data = json.load(f)

# --- Always run Base Parser ---
base_parser = ResumeParser()
parsed_resumes_base = []
for item in pdf_texts:
    parsed = base_parser.parse(item["text"])
    parsed["resume_id"] = item["resume_id"]
    parsed_resumes_base.append(parsed)

print(f"Base parser parsed {len(parsed_resumes_base)} resumes")
print(f"Job Title: {job_data['title']}")
print(f"Required Skills: {job_data['required_skills']}")
print("\nSample Parsed Resume (Base):")
if parsed_resumes_base:
    print(json.dumps(parsed_resumes_base[0], indent=2))

# Save Base parsed resumes
base_output = {"resumes": parsed_resumes_base, "job_description": job_data}
base_path = parent_dir / "data" / "parsed_resumes.json"
with open(base_path, "w") as f:
    json.dump(base_output, f, indent=2)
print(f"\n✅ Base parsed resumes saved to {base_path}")

# --- Optionally run GPT Parser ---
if (openai.api_key or "").strip():
    print("\n🔑 OpenAI API key detected — also running GPT parser.")
    gpt_parser = GPTResumeParser(api_key=openai.api_key)
    parsed_resumes_gpt = []
    for item in pdf_texts:
        try:
            parsed_gpt = gpt_parser.parse_resume(item["text"]) or {}
        except Exception as e:
            parsed_gpt = {"error": str(e)}
        parsed_gpt["resume_id"] = item["resume_id"]
        parsed_resumes_gpt.append(parsed_gpt)

    gpt_output = {"resumes": parsed_resumes_gpt, "job_description": job_data}
    gpt_path = parent_dir / "data" / "parsed_resumes_gpt.json"
    with open(gpt_path, "w") as f:
        json.dump(gpt_output, f, indent=2)
    print(f"✅ GPT parsed resumes saved to {gpt_path}")
else:
    print("\nℹ️ No OpenAI API key found — GPT parser skipped.")


Added to Python path: c:\Users\wangk\OneDrive\Desktop\resume_matcher
Looking for parsers module in: c:\Users\wangk\OneDrive\Desktop\resume_matcher\parsers
Parsers directory exists: True




Base parser parsed 47 resumes
Job Title: Software Engineer - Backend
Required Skills: ['Python', 'Java', 'SQL', 'REST API Development', 'AWS', 'Docker', 'Git']

Sample Parsed Resume (Base):
{
  "contact": {
    "name": "Wendy Bailey",
    "email": "w.bailey@email.com",
    "phone": "(123) 456-7890",
    "linkedin": null,
    "github": null,
    "location": "Philadelphia, PA",
    "website": null
  },
  "summary": null,
  "skills": {
    "all": [
      "Microservices",
      "SQL",
      "AWS",
      "Python",
      "Management",
      "PostgreSQL",
      "Angular",
      "Kubernetes",
      "Team"
    ],
    "technical": [
      "Microservices",
      "SQL",
      "AWS",
      "Python",
      "PostgreSQL",
      "Angular",
      "Kubernetes"
    ],
    "soft": [
      "Management",
      "Team"
    ],
    "languages": [
      "Python",
      "SQL"
    ],
    "tools": [
      "Kubernetes",
      "Angular",
      "AWS"
    ],
    "databases": [
      "PostgreSQL"
    ]
  },
  "experience

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"

✅ GPT parsed resumes saved to c:\Users\wangk\OneDrive\Desktop\resume_matcher\data\parsed_resumes_gpt.json


In [3]:
# Resume Scoring and Matching
import sys
import os
from pathlib import Path

# Add the parent directory to the Python path (in case this cell runs independently)
current_dir = Path.cwd()
if current_dir.name == 'notebooks':
    parent_dir = current_dir.parent
else:
    parent_dir = current_dir

if str(parent_dir) not in sys.path:
    sys.path.insert(0, str(parent_dir))

import json
from models.semantic_matcher import SemanticMatcher
from scoring.scorer import ResumeScorer
from scoring.rule_based import RuleBasedScorer

# === Load job description ===
with open(parent_dir / "data" / "job_description.json", "r") as f:
    job_data = json.load(f)

# === Initialize Scorers ===
rule_scorer = RuleBasedScorer()
matcher = SemanticMatcher()
scorer = ResumeScorer(suitable_threshold=0.7, maybe_threshold=0.5)


def score_resumes(resumes_list, job):
    results = []
    for resume in resumes_list:
        rule_score, breakdown, matched, missing = rule_scorer.score(resume, job)
        semantic_score = matcher.compute_composite_score(job, resume)
        final_score = round(0.6 * rule_score + 0.4 * semantic_score, 2)
        suitability = scorer.classify(final_score / 100)  # expects normalized 0–1
        results.append({
            "resume_id": resume.get("resume_id"),
            "name": resume.get("name") or resume.get("contact", {}).get("name"),
            "rule_score": rule_score,
            "semantic_score": round(semantic_score, 2),
            "final_score": final_score,
            "suitability": suitability,
            "breakdown": breakdown,
            "matched_skills": matched,
            "missing_skills": missing
        })
    results.sort(key=lambda x: x["final_score"], reverse=True)
    return results

# === Score Base parsed resumes ===
base_path = parent_dir / "data" / "parsed_resumes.json"
with open(base_path, "r") as f:
    base_data = json.load(f)
base_resumes = base_data.get("resumes", [])

scored_resumes = score_resumes(base_resumes, job_data)
base_out_path = parent_dir / "data" / "final_scored_resumes.json"
with open(base_out_path, "w") as f:
    json.dump({
        "job_title": job_data.get("title"),
        "total_candidates": len(scored_resumes),
        "scored_resumes": scored_resumes
    }, f, indent=2)
print(f"✅ Final scored resumes (Base) saved to {base_out_path}")
if scored_resumes:
    print("\nTop candidate (Base):")
    print(json.dumps(scored_resumes[0], indent=2))

# === If GPT parsed exists, score GPT as well ===
gpt_path = parent_dir / "data" / "parsed_resumes_gpt.json"
if gpt_path.exists():
    with open(gpt_path, "r") as f:
        gpt_data = json.load(f)
    gpt_resumes = gpt_data.get("resumes", [])
    scored_resumes_gpt = score_resumes(gpt_resumes, job_data)
    gpt_out_path = parent_dir / "data" / "final_scored_resumes_gpt.json"
    with open(gpt_out_path, "w") as f:
        json.dump({
            "job_title": job_data.get("title"),
            "total_candidates": len(scored_resumes_gpt),
            "scored_resumes": scored_resumes_gpt
        }, f, indent=2)
    print(f"✅ Final scored resumes (GPT) saved to {gpt_out_path}")
    if scored_resumes_gpt:
        print("\nTop candidate (GPT):")
        print(json.dumps(scored_resumes_gpt[0], indent=2))
else:
    print("ℹ️ No GPT parsed resumes found — skipping GPT scoring.")


  from .autonotebook import tqdm as notebook_tqdm
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.45it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.93it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.62it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 36.94it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 55.30it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 41.85it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 50.05it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 38.51it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 53.11it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 50.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.39it/s]
Batches: 100%|██████████| 1/1 [00:00

✅ Final scored resumes (Base) saved to c:\Users\wangk\OneDrive\Desktop\resume_matcher\data\final_scored_resumes.json

Top candidate (Base):
{
  "resume_id": "resume_v4.21",
  "name": "BOYANG LIU",
  "rule_score": 100,
  "semantic_score": 59.33,
  "final_score": 83.73,
  "suitability": "Suitable",
  "breakdown": {
    "skills_score": 100,
    "education_score": 0,
    "experience_score": 0,
    "projects_score": 0,
    "company_score": 0
  },
  "matched_skills": [
    "docker",
    "git",
    "java",
    "rest api",
    "sql",
    "aws",
    "rest api development",
    "python",
    "amazon web services",
    "rest"
  ],
  "missing_skills": [
    "api"
  ]
}


Batches: 100%|██████████| 1/1 [00:00<00:00, 41.68it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.56it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.43it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 59.78it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 55.48it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.12it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 56.14it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 94.40it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 56.61it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 34.83it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 38.13it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 59.91it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 51.13it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.90it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 55.59it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 50.88it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 69.61it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.45it/s]
Batches: 1

✅ Final scored resumes (GPT) saved to c:\Users\wangk\OneDrive\Desktop\resume_matcher\data\final_scored_resumes_gpt.json

Top candidate (GPT):
{
  "resume_id": "python-developer-resume-example",
  "name": "Giulia Gonzalez",
  "rule_score": 100,
  "semantic_score": 44.5,
  "final_score": 77.8,
  "suitability": "Suitable",
  "breakdown": {
    "skills_score": 100,
    "education_score": 0,
    "experience_score": 0,
    "projects_score": 0,
    "company_score": 0
  },
  "matched_skills": [
    "git",
    "java",
    "api",
    "rest api",
    "sql",
    "aws",
    "rest api development",
    "python",
    "amazon web services",
    "rest"
  ],
  "missing_skills": [
    "docker"
  ]
}





In [None]:
# Visualization: Top 10 candidates (Base)
import json
from pathlib import Path
import pandas as pd
import plotly.express as px

# Resolve project root
current_dir = Path.cwd()
parent_dir = current_dir.parent if current_dir.name == 'notebooks' else current_dir

base_out_path = parent_dir / "data" / "final_scored_resumes.json"
if not base_out_path.exists():
    print(f"Base results not found at {base_out_path}. Run the scoring cell first.")
else:
    with open(base_out_path, "r") as f:
        base_results = json.load(f)

    candidates = base_results.get("scored_resumes", [])
    topk = candidates[:10]

    # Build DataFrame for display
    def join_list(lst, max_items=10):
        if not isinstance(lst, list):
            return ""
        s = ", ".join(lst[:max_items])
        if len(lst) > max_items:
            s += ", …"
        return s

    rows = []
    for c in topk:
        rows.append({
            "resume_id": c.get("resume_id"),
            "name": c.get("name"),
            "final_score": c.get("final_score"),
            "rule_score": c.get("rule_score"),
            "semantic_score": c.get("semantic_score"),
            "suitability": c.get("suitability"),
            "matched_count": len(c.get("matched_skills", [])),
            "missing_count": len(c.get("missing_skills", [])),
            "matched_skills": join_list(c.get("matched_skills", []), max_items=20),
            "missing_skills": join_list(c.get("missing_skills", []), max_items=20),
        })

    df_topk_base = pd.DataFrame(rows)
    print("Top 10 candidates (Base):")
    display(df_topk_base)

    # Bar chart of Final Scores
    fig = px.bar(df_topk_base, x="name", y="final_score", color="suitability",
                 title="Top 10 Final Scores (Base)", text="final_score")
    fig.update_traces(texttemplate='%{text}', textposition='outside')
    fig.update_layout(yaxis_title='Final Score', xaxis_title='Candidate', uniformtext_minsize=8, uniformtext_mode='hide')
    fig.show()



Top 5 candidates (Base):


Unnamed: 0,resume_id,name,final_score,rule_score,semantic_score,suitability,matched_count,missing_count,matched_skills,missing_skills
0,resume_v4.21,BOYANG LIU,83.73,100,59.33,Suitable,10,1,"docker, git, java, rest api, sql, aws, rest ap...",api
1,Astrid_resume,Astrid Gao,82.0,100,55.0,Suitable,9,2,"git, java, rest api, sql, aws, rest api develo...","docker, api"
2,Lin Yang CV,Lin Yang,80.8,99,53.5,Suitable,9,2,"git, java, rest api, sql, aws, rest api develo...","docker, api"
3,python-developer-resume-example,GIULIA WORK EXPERIENCE,80.23,100,50.58,Suitable,9,2,"git, java, rest api, sql, aws, rest api develo...","docker, api"
4,java-software-engineer-resume-example,YVONNE GREEN WORK EXPERIENCE,79.45,100,48.62,Suitable,9,2,"docker, git, java, rest api, sql, aws, rest ap...","api, python"


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# Visualization: Top 10 candidates (GPT)
import json
from pathlib import Path
import pandas as pd
import plotly.express as px

# Resolve project root
current_dir = Path.cwd()
parent_dir = current_dir.parent if current_dir.name == 'notebooks' else current_dir

gpt_out_path = parent_dir / "data" / "final_scored_resumes_gpt.json"
if not gpt_out_path.exists():
    print(f"GPT results not found at {gpt_out_path}. Run the GPT scoring or provide an API key.")
else:
    with open(gpt_out_path, "r") as f:
        gpt_results = json.load(f)

    candidates = gpt_results.get("scored_resumes", [])
    topk = candidates[:10]

    def join_list(lst, max_items=10):
        if not isinstance(lst, list):
            return ""
        s = ", ".join(lst[:max_items])
        if len(lst) > max_items:
            s += ", …"
        return s

    rows = []
    for c in topk:
        rows.append({
            "resume_id": c.get("resume_id"),
            "name": c.get("name"),
            "final_score": c.get("final_score"),
            "rule_score": c.get("rule_score"),
            "semantic_score": c.get("semantic_score"),
            "suitability": c.get("suitability"),
            "matched_count": len(c.get("matched_skills", [])),
            "missing_count": len(c.get("missing_skills", [])),
            "matched_skills": join_list(c.get("matched_skills", []), max_items=20),
            "missing_skills": join_list(c.get("missing_skills", []), max_items=20),
        })

    df_topk_gpt = pd.DataFrame(rows)
    print("Top 10 candidates (GPT):")
    display(df_topk_gpt)

    fig = px.bar(df_topk_gpt, x="name", y="final_score", color="suitability",
                 title="Top 10 Final Scores (GPT)", text="final_score")
    fig.update_traces(texttemplate='%{text}', textposition='outside')
    fig.update_layout(yaxis_title='Final Score', xaxis_title='Candidate', uniformtext_minsize=8, uniformtext_mode='hide')
    fig.show()


Top 5 candidates (GPT):


Unnamed: 0,resume_id,name,final_score,rule_score,semantic_score,suitability,matched_count,missing_count,matched_skills,missing_skills
0,python-developer-resume-example,Giulia Gonzalez,77.8,100,44.5,Suitable,10,1,"git, java, api, rest api, sql, aws, rest api d...",docker
1,Astrid_resume,Astrid Gao,67.19,81,46.47,Maybe Suitable,7,4,"git, java, sql, aws, rest api development, pyt...","rest api, docker, api, rest"
2,YaoyaoWang_Resume_NEU,Yaoyao (Renee) Wang,63.61,74,48.03,Maybe Suitable,6,5,"git, java, sql, rest api development, python, ...","docker, api, aws, rest api, rest"
3,Resume_Hao_Yang,Hao Yang,62.4,70,51.0,Maybe Suitable,7,4,"docker, git, java, sql, aws, python, amazon we...","rest api, api, rest api development, rest"
4,KW_Resume,Kun Wang,62.23,70,50.59,Maybe Suitable,7,4,"docker, git, java, sql, aws, python, amazon we...","rest api, api, rest api development, rest"


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [7]:
# Save Top 10 (Base) to CSV
# Run this cell to export the Top 10 base candidates table to CSV
from pathlib import Path
import json
import pandas as pd

current_dir = Path.cwd()
parent_dir = current_dir.parent if current_dir.name == 'notebooks' else current_dir
base_out_path = parent_dir / "data" / "final_scored_resumes.json"

if not base_out_path.exists():
    print(f"Base results not found at {base_out_path}. Run the scoring and visualization cells first.")
else:
    with open(base_out_path, "r") as f:
        base_results = json.load(f)
    candidates = base_results.get("scored_resumes", [])[:10]

    def join_list(lst, max_items=100):
        if not isinstance(lst, list):
            return ""
        return ", ".join(lst[:max_items])

    rows = []
    for c in candidates:
        rows.append({
            "resume_id": c.get("resume_id"),
            "name": c.get("name"),
            "final_score": c.get("final_score"),
            "rule_score": c.get("rule_score"),
            "semantic_score": c.get("semantic_score"),
            "suitability": c.get("suitability"),
            "matched_skills": join_list(c.get("matched_skills", [])),
            "missing_skills": join_list(c.get("missing_skills", [])),
        })

    df = pd.DataFrame(rows)
    csv_path = parent_dir / "data" / "top10_base_candidates.csv"
    df.to_csv(csv_path, index=False)
    print(f"✅ Saved Top 10 Base candidates to {csv_path}")


✅ Saved Top 10 Base candidates to c:\Users\wangk\OneDrive\Desktop\resume_matcher\data\top10_base_candidates.csv


In [8]:
# Save Top 10 (GPT) to CSV
# Run this cell to export the Top 10 GPT candidates table to CSV
from pathlib import Path
import json
import pandas as pd

current_dir = Path.cwd()
parent_dir = current_dir.parent if current_dir.name == 'notebooks' else current_dir
gpt_out_path = parent_dir / "data" / "final_scored_resumes_gpt.json"

if not gpt_out_path.exists():
    print(f"GPT results not found at {gpt_out_path}. Run the GPT scoring and visualization cells first.")
else:
    with open(gpt_out_path, "r") as f:
        gpt_results = json.load(f)
    candidates = gpt_results.get("scored_resumes", [])[:10]

    def join_list(lst, max_items=100):
        if not isinstance(lst, list):
            return ""
        return ", ".join(lst[:max_items])

    rows = []
    for c in candidates:
        rows.append({
            "resume_id": c.get("resume_id"),
            "name": c.get("name"),
            "final_score": c.get("final_score"),
            "rule_score": c.get("rule_score"),
            "semantic_score": c.get("semantic_score"),
            "suitability": c.get("suitability"),
            "matched_skills": join_list(c.get("matched_skills", [])),
            "missing_skills": join_list(c.get("missing_skills", [])),
        })

    df = pd.DataFrame(rows)
    csv_path = parent_dir / "data" / "top10_gpt_candidates.csv"
    df.to_csv(csv_path, index=False)
    print(f"✅ Saved Top 10 GPT candidates to {csv_path}")


✅ Saved Top 10 GPT candidates to c:\Users\wangk\OneDrive\Desktop\resume_matcher\data\top10_gpt_candidates.csv
