#### Google API Key Setup

In [1]:
import os

from utils import get_openai_api_key

OPENAI_API_KEY = get_openai_api_key()

# SECURITY: never hardcode API keys in a notebook -- they end up in version
# control and in every shared copy. The key is read from the environment
# instead. Any key that was previously committed here must be treated as
# leaked and revoked/rotated in the Google console.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in the environment before "
        "starting the notebook kernel."
    )

# LLM configuration handed to the agents (Gemini via the Google API).
llm_config = {"model": "gemini-2.5-flash", "api_key": GOOGLE_API_KEY, "api_type": "google"}

#### Import Libraries and Configuration

In [2]:
import json
import os
import pickle
import re
import threading
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

from docx import Document
from PyPDF2 import PdfReader


RESUME_FOLDER = "Resumes"          # folder scanned for .pdf / .docx resumes
CACHE_FILE = "skills_cache.pkl"    # pickle file that stores extracted skills between runs
MAX_WORKERS = 3                    # number of threads used to process resumes in parallel

#### Functions to extract skills

In [3]:
# Cell B: helpers (prefilter, regex-first, IO) + cache
def read_pdf(path):
    """Return the text of every page of the PDF at `path`, one page per line block.

    Pages for which no text can be extracted contribute an empty line.
    Leading and trailing whitespace of the combined text is removed.
    """
    reader = PdfReader(path)
    page_chunks = [(page.extract_text() or "") + "\n" for page in reader.pages]
    return "".join(page_chunks).strip()

def read_docx(path):
    """Return the paragraphs of the .docx file at `path`, newline-separated and stripped."""
    paragraph_texts = [para.text for para in Document(path).paragraphs]
    combined = "\n".join(paragraph_texts)
    return combined.strip()

def prefilter_text(text, chars=1500):
    """
    Keep only lines that likely contain skills/experience.
    This drastically reduces tokens sent to the LLM.

    Every line mentioning a section keyword (skills, experience,
    technology/technologies/..., projects, expertise) is kept together with
    one line of context before and after it. The lines of each window are
    joined with spaces, the windows are joined with newlines, and the result
    is truncated to `chars` characters.

    If no line matches, the first `chars` characters of `text` are returned.
    """
    # "technolog" is a stem: it needs \w* after it, otherwise the trailing \b
    # can never match inside "technology" / "technologies".
    keyword_re = re.compile(
        r"(?i)\b(skills|experience|technolog\w*|projects|expertise)\b"
    )
    lines = text.splitlines()
    candidates = []
    for i, line in enumerate(lines):
        if keyword_re.search(line):
            start = max(0, i-1)
            end = min(len(lines), i+2)
            candidates.append(" ".join(lines[start:end]))
    if candidates:
        return "\n".join(candidates)[:chars]
    return text[:chars]  # fallback: short prefix

# small local skill vocabulary for regex-first extraction
KNOWN_SKILLS = {
    "python","tensorflow","pytorch","docker","sql","flask","fastapi","pandas","numpy",
    "scikit-learn","keras","aws","gcp","azure","git","powerbi","tableau","excel"
}

def regex_extract_skills(text):
    """Return the sorted, lowercased KNOWN_SKILLS entries mentioned in `text`.

    The text is split into tokens made of letters, digits and ``+ # . -``
    (so names like "scikit-learn" stay in one piece), and each token is
    compared case-insensitively against KNOWN_SKILLS.
    """
    # tokenise and match to known skills set
    tokens = re.findall(r"[A-Za-z0-9\+\#\.\-]{2,}", text)
    # "." and "-" are allowed inside a token but also end sentences and start
    # bullet points ("... or FastAPI."), so trim them from the token edges
    # before the lookup; otherwise such skills are silently missed.
    normalized = {t.strip(".-").lower() for t in tokens}
    return sorted(normalized & KNOWN_SKILLS)

# cache load/save
# NOTE(security): pickle.load can execute arbitrary code. Only load a cache
# file that this notebook wrote itself; never point CACHE_FILE at a file
# obtained from an untrusted source.
_CACHE_LOCK = threading.Lock()  # serialises writers: save_cache() runs in pool threads

def _load_cache():
    """Return the dict stored in CACHE_FILE, or an empty dict if it is missing or unreadable."""
    if not Path(CACHE_FILE).exists():
        return {}
    try:
        with open(CACHE_FILE, "rb") as f:
            loaded = pickle.load(f)
    except (pickle.UnpicklingError, EOFError, OSError) as e:
        # A truncated/corrupt cache should cost a re-extraction, not break the notebook.
        print(f"Ignoring unreadable cache file {CACHE_FILE}: {e}")
        return {}
    return loaded if isinstance(loaded, dict) else {}

CACHE = _load_cache()

def save_cache():
    """Persist CACHE to CACHE_FILE.

    Holds a lock so that concurrent callers do not interleave their writes,
    pickles a snapshot so the live dict is never iterated while another
    thread inserts into it, and writes to a temporary file that is then
    swapped in with os.replace so a reader never sees a half-written cache.
    """
    with _CACHE_LOCK:
        snapshot = dict(CACHE)
        tmp_path = f"{CACHE_FILE}.tmp"
        with open(tmp_path, "wb") as f:
            pickle.dump(snapshot, f)
        os.replace(tmp_path, CACHE_FILE)


#### Define Job Description agent and extract JD skills

In [4]:
JD_TEXT = """
We are hiring a Machine Learning Engineer with experience in Python, TensorFlow or PyTorch, SQL,
Docker, and experience deploying models to production. Bonus: experience with Flask or FastAPI.
"""

# reuse cache if present
# NOTE: the cache key is the fixed string "JD_SKILLS" and does not depend on
# JD_TEXT -- delete that cache entry (or CACHE_FILE) after editing the JD.
if "JD_SKILLS" in CACHE:
    jd_skills = CACHE["JD_SKILLS"]
else:
    # prefilter JD (small)
    jd_snip = prefilter_text(JD_TEXT, chars=1200)
    # call job_description_extractor_agent once
    try:
        jd_reply = job_description_extractor_agent.initiate_chat(
            recipient=job_description_extractor_agent,
            message=f"Extract skills and return JSON: {{'skills':[]}}\n\n{jd_snip}",
            max_turns=1
        )
        # The chat result exposes its messages as `chat_history` (a list of
        # dicts with a "content" key); `all_messages` is kept as a fallback
        # for reply objects that use that attribute instead.
        raw = getattr(jd_reply, "chat_history", None) or getattr(jd_reply, "all_messages", None)
        if raw:
            # find last non-empty text
            txt = ""
            for m in reversed(raw):
                if isinstance(m, dict):
                    t = m.get("content")
                else:
                    t = getattr(m, "text", None) or getattr(m, "content", None) or str(m)
                if isinstance(t, str) and t.strip():
                    txt = t.strip()
                    break
        else:
            txt = str(jd_reply)
        # strip markdown code fences the model may wrap around its JSON
        txt = txt.replace("```json","").replace("```","").strip()
        try:
            parsed = json.loads(txt)
        except ValueError:  # json.JSONDecodeError: reply was not valid JSON
            parsed = None
        if isinstance(parsed, dict) and "skills" in parsed:
            parsed = parsed["skills"]
        if isinstance(parsed, list):
            jd_skills = [s.strip() for s in parsed if isinstance(s, str) and s.strip()]
        else:
            jd_skills = []
        if not jd_skills:
            # unusable or empty LLM answer: fall back to the local vocabulary
            jd_skills = regex_extract_skills(jd_snip)
    except Exception as e:
        print("JD extraction error:", e)
        jd_skills = regex_extract_skills(jd_snip)

    CACHE["JD_SKILLS"] = jd_skills
    save_cache()

print("JD skills:", jd_skills)


JD skills: ['docker', 'flask', 'python', 'pytorch', 'sql', 'tensorflow']


#### Define Resume Extractor agent

In [5]:
# Cell D: wrapper to call resume agent once (strict prompt + parse JSON)
def _last_reply_text(reply):
    """Return the last non-empty message text of an agent reply.

    Messages are read from `reply.chat_history` (falling back to
    `reply.all_messages`). Dict messages are read through their "content"
    key, other objects through `.text` / `.content` / `str()`. If the reply
    carries no message list, `str(reply)` is returned; if no message holds
    text, "" is returned.
    """
    messages = getattr(reply, "chat_history", None) or getattr(reply, "all_messages", None)
    if not messages:
        return str(reply)
    for m in reversed(messages):
        if isinstance(m, dict):
            t = m.get("content")
        else:
            t = getattr(m, "text", None) or getattr(m, "content", None) or str(m)
        if isinstance(t, str) and t.strip():
            return t.strip()
    return ""


def llm_extract_resume_skills(snippet):
    """Ask the resume agent once for the skills in `snippet` and return them as a list.

    The reply is expected to be JSON, either ``{"skills": [...]}`` or a bare
    list; markdown code fences around it are removed first. Non-string
    entries are dropped and the remaining entries are stripped.

    Fallbacks:
      * the agent call raises -> the error is printed and [] is returned;
      * the reply is not valid JSON -> best-effort: the first 50
        case-insensitively unique tokens of the reply text are returned;
      * the JSON has any other shape -> [].
    """
    try:
        prompt = f"Extract skills ONLY and return JSON: {{'skills':[]}}\n\n{snippet}"
        reply = resume_skill_extractor_agent.initiate_chat(
            recipient=resume_skill_extractor_agent,
            message=prompt,
            max_turns=1
        )
        txt = _last_reply_text(reply)
    except Exception as e:
        # no reply text exists in this case, so there is nothing to tokenise
        print("Resume skill extraction error:", e)
        return []

    txt = txt.replace("```json","").replace("```","").strip()
    try:
        parsed = json.loads(txt)
    except ValueError:  # json.JSONDecodeError: reply was not valid JSON
        # fallback token extraction
        seen = []
        seen_lower = set()  # O(1) membership instead of rescanning `seen`
        for t in re.findall(r"[A-Za-z0-9\+\#\.\-]{2,}", txt):
            key = t.lower()
            if key not in seen_lower:
                seen_lower.add(key)
                seen.append(t)
        return seen[:50]

    if isinstance(parsed, dict) and "skills" in parsed:
        parsed = parsed["skills"]
    if isinstance(parsed, list):
        return [s.strip() for s in parsed if isinstance(s, str)]
    return []


#### Extract skills from resume

In [6]:
# Cell E: process one resume (used by threadpool)
def process_resume(path):
    """Extract the skill list of one resume file; returns ``(path, skills)``.

    A cached result for the path is returned immediately. Otherwise the file
    is read (PDF reader for ".pdf", DOCX reader for everything else), reduced
    to a short snippet, and scanned with the local vocabulary; the LLM is
    called only when that scan finds nothing. The skills are stripped,
    entries shorter than two characters are dropped, case-insensitive
    duplicates are removed (first occurrence wins), and the result is stored
    in the cache and saved. A file that cannot be read yields ``(path, [])``
    and is not cached.
    """
    path = str(path)
    if path in CACHE:
        return path, CACHE[path]

    # read file
    try:
        is_pdf = path.lower().endswith(".pdf")
        text = read_pdf(path) if is_pdf else read_docx(path)
    except Exception as e:
        print(f"Error reading {path}: {e}")
        return path, []

    snippet = prefilter_text(text, chars=1400)

    # regex-first; one LLM call only when the local scan comes back empty
    skills = regex_extract_skills(snippet) or llm_extract_resume_skills(snippet)

    # normalize & dedupe, keeping first-seen order
    unique_skills = []
    seen_lower = set()
    for raw_skill in skills:
        if not raw_skill:
            continue
        cleaned = raw_skill.strip()
        if len(cleaned) <= 1:
            continue
        key = cleaned.lower()
        if key in seen_lower:
            continue
        seen_lower.add(key)
        unique_skills.append(cleaned)

    CACHE[path] = unique_skills
    save_cache()
    return path, unique_skills


#### Process resumes in parallel using ThreadPoolExecutor

In [7]:
# Cell F: parallel extraction over folder
# Resume files to screen. Sorted because os.listdir returns entries in
# arbitrary order; a missing folder is treated like an empty one so the
# "no files" message below is shown instead of a FileNotFoundError.
if os.path.isdir(RESUME_FOLDER):
    files = sorted(
        os.path.join(RESUME_FOLDER, f) for f in os.listdir(RESUME_FOLDER)
        if f.lower().endswith((".pdf", ".docx"))
    )
else:
    files = []

extracted = []  # list of dicts {path, skills}
if not files:
    print("No resume files found in", RESUME_FOLDER)
else:
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # map each future back to its path so failures can be attributed
        futures = {pool.submit(process_resume, p): p for p in files}
        for fut in as_completed(futures):
            try:
                p, skills = fut.result()
            except Exception as e:
                p = futures[fut]; skills = []
                print(f"Error processing {p}: {e}\n{traceback.format_exc()}")
            extracted.append({"path": p, "skills": skills})
    # as_completed yields in completion order, which varies between runs;
    # sort so later ranking and tie-breaking are reproducible.
    extracted.sort(key=lambda r: r["path"])

# show
for r in extracted:
    print(os.path.basename(r["path"]), "→", r["skills"])


Lakshmi_resume.pdf → ['docker', 'flask', 'pandas', 'python', 'sql', 'tensorflow']
PriyaRaj_resume.docx → ['excel', 'powerbi', 'python', 'sql']
John_resume.pdf → ['aws', 'docker', 'fastapi', 'python', 'pytorch', 'tensorflow']


#### Display Results

In [13]:
# Cell G: compare locally (deterministic) and (optionally) verify via comparator agent
def normalize_token(s):
    """Lowercase `s` and remove every character that is not a-z or 0-9."""
    return re.sub(r"[^a-z0-9]", "", s.lower())

# Distinct normalized JD skills; "" (a skill made only of punctuation) is not a skill.
jd_norm = {normalize_token(s) for s in jd_skills}
jd_norm.discard("")

final = []
for r in extracted:
    res_skills = r["skills"]
    res_norm = {normalize_token(s) for s in res_skills}
    matched_norm = res_norm & jd_norm
    matched = [s for s in res_skills if normalize_token(s) in matched_norm]
    # Score on the normalized sets: counting raw strings would let spelling
    # variants of one skill (e.g. "scikit-learn" / "Scikit Learn") be counted
    # twice and push the score above 100%.
    score = round(len(matched_norm)/len(jd_norm)*100, 1) if jd_norm else 0.0

    # optional: ask comparator agent to return a JSON verification 
    # comparator_reply = comparator.initiate_chat(recipient=comparator, message=f'Compare resume_skills: {res_skills} and jd_skills: {jd_skills} and return JSON', max_turns=1)

    final.append({"file": os.path.basename(r["path"]), "skills": res_skills, "matched": matched, "score": score})

# sort & print
final = sorted(final, key=lambda x: x["score"], reverse=True)
print("\nSUMMARY")
print("---------")
for row in final:
    print(f"{row['file']}: {row['score']}% | matched: {row['matched']} | skills: {row['skills']}")



SUMMARY
---------
Lakshmi_resume.pdf: 83.3% | matched: ['docker', 'flask', 'python', 'sql', 'tensorflow'] | skills: ['docker', 'flask', 'pandas', 'python', 'sql', 'tensorflow']
John_resume.pdf: 66.7% | matched: ['docker', 'python', 'pytorch', 'tensorflow'] | skills: ['aws', 'docker', 'fastapi', 'python', 'pytorch', 'tensorflow']
PriyaRaj_resume.docx: 33.3% | matched: ['python', 'sql'] | skills: ['excel', 'powerbi', 'python', 'sql']


#### Store the best resume in Google Sheets

In [9]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# configure
SERVICE_ACCOUNT_KEYFILE = 'resumescreeningagent-2d0936d41fab.json'  # service-account JSON key file
SPREADSHEET_KEY = '14PytqwB8Wl29LFyUmZkn2iL6DujgXPpYvLOPmEKVbyw'     # target spreadsheet
scope = ['https://www.googleapis.com/auth/spreadsheets']

if not final:
    # max() raises ValueError on an empty sequence; nothing to store when no
    # resume was screened.
    print("No screened resumes to save; skipping Google Sheet update.")
else:
    creds = ServiceAccountCredentials.from_json_keyfile_name(SERVICE_ACCOUNT_KEYFILE, scope)
    client = gspread.authorize(creds)
    sheet = client.open_by_key(SPREADSHEET_KEY).sheet1

    # highest score wins; on ties max() keeps the first entry of `final`
    best = max(final, key=lambda x: x['score'])
    sheet.append_row([best['file'], best['score'], ", ".join(best['matched'])])
    print("Saved best to Google Sheet:", best['file'])
  

Saved best to Google Sheet: Lakshmi_resume.pdf


#### OPTIMIZATION SUMMARY 

##### * Prefiltered resumes to send only skill-relevant text — saves tokens
##### * Used regex-based quick skill extraction before calling the LLM
##### * Structured all prompts for short JSON-only outputs (no long chats)
##### * Set temperature to 0 for consistent, minimal responses
##### * Parallelized resume screening using a pool of worker threads (ThreadPoolExecutor) for speed
##### * Reduced repetitive agent calls — one-shot skill extraction per file
##### * Normalized skill text before comparison (lowercased, non-alphanumeric characters removed)
##### * Final results summarized and best match stored efficiently in Google Sheet

##### Outcome: Faster, cheaper, and smarter hierarchical resume screening pipeline!