In [None]:
import sys
# Add the parent directory to the module search path; presumably this is what
# lets the `job_agent` package imports in the next cell resolve when the
# notebook is run from a subdirectory of the project.
sys.path.append("..")

In [None]:
from job_agent.ingest.greenhouse import fetch_jobs
from job_agent.ingest.parser import parse_greenhouse_job
from job_agent.memory.store import JobStore
from job_agent.models.profile import UserProfile
from job_agent.rank.rules import passes_rules
from job_agent.rank.signals import extract_signals
from job_agent.rank.scorer import combine_scores
from job_agent.rank.text import job_text, profile_query
from job_agent.rank.lexical import LexicalScorer
from job_agent.rank.semantic import SemanticScorer
from job_agent.rank.signals import extract_signals
from job_agent.rank.scorer import combine_scores
from job_agent.enrich.fetch import fetch_job_page
from job_agent.enrich.extract import extract_job_text
from job_agent.memory.store import JobStore
from job_agent.generate.cover_letter import generate_cover_letter
from job_agent.generate.local_llm import LocalLLM
from job_agent.generate.resume import parse_resume_pdf

import numpy as np
import sklearn
import torch
from sentence_transformers import SentenceTransformer

In [None]:
# Handle to the job database that the rest of the notebook reads and writes.
store = JobStore("job_agent/data/jobs.db")

# Fetch the raw board for a single company, then convert every raw posting
# with the Greenhouse parser.
company = "airbnb"
raw_jobs = fetch_jobs(company)
parsed_jobs = list(
    map(lambda raw_posting: parse_greenhouse_job(raw_posting, company), raw_jobs)
)

[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/airbnb/jobs
[ingest] response received
[ingest] 203 jobs found


In [None]:
# Eyeball the first 100 parsed postings, one per line, each followed by a rule.
for job in parsed_jobs[:100]:
    print(job, "-" * 60, sep="\n")

JobPosting(job_id='7467432', company='airbnb', title='Acquisition Lead, Experiences, Mexico City (12 month contract)', location='Mexico', description='', apply_url='https://careers.airbnb.com/positions/7467432?gh_jid=7467432', posted_date=datetime.date(2025, 12, 15), source='greenhouse')
------------------------------------------------------------
JobPosting(job_id='7467455', company='airbnb', title='Acquisition Manager, Experiences, Mexico City (12 month contract)', location='Mexico', description='', apply_url='https://careers.airbnb.com/positions/7467455?gh_jid=7467455', posted_date=datetime.date(2025, 12, 15), source='greenhouse')
------------------------------------------------------------
JobPosting(job_id='7403612', company='airbnb', title='AirCover UX Enablement Manager', location='Canada', description='', apply_url='https://careers.airbnb.com/positions/7403612?gh_jid=7403612', posted_date=datetime.date(2025, 12, 19), source='greenhouse')
----------------------------------------

In [None]:
# Persist the freshly parsed postings; save_jobs reports how many were new.
new_count = store.save_jobs(parsed_jobs)
print(f"Inserted {new_count} new jobs")

# Reload everything held in the store. Every later cell works on `all_jobs`
# (the database contents), not on `parsed_jobs` from this fetch.
all_jobs = store.load_all_jobs()
total_in_db = len(all_jobs)
print(f"Total jobs in DB: {total_in_db}")

Inserted 0 new jobs
Total jobs in DB: 204


In [None]:
# Candidate profile that drives the hard rules, the query text and the signals.
_target_roles = [
    "ml_engineer",
    "applied_scientist",
    "Senior Data Scientist - Inference, Global Markets",
]
_core_skills = [
    "machine learning",
    "modeling",
    "training",
    "evaluation",
    "pytorch",
    "statistics",
]
_exclude_keywords = ["front end"]

profile = UserProfile(
    target_roles=_target_roles,
    core_skills=_core_skills,
    exclude_keywords=_exclude_keywords,
)

In [None]:
# One text and one id per stored job, both in `all_jobs` order so that index i
# of `job_texts` and index i of `job_ids` refer to the same posting.
job_texts = list(map(job_text, all_jobs))
job_ids = [posting.job_id for posting in all_jobs]

# Flatten the profile into a single query string; left as the last expression
# so the notebook displays it.
query_text = profile_query(profile)
query_text

'Target roles: ml_engineer, applied_scientist, Senior Data Scientist - Inference, Global Markets. Core skills: machine learning, modeling, training, evaluation, pytorch, statistics.'

## Hard Filtering >> Lexical and Semantic Scoring

In [None]:
# Keep only the postings that pass the hard rules for this profile.
filtered_jobs = list(
    filter(lambda posting: passes_rules(posting, profile), all_jobs)
)

# Report how many postings the rules removed.
print(f"Jobs before hard rules: {len(all_jobs)}")
print(f"Jobs after hard rules: {len(filtered_jobs)}")

Jobs before hard rules: 204
Jobs after hard rules: 204


In [None]:
# Fit the lexical scorer on the text of every stored job (paired with its
# job id), then score the profile query against that corpus.
lex = LexicalScorer()
lex.fit(job_texts, job_ids)
# The ranking cell reads this with `lex_scores.get(job_id, 0.0)`, i.e. it is
# used as a mapping from job id to lexical score.
lex_scores = lex.score_query(query_text)

In [None]:
# Sentence-embedding model used for the semantic similarity score.
_embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
sem = SemanticScorer(_embedding_model_name)

# Embed the query as a one-item batch and keep its single row.
query_emb = sem.embed([query_text])[0]

# Embed every job text; rows follow `job_texts` order, i.e. `all_jobs` order,
# NOT `filtered_jobs` order.
job_embs = sem.embed(job_texts)

In [None]:
# `job_embs` has one row per entry of `job_texts`, which was built from
# `all_jobs`. Pairing it positionally with `filtered_jobs` is only correct when
# the hard rules removed nothing: as soon as one job is dropped, every later
# job would be scored against another job's embedding. Look embeddings up by
# job id instead so the pairing stays right regardless of filtering.
emb_by_job_id = dict(zip(job_ids, job_embs))

ranked = []

for job in filtered_jobs:
    emb = emb_by_job_id[job.job_id]

    s_sem = sem.similarity(query_emb, emb)
    # Jobs the lexical scorer has no entry for fall back to a score of 0.0.
    s_lex = lex_scores.get(job.job_id, 0.0)
    sigs = extract_signals(job, profile)

    score = combine_scores(
        semantic=s_sem,
        lexical=s_lex,
        signals=sigs,
        penalty=0
    )

    # Keep the final score first so the sort key is trivial; the full breakdown
    # and the job travel along for display and enrichment.
    ranked.append((score["final"], score, job))

# Best match first. Sort on the score only; the tuples also hold dicts and job
# objects, which should not take part in the comparison.
ranked.sort(reverse=True, key=lambda x: x[0])

In [None]:
# Show the ten best matches: one line of score components, then title/company,
# the application link and a separator rule.
for final, breakdown, job in ranked[:10]:
    score_line = "".join(
        (
            f"{final:.3f} | ",
            f"sem={breakdown['semantic']:.3f} ",
            f"lex={breakdown['lexical']:.3f} ",
            f"skills={breakdown['skill_overlap']:.2f} ",
            f"depth={breakdown['ml_depth']:.2f} ",
            f"own={breakdown['ownership']:.2f} ",
            f"pen={breakdown['penalty']:.2f}",
        )
    )
    print(score_line)
    print(f"{job.title} @ {job.company}", job.apply_url, "-" * 80, sep="\n")


0.372 | sem=0.337 lex=0.624 skills=0.00 depth=0.00 own=0.00 pen=0.00
Senior Data Scientist - Inference, Global Markets @ airbnb
https://careers.airbnb.com/positions/7446449?gh_jid=7446449
--------------------------------------------------------------------------------
0.340 | sem=0.470 lex=0.216 skills=0.17 depth=0.00 own=0.00 pen=0.00
Senior Staff Machine Learning Engineer - AI Safety and Guardrail @ airbnb
https://careers.airbnb.com/positions/7159569?gh_jid=7159569
--------------------------------------------------------------------------------
0.337 | sem=0.440 lex=0.262 skills=0.17 depth=0.00 own=0.00 pen=0.00
Senior Staff Machine Learning Engineer, Community Support Engineering @ airbnb
https://careers.airbnb.com/positions/6757302?gh_jid=6757302
--------------------------------------------------------------------------------
0.337 | sem=0.457 lex=0.230 skills=0.17 depth=0.00 own=0.00 pen=0.00
Staff Machine Learning Engineer - Community Support Engineering @ airbnb
https://careers.

## Extracting Details of Top-Ranked Jobs

In [None]:
TOP_N = 5


def _detail_text_for(posting):
    """Return the detail text for `posting`, fetching it only on a cache miss.

    The store is consulted first. When it has nothing for this job id, the
    posting's apply page is fetched, its text extracted, and the result saved
    back to the store before being returned.
    """
    cached_text = store.load_job_detail(posting.job_id)
    if cached_text is not None:
        return cached_text

    print(f"[enrich] fetching {posting.job_id}")
    page_html = fetch_job_page(posting.apply_url)
    extracted_text = extract_job_text(page_html)
    store.save_job_detail(posting.job_id, extracted_text)
    return extracted_text


# Attach the detail text to each of the TOP_N best-ranked jobs.
enriched = []
for final, breakdown, job in ranked[:TOP_N]:
    full_text = _detail_text_for(job)
    enriched.append((final, breakdown, job, full_text))


In [None]:
# Dump every enriched posting: score components, title/company, link, the
# length of the extracted text and then the text itself.
# To show a preview only, print text[:1000] instead of the full text.
for final, breakdown, job, text in enriched:
    score_line = "".join(
        (
            f"{final:.3f} | ",
            f"sem={breakdown['semantic']:.3f} ",
            f"lex={breakdown['lexical']:.3f} ",
            f"skills={breakdown['skill_overlap']:.2f} ",
            f"depth={breakdown['ml_depth']:.2f} ",
            f"own={breakdown['ownership']:.2f} ",
            f"pen={breakdown['penalty']:.2f}",
        )
    )
    print(score_line)
    print(f"{job.title} @ {job.company}")
    print(job.apply_url)

    short_rule = "-"*50
    print(short_rule)
    print(f"[enriched text length] {len(text)} chars")
    print(short_rule)

    print(text)
    print("-" * 100)


0.372 | sem=0.337 lex=0.624 skills=0.00 depth=0.00 own=0.00 pen=0.00
Senior Data Scientist - Inference, Global Markets @ airbnb
https://careers.airbnb.com/positions/7446449?gh_jid=7446449
--------------------------------------------------
[enriched text length] 5120 chars
--------------------------------------------------
Skip to main content
Careers
Life at Airbnb
Job Search
Connect Engineering Apprenticeship
Internship Programs
Contractor Roles
Life at Airbnb
Job Search
Connect Engineering Apprenticeship
Internship Programs
Contractor Roles
Senior Data Scientist - Inference, Global Markets
China
Role overview
Application
Airbnb was born in 2007 when two hosts welcomed three guests to their San Francisco home, and has since grown to over 5 million hosts who have welcomed over 2 billion guest arrivals in almost every country across the globe. Every day, hosts offer unique stays and experiences that make it possible for guests to connect with communities in a more authentic way.
The Com

## Generate Cover Letter 

In [None]:

# Extract the resume text from the PDF that sits one directory above the notebook.
resume_text = parse_resume_pdf("../resume.pdf")

# Sanity check: total length, then the first 800 characters.
# NOTE(review): the preview prints personal contact details from the resume;
# clear this cell's output before sharing or committing the notebook.
_resume_preview = resume_text[:800]
print(len(resume_text))
print(_resume_preview)

5253
Manush Kalwari
manushkalwari141@gmail.com | LinkedIn:manush-kalwari | GitHub:ManushKalwari | Portfolio Website
Technologies
Machine Learning & Generative AI: PyTorch, TensorFlow, HuggingFace, scikit-learn, LangChain, LangGraph, Optimizations (LoRA, GRPO,
quantization, FlashAttention), Docker, CI/CD (GitHub Actions, GitLab), Comet ML, Weights & Biases
Cloud: AWS (S3, SageMaker, Lambda, EC2), GCP (Vertex AI, BigQuery , Cloud Run, GKE, Composer, Dataflow)
High Performance ML: CUDA, Sharding, multi-GPU training, profiling, distributed training
Education
Columbia University, MS in Electrical Engineering (Specialization in ML) Sept 2024 – Dec 2025 (Exp.)
• Coursework: Advanced Deep Learning, Scaling LLM Systems, High-Performance ML, Generative AI, Mathematics of ML
PROJECTS
ScaleRAG - Multi-Moda


In [None]:
# Pick the posting to write for explicitly. Previously this cell relied on
# `job` being left over from an earlier `for` loop (whichever entry happened to
# be iterated last) and passed `job_text` -- the helper *function* imported from
# job_agent.rank.text -- where the posting's text was expected.
TARGET_RANK = 0  # index into `enriched`; 0 is the best-ranked posting

assert enriched, "No enriched jobs available - run the enrichment cell first"
_, _, target_job, target_job_text = enriched[TARGET_RANK]

# NOTE(review): the recorded run of this cell failed with an OSError saying
# 'Qwen/Qwen1.5-1.8B-Instruct' is not a valid Hugging Face model identifier.
# That id is not set in this notebook, so it presumably is LocalLLM's default;
# the Qwen1.5 instruction-tuned checkpoints appear to be published with a
# '-Chat' suffix -- TODO confirm and fix in job_agent.generate.local_llm.
my_llm = LocalLLM()

cover_letter = generate_cover_letter(
    resume_text=resume_text,
    job_text=target_job_text,      # enriched text of the selected posting
    company=target_job.company,
    role=target_job.title,
    llm_call_fn=my_llm,
)

print(cover_letter)

OSError: Qwen/Qwen1.5-1.8B-Instruct is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`