In [1]:
import sys
sys.path.append("..")

In [None]:
from job_agent.discover.companies import get_company_list
from job_agent.ingest.greenhouse import fetch_jobs
from job_agent.ingest.parser import parse_greenhouse_job
from job_agent.memory.store import JobStore
from job_agent.models.profile import UserProfile
from job_agent.rank.rules import passes_rules
from job_agent.rank.text import job_text, profile_query
from job_agent.rank.lexical import LexicalScorer
from job_agent.rank.semantic import SemanticScorer
from job_agent.enrich.fetch import fetch_job_page
from job_agent.enrich.extract import extract_job_text
from job_agent.memory.store import JobStore
from job_agent.generate.cover_letter import generate_cover_letter_local, generate_cover_letter_gpt
from job_agent.generate.local_llm import LocalLLM
from job_agent.generate.resume import parse_resume_pdf


import numpy as np
from dotenv import load_dotenv
from openai import OpenAI
import os
import sklearn
import torch
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


## Fetching and Storing new Jobs

In [4]:
store = JobStore("job_agent/data/jobs.db")

user_companies = ["airbnb", "spotify", "adobe"]  
companies = get_company_list(user_companies)

for company in companies:
    raw_jobs = fetch_jobs(company)
    if not raw_jobs:
        continue

    parsed_jobs = [parse_greenhouse_job(j, company) for j in raw_jobs]
    store.save_jobs(parsed_jobs)

[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/adobe/jobs
[ingest] no Greenhouse board found for 'adobe', skipping
[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/airbnb/jobs
[ingest] 200 jobs found for 'airbnb'
[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/databricks/jobs
[ingest] 666 jobs found for 'databricks'
[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/google/jobs
[ingest] no Greenhouse board found for 'google', skipping
[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/meta/jobs
[ingest] no Greenhouse board found for 'meta', skipping
[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/spotify/jobs
[ingest] no Greenhouse board found for 'spotify', skipping
[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/stripe/jobs
[ingest] 519 jobs found for 'stripe'
[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/u

In [5]:
for job in parsed_jobs[:10]:
    print(job)
    print("-" * 60)

JobPosting(job_id='7374078', company='stripe', title='Account Executive, AI Sales', location='SF', description='', apply_url='https://stripe.com/jobs/search?gh_jid=7374078', posted_date=datetime.date(2025, 12, 16), source='greenhouse')
------------------------------------------------------------
JobPosting(job_id='7374073', company='stripe', title='Account Executive, AI Sales', location='SF', description='', apply_url='https://stripe.com/jobs/search?gh_jid=7374073', posted_date=datetime.date(2025, 12, 16), source='greenhouse')
------------------------------------------------------------
JobPosting(job_id='7403230', company='stripe', title='Account Executive, Benelux & Nordics - Existing Business', location='Dublin', description='', apply_url='https://stripe.com/jobs/search?gh_jid=7403230', posted_date=datetime.date(2025, 12, 16), source='greenhouse')
------------------------------------------------------------
JobPosting(job_id='7423968', company='stripe', title='Account Executive, Ben

In [6]:
new_count = store.save_jobs(parsed_jobs)
print(f"Inserted {new_count} new jobs")

all_jobs = store.load_all_jobs()
print(f"Total jobs in DB: {len(all_jobs)}")

Inserted 0 new jobs
Total jobs in DB: 1392


In [7]:
profile = UserProfile(
    target_roles=["ml engineer", "data scientist", "data engineer"],
    core_skills=[
        "machine learning", "deep learning",
        "evaluation", "pytorch", "statistics"
    ],
    location=["US"]
)

## Hard Filtering

In [8]:
import pandas as pd

df = pd.read_csv("../uscities.csv")
print(df.shape)

(31254, 17)


In [9]:
filtered_jobs = [
    job for job in all_jobs
    if passes_rules(job, profile)
]

print(f"Jobs before hard rules: {len(all_jobs)}")
print(f"Jobs after hard rules: {len(filtered_jobs)}")

Jobs before hard rules: 1392
Jobs after hard rules: 250


In [10]:
job_texts = [job_text(j) for j in filtered_jobs]
job_ids = [j.job_id for j in filtered_jobs]
query_text = profile_query(profile)

## Lexical and Semantic Scoring

In [11]:
lex = LexicalScorer()
lex.fit(job_texts, job_ids)
lex_scores = lex.score_query(query_text)

In [12]:
sem = SemanticScorer("sentence-transformers/all-MiniLM-L6-v2")
query_emb = sem.embed([query_text])[0]
job_embs = sem.embed(job_texts)

In [None]:
rows = []

for job, emb in zip(filtered_jobs, job_embs):
    s_sem = sem.similarity(query_emb, emb)
    s_lex = lex_scores.get(job.job_id, 0.0)
    
    rows.append({
        "job": job,
        "semantic": s_sem,
        "lexical": s_lex,
        
    })

In [18]:
top_semantic = sorted(
    rows,
    key=lambda r: r["semantic"],
    reverse=True
)[:10]

print("\n=== TOP 10 BY SEMANTIC SIMILARITY ===\n")

for r in top_semantic:
    job = r["job"]
    print(f"sem={r['semantic']:.3f}")
    print(f"{job.title} @ {job.company}")
    print("-" * 80)


=== TOP 10 BY SEMANTIC SIMILARITY ===

sem=0.407
Staff Machine Learning Engineer, Listings and Host Tools Data and AI @ airbnb
--------------------------------------------------------------------------------
sem=0.401
Staff Machine Learning Engineer - Community Support Engineering @ airbnb
--------------------------------------------------------------------------------
sem=0.359
Staff Machine Learning Engineer, Communication & Connectivity @ airbnb
--------------------------------------------------------------------------------
sem=0.353
Senior Staff Machine Learning Engineer, Communication & Connectivity @ airbnb
--------------------------------------------------------------------------------
sem=0.333
Senior Software Engineer - Fullstack (NYC) @ databricks
--------------------------------------------------------------------------------
sem=0.328
Manager, Field Engineering (Pre-Sales) @ databricks
--------------------------------------------------------------------------------
sem=0.

In [19]:
top_lexical = sorted(
    rows,
    key=lambda r: r["lexical"],
    reverse=True
)[:10]

print("\n=== TOP 10 BY LEXICAL MATCH ===\n")

for r in top_lexical:
    job = r["job"]
    print(f"lex={r['lexical']:.3f}")
    print(f"{job.title} @ {job.company}")
    print("-" * 80)


=== TOP 10 BY LEXICAL MATCH ===

lex=0.422
Staff Machine Learning Engineer - Community Support Engineering @ airbnb
--------------------------------------------------------------------------------
lex=0.407
Staff Machine Learning Engineer, Listings and Host Tools Data and AI @ airbnb
--------------------------------------------------------------------------------
lex=0.369
Staff Machine Learning Engineer, Communication & Connectivity @ airbnb
--------------------------------------------------------------------------------
lex=0.356
Senior Staff Machine Learning Engineer, Communication & Connectivity @ airbnb
--------------------------------------------------------------------------------
lex=0.339
Senior Machine Learning Engineer, Stripe Assistant @ stripe
--------------------------------------------------------------------------------
lex=0.268
Senior Data Scientist - Inference, Global Markets @ airbnb
--------------------------------------------------------------------------------
l

## Combining Scores
### APPROACH I - Reciprocal Rank Fusion

In [20]:
# RRF reciprocal rank fusion

def rrf_fusion(sem_rank, lex_rank, k=60):
    return 1/(k + sem_rank) + 1/(k + lex_rank)

# rank separately
sem_sorted = sorted(rows, key=lambda r: r["semantic"], reverse=True)
lex_sorted = sorted(rows, key=lambda r: r["lexical"], reverse=True)

sem_rank = {id(r): i+1 for i, r in enumerate(sem_sorted)}
lex_rank = {id(r): i+1 for i, r in enumerate(lex_sorted)}

rrf_scored = []
for r in rows:
    score = rrf_fusion(sem_rank[id(r)], lex_rank[id(r)])
    rrf_scored.append((score, r))

rrf_scored.sort(reverse=True, key=lambda x: x[0])

print("\n=== TOP 10: RRF FUSION ===")
for s, r in rrf_scored[:10]:
    j = r["job"]
    print(f"{s:.4f} | {j.title} @ {j.company}")


=== TOP 10: RRF FUSION ===
0.0325 | Staff Machine Learning Engineer - Community Support Engineering @ airbnb
0.0325 | Staff Machine Learning Engineer, Listings and Host Tools Data and AI @ airbnb
0.0317 | Staff Machine Learning Engineer, Communication & Connectivity @ airbnb
0.0312 | Senior Staff Machine Learning Engineer, Communication & Connectivity @ airbnb
0.0282 | Staff Software Engineer, Data Warehouse Compute @ airbnb
0.0280 | Senior Machine Learning Engineer, Stripe Assistant @ stripe
0.0280 | Senior Solutions Engineer @ databricks
0.0279 | Staff Software Engineer, Experimentation Data @ airbnb
0.0279 | Director, Engineering - Databricks Mosaic AI @ databricks
0.0277 | Senior Software Engineer - Fullstack (NYC) @ databricks


### APPROACH II - Product of Experts

In [21]:
# Product of Experts (PoE) — “semantic with lexical a-priori”

sem_vals = np.array([r["semantic"] for r in rows])
lex_vals = np.array([r["lexical"] for r in rows])

mu_s, sd_s = sem_vals.mean(), sem_vals.std() + 1e-9
mu_l, sd_l = lex_vals.mean(), lex_vals.std() + 1e-9

def zsig(x, mu, sd):
    return 1 / (1 + np.exp(-(x - mu)/sd))

def poe_score(sem, lex, a=1.0, b=1.0, eps=1e-6):
    ps = zsig(sem, mu_s, sd_s)
    pl = zsig(lex, mu_l, sd_l)
    return a*np.log(ps + eps) + b*np.log(pl + eps)

poe_scored = []
for r in rows:
    s = poe_score(r["semantic"], r["lexical"], a=1.0, b=1.0)
    poe_scored.append((s, r))

poe_scored.sort(reverse=True, key=lambda x: x[0])

print("\n=== TOP 10: PRODUCT OF EXPERTS ===")
for s, r in poe_scored[:10]:
    j = r["job"]
    print(f"{s:.3f} | sem={r['semantic']:.3f} lex={r['lexical']:.3f} | {j.title}")


=== TOP 10: PRODUCT OF EXPERTS ===
-0.028 | sem=0.407 lex=0.407 | Staff Machine Learning Engineer, Listings and Host Tools Data and AI
-0.029 | sem=0.401 lex=0.422 | Staff Machine Learning Engineer - Community Support Engineering
-0.053 | sem=0.359 lex=0.369 | Staff Machine Learning Engineer, Communication & Connectivity
-0.058 | sem=0.353 lex=0.356 | Senior Staff Machine Learning Engineer, Communication & Connectivity
-0.215 | sem=0.239 lex=0.339 | Senior Machine Learning Engineer, Stripe Assistant
-0.276 | sem=0.245 lex=0.206 | Staff Software Engineer, Experimentation Data
-0.317 | sem=0.211 lex=0.268 | Senior Data Scientist - Inference, Global Markets
-0.343 | sem=0.308 lex=0.129 | Senior Solutions Engineer
-0.346 | sem=0.309 lex=0.128 | Director, Engineering - Databricks Mosaic AI
-0.347 | sem=0.248 lex=0.161 | Staff Software Engineer, Data Warehouse Compute


## Extracting Job Details

In [22]:
TOP_N = 10
enriched = []

for poe, r in poe_scored[:TOP_N]:
    job = r["job"]
    sem = r["semantic"]
    lex = r["lexical"]

    # 1. check cache
    full_text = store.load_job_detail(job.job_id)

    # 2. fetch + extract if missing
    if full_text is None:
        print(f"[enrich] fetching {job.job_id}")
        html = fetch_job_page(job.apply_url)
        full_text = extract_job_text(html)
        store.save_job_detail(job.job_id, full_text)

    enriched.append({
        "poe": poe,
        "semantic": sem,
        "lexical": lex,
        "job": job,
        "full_text": full_text,
    })


In [23]:
for e in enriched:
    j = e["job"]
    print(
        f"{e['poe']:+.3f} | "
        f"sem={e['semantic']:.3f} "
        f"lex={e['lexical']:.3f} | "
        f"{j.title} @ {j.company}"
    )

-0.028 | sem=0.407 lex=0.407 | Staff Machine Learning Engineer, Listings and Host Tools Data and AI @ airbnb
-0.029 | sem=0.401 lex=0.422 | Staff Machine Learning Engineer - Community Support Engineering @ airbnb
-0.053 | sem=0.359 lex=0.369 | Staff Machine Learning Engineer, Communication & Connectivity @ airbnb
-0.058 | sem=0.353 lex=0.356 | Senior Staff Machine Learning Engineer, Communication & Connectivity @ airbnb
-0.215 | sem=0.239 lex=0.339 | Senior Machine Learning Engineer, Stripe Assistant @ stripe
-0.276 | sem=0.245 lex=0.206 | Staff Software Engineer, Experimentation Data @ airbnb
-0.317 | sem=0.211 lex=0.268 | Senior Data Scientist - Inference, Global Markets @ airbnb
-0.343 | sem=0.308 lex=0.129 | Senior Solutions Engineer @ databricks
-0.346 | sem=0.309 lex=0.128 | Director, Engineering - Databricks Mosaic AI @ databricks
-0.347 | sem=0.248 lex=0.161 | Staff Software Engineer, Data Warehouse Compute @ airbnb


## Generating Cover Letter 

In [24]:
resume_text = parse_resume_pdf("../resume.pdf")

print(len(resume_text))
print(resume_text[:800])

5253
Manush Kalwari
manushkalwari141@gmail.com | LinkedIn:manush-kalwari | GitHub:ManushKalwari | Portfolio Website
Technologies
Machine Learning & Generative AI: PyTorch, TensorFlow, HuggingFace, scikit-learn, LangChain, LangGraph, Optimizations (LoRA, GRPO,
quantization, FlashAttention), Docker, CI/CD (GitHub Actions, GitLab), Comet ML, Weights & Biases
Cloud: AWS (S3, SageMaker, Lambda, EC2), GCP (Vertex AI, BigQuery , Cloud Run, GKE, Composer, Dataflow)
High Performance ML: CUDA, Sharding, multi-GPU training, profiling, distributed training
Education
Columbia University, MS in Electrical Engineering (Specialization in ML) Sept 2024 – Dec 2025 (Exp.)
• Coursework: Advanced Deep Learning, Scaling LLM Systems, High-Performance ML, Generative AI, Mathematics of ML
PROJECTS
ScaleRAG - Multi-Moda


### APPROACH I - GPT API

In [25]:
load_dotenv("../keys.env")
cover_letters = []

for i, e in enumerate(enriched[:1], start=1):
    job = e["job"]
    job_text = e["full_text"]

    print(f"\n[generate {i}/10] {job.title} @ {job.company}")

    cl = generate_cover_letter_gpt(
        resume_text=resume_text,
        job_text=job_text,
        company=job.company,
        role=job.title,
    )

    cover_letters.append({
        "job_id": job.job_id,
        "company": job.company,
        "title": job.title,
        "poe": e["poe"],
        "cover_letter": cl,
    })


[generate 1/10] Staff Machine Learning Engineer, Listings and Host Tools Data and AI @ airbnb


In [26]:
for c in cover_letters:
    print("=" * 100)
    print(f"{c['title']} @ {c['company']}  |  PoE={c['poe']:.3f}")
    print("-" * 100)
    print(c["cover_letter"])

Staff Machine Learning Engineer, Listings and Host Tools Data and AI @ airbnb  |  PoE=-0.028
----------------------------------------------------------------------------------------------------
Dear Airbnb Hiring Team,

I am excited to apply for the Staff Machine Learning Engineer, Listings and Host Tools Data and AI role. As a Columbia University MS candidate specializing in ML, I have built and deployed end-to-end ML systems that span data pipelines, model development, and production-grade serving. My work focuses on delivering product-facing capabilities with strong observability, scalability, and close collaboration with cross-functional partners.

In ScaleRAG, I designed a hierarchical multimodal retrieval framework for large corpora, integrating components such as FAISS indexing, vLLM, and PagedAttention. I built an end-to-end evaluation and benchmarking pipeline (including OpenAI GPT-4.1, LangChain, and multiprocessing) and deployed a real-time interface via FastAPI, Next.js, an

### APPROACH II - Local LLM

In [27]:
my_llm = LocalLLM()
cover_letters2 = []

for i, e in enumerate(enriched[:1], start=1):
    job = e["job"]
    job_text = e["full_text"]

    print(f"\n[generate {i}/10] {job.title} @ {job.company}")

    cl = generate_cover_letter_local(
        resume_text=resume_text,
        job_text=job_text,
        company=job.company,
        role=job.title,
        llm=my_llm,
    )

    cover_letters2.append({
        "job_id": job.job_id,
        "company": job.company,
        "title": job.title,
        "poe": e["poe"],
        "cover_letter": cl,
    })


[generate 1/10] Staff Machine Learning Engineer, Listings and Host Tools Data and AI @ airbnb


In [28]:
for c in cover_letters2:
    print("=" * 100)
    print(f"{c['title']} @ {c['company']}  |  PoE={c['poe']:.3f}")
    print("-" * 100)
    print(c["cover_letter"])

Staff Machine Learning Engineer, Listings and Host Tools Data and AI @ airbnb  |  PoE=-0.028
----------------------------------------------------------------------------------------------------
 Diversity And Inclusion
At Airbnb, we believe that diversity makes us better. Our mission is to bring people together through travel and hospitality. We strive to create a workplace culture that reflects this belief and values inclusion, equity, and belonging. We encourage all candidates to apply regardless of race, ethnicity, gender identity, sexual orientation, religion, disability status, veteran status, or any other characteristic protected by law.
We are committed to providing equal employment opportunities to all individuals and do not discriminate based on race, color, national origin, age, sex, marital status, physical or mental disability, veteran status, gender identity, or sexual orientation. If you require reasonable accommodation during the application process due to a disability, 