# Company Q&A Matching Notebook

This notebook loads the preprocessed **two-column data** (ID, `text`) for **company profiles** and **job postings**, splits each text **into paragraph-level chunks**, performs **bi-encoder retrieval** with `jinaai/jina-embeddings-v3` for a set of questions, and **re-ranks** the top matches with a **cross-encoder**.

**Output:** a **DataFrame with one row per company** (company ID + answers per question) that is saved to disk.

## Configuration

In [None]:
from pathlib import Path

# --- Input datasets ---------------------------------------------------------
# Every dataset is expected to carry an ID column plus a 'text' column.
COMPANIES_PARQUET = Path("companies_two_columns.parquet")
COMPANIES_CSV = Path("companies_two_columns.csv")

JOBS_PARQUET = Path("jobs_two_columns.parquet")
JOBS_CSV = Path("jobs_two_columns.csv")

# --- Column names -----------------------------------------------------------
# Company table: identifier and free-text column (change if your data differs).
COMPANY_ID_COL = "prsId"
COMPANY_TEXT_COL = "text"

# Job table: identifier, free-text column, and the foreign key to the company.
# The job ID is informational only and does not appear in the final output.
JOB_ID_COL = "jobId"
JOB_TEXT_COL = "text"
JOB_COMPANY_ID_COL = "prsId"

# --- Questions --------------------------------------------------------------
# Set QUESTIONS_IMAGE_PATH (e.g. Path("questions.png")) to OCR the questions
# from an image; leave it as None to use QUESTIONS_MANUAL instead.
QUESTIONS_IMAGE_PATH = None
# Order matters: answer columns are numbered in this order.
QUESTIONS_MANUAL = [
    "Which technologies does the company use?",
    "Does the company offer remote work?",
    "What benefits are mentioned in job postings?",
]

# --- Retrieval / ranking ----------------------------------------------------
BI_ENCODER_MODEL = "jinaai/jina-embeddings-v3"
CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
# Number of bi-encoder candidates per (question, company) sent to the
# cross-encoder.
TOP_K_BI_PER_COMPANY = 10
# Paragraphs longer than this many characters are cut off.
MAX_PARAGRAPH_LEN = 20000
# Paragraphs shorter than this many characters are discarded.
MIN_PARAGRAPH_CHARS = 20
BATCH_SIZE_EMB = 64
BATCH_SIZE_CROSS = 64

# --- Output files -----------------------------------------------------------
OUTPUT_CSV = Path("company_answers.csv")
OUTPUT_PARQUET = Path("company_answers.parquet")

## Load Data

In [None]:
import pandas as pd
from pathlib import Path

def _load_two_col(path_parquet: Path, path_csv: Path, id_col: str, text_col: str,
                  extra_cols=()) -> pd.DataFrame:
    """Load a dataset and reduce it to its ID and text columns.

    The Parquet file is preferred; the CSV file is used only when the Parquet
    file does not exist.

    Args:
        path_parquet: Location of the Parquet version of the dataset.
        path_csv: Location of the CSV version of the dataset.
        id_col: Name of the identifier column (required).
        text_col: Name of the text column (required).
        extra_cols: Optional additional column names to keep *if they exist*
            in the file (e.g. a foreign key). Missing names are ignored.

    Returns:
        A DataFrame with ``id_col``, ``text_col`` and any existing
        ``extra_cols``; rows with a missing ID or text are dropped and exact
        duplicate rows are removed.

    Raises:
        FileNotFoundError: If neither file exists.
        KeyError: If ``id_col`` or ``text_col`` is not present.
    """
    if path_parquet.exists():
        df = pd.read_parquet(path_parquet)
    elif path_csv.exists():
        df = pd.read_csv(path_csv)
    else:
        raise FileNotFoundError(f"Neither {path_parquet} nor {path_csv} was found.")
    # Standardize
    if id_col not in df.columns or text_col not in df.columns:
        raise KeyError(f"Expected columns not found: id_col={id_col}, text_col={text_col}. Got: {df.columns.tolist()}")
    # dict.fromkeys de-duplicates while preserving order (an extra column may
    # coincide with the ID column).
    keep = list(dict.fromkeys([id_col, text_col, *(c for c in extra_cols if c in df.columns)]))
    # Only the two mandatory columns decide whether a row is usable.
    df = df[keep].dropna(subset=[id_col, text_col]).drop_duplicates()
    return df


companies = _load_two_col(COMPANIES_PARQUET, COMPANIES_CSV, COMPANY_ID_COL, COMPANY_TEXT_COL)

# The job -> company key must survive the column selection inside the loader,
# otherwise it can never be found below. Ask the loader to keep the configured
# key as well as the known alternative spellings.
_JOIN_COL_CANDIDATES = ["prsId", "companyId", "company_id", "employerId"]
jobs = _load_two_col(
    JOBS_PARQUET, JOBS_CSV, JOB_ID_COL, JOB_TEXT_COL,
    extra_cols=[JOB_COMPANY_ID_COL, *_JOIN_COL_CANDIDATES],
)

# Ensure we have a join key from jobs -> company
if JOB_COMPANY_ID_COL not in jobs.columns:
    for cand in _JOIN_COL_CANDIDATES:
        if cand in jobs.columns:
            JOB_COMPANY_ID_COL = cand
            break
    else:
        raise KeyError(f"Join column for Jobs->Company not found. Please set JOB_COMPANY_ID_COL. Available: {jobs.columns.tolist()}")

# Keep only the columns that are actually used, and drop postings that cannot
# be attributed to a company (a NaN key would create bogus corpus entries).
jobs = jobs[list(dict.fromkeys([JOB_ID_COL, JOB_TEXT_COL, JOB_COMPANY_ID_COL]))]
jobs = jobs.dropna(subset=[JOB_COMPANY_ID_COL]).drop_duplicates()

display(companies.head())
display(jobs.head())

## Chunking

In [None]:
import re
from typing import List

# One or more blank (or whitespace-only) lines separate paragraphs.
PARA_SPLIT_RE = re.compile(r"\n\s*\n+")


def split_into_paragraphs(text: str, min_chars=None, max_len=None) -> List[str]:
    """Split *text* into cleaned paragraph-level chunks.

    Paragraphs are separated by blank lines. When that yields at most one
    paragraph and the text contains line breaks, every non-empty line is
    treated as its own paragraph instead.

    Args:
        text: The text to split. Anything that is not a ``str`` yields ``[]``.
        min_chars: Minimum length (characters) for a paragraph to be kept.
            Defaults to the module-level ``MIN_PARAGRAPH_CHARS``.
        max_len: Maximum length (characters); longer paragraphs are cut off
            at this length. Defaults to the module-level ``MAX_PARAGRAPH_LEN``.

    Returns:
        The stripped paragraphs in document order.
    """
    if not isinstance(text, str):
        return []
    # Resolve the defaults at call time so that later changes to the
    # configuration constants are honoured.
    if min_chars is None:
        min_chars = MIN_PARAGRAPH_CHARS
    if max_len is None:
        max_len = MAX_PARAGRAPH_LEN

    paras = [p.strip() for p in PARA_SPLIT_RE.split(text) if p.strip()]
    # Fallback if no double newlines: split by single newlines
    if len(paras) <= 1 and "\n" in text:
        paras = [p.strip() for p in text.split("\n") if p.strip()]
    # Filter short paragraphs; slicing is a no-op for paragraphs within the cap.
    return [p[:max_len] for p in paras if len(p) >= min_chars]

## Build Company Corpus (profiles + job postings)

In [None]:
from collections import defaultdict

# company_id -> [(chunk_text, source_type, source_id), ...]
company_corpus = defaultdict(list)


def _add_chunks(frame, owner_col, text_col, source_type, source_id_col):
    """Append every paragraph of every row in *frame* to ``company_corpus``.

    Each chunk is filed under the row's ``owner_col`` value and tagged with
    ``source_type`` and the row's ``source_id_col`` value.
    """
    for _, record in frame.iterrows():
        owner = record[owner_col]
        for paragraph in split_into_paragraphs(record[text_col]):
            # Indexing the defaultdict only here means that rows without any
            # usable paragraph do not create an (empty) corpus entry.
            company_corpus[owner].append((paragraph, source_type, record[source_id_col]))


# Company profiles: the company is its own source.
_add_chunks(companies, COMPANY_ID_COL, COMPANY_TEXT_COL, "company_profile", COMPANY_ID_COL)

# Job postings: filed under the owning company, sourced by the job ID.
if JOB_COMPANY_ID_COL not in jobs.columns:
    raise KeyError(f"Join column {JOB_COMPANY_ID_COL} missing in jobs.")
_add_chunks(jobs, JOB_COMPANY_ID_COL, JOB_TEXT_COL, "job_posting", JOB_ID_COL)

num_companies = len(company_corpus)
num_chunks = sum(map(len, company_corpus.values()))
print(f"Companies in corpus: {num_companies} | Total chunks: {num_chunks}")

## Load Questions

In [None]:
from typing import List
import re

# Bullet/punctuation characters and whitespace removed from both ends of a line.
_QUESTION_STRIP_CHARS = "-•: \t\r\n\f\v\u00a0"

# A line counts as a question when it begins with one of these words as a
# whole word (so "Issues ..." is not mistaken for an "is ..." question).
_QUESTION_START_RE = re.compile(
    r"(?:what|which|how|does|is|are|has|have|when|where|why)\b",
    re.IGNORECASE,
)


def extract_questions_from_text(text: str) -> List[str]:
    """Extract question-like lines from raw (e.g. OCR) text.

    Each line is stripped of surrounding whitespace and bullet characters.
    A line is kept when it is longer than five characters and either ends
    with ``?`` or starts with a question word.

    Args:
        text: Raw multi-line text.

    Returns:
        The cleaned question strings in their original order.
    """
    questions = []
    for line in text.splitlines():
        candidate = line.strip(_QUESTION_STRIP_CHARS)
        # heuristic: remove very short fragments
        if len(candidate) <= 5:
            continue
        if candidate.endswith("?") or _QUESTION_START_RE.match(candidate):
            questions.append(candidate)
    return questions


def read_questions_from_image(img_path: Path) -> List[str]:
    """OCR the image at *img_path* and return the questions found in it.

    Requires ``pytesseract`` and ``Pillow``; both are imported lazily so the
    notebook works without them when questions are defined manually.

    Args:
        img_path: Path to an image containing one question per line.

    Returns:
        The list of recognised questions (possibly empty).
    """
    import pytesseract
    from PIL import Image

    # Close the image file handle as soon as OCR is done.
    with Image.open(img_path) as img:
        text = pytesseract.image_to_string(img)
    return extract_questions_from_text(text)

# Questions come from OCR when an image path is configured, otherwise from the
# manually maintained list.
if QUESTIONS_IMAGE_PATH:
    QUESTIONS = read_questions_from_image(Path(QUESTIONS_IMAGE_PATH))
else:
    # Skip non-string entries and entries that are empty once stripped.
    QUESTIONS = [q.strip() for q in QUESTIONS_MANUAL if isinstance(q, str) and q.strip()]

# Fail early with a clear message instead of deep inside the encoder.
if not QUESTIONS:
    raise ValueError(
        "No questions available: check QUESTIONS_IMAGE_PATH (OCR result) or QUESTIONS_MANUAL."
    )

print("Questions:", QUESTIONS)

## Matching

### Bi-Encoder Retrieval

In [None]:
from sentence_transformers import SentenceTransformer
import torch
from tqdm.auto import tqdm

# Run on the GPU when torch can see one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
bi_encoder = SentenceTransformer(BI_ENCODER_MODEL, device=device, trust_remote_code=True)

# Flatten the per-company corpus into one global chunk list and remember,
# for every company, which positions of that list belong to it.
all_chunks = []                 # chunk texts in global order
chunk_company_idx = []          # (company_id, idx_in_company), parallel to all_chunks
company_to_global_indices = {}  # company_id -> positions in all_chunks / chunk_emb
offset = 0
for cid, chunks in company_corpus.items():
    n = len(chunks)
    all_chunks.extend(chunk[0] for chunk in chunks)
    chunk_company_idx.extend((cid, i) for i in range(n))
    company_to_global_indices[cid] = list(range(offset, offset + n))
    offset += n

# Chunks and questions are embedded with identical settings; embeddings are
# L2-normalised, so a dot product between them is a cosine similarity.
# NOTE(review): jina-embeddings-v3 documents task-specific adapters selected
# through an encode-time `task` argument; none is selected here — confirm
# whether that is intended.
_encode_options = dict(
    batch_size=BATCH_SIZE_EMB,
    convert_to_tensor=True,
    normalize_embeddings=True,
)
chunk_emb = bi_encoder.encode(all_chunks, show_progress_bar=True, **_encode_options)
question_emb = bi_encoder.encode(QUESTIONS, show_progress_bar=False, **_encode_options)

print(f"Embeddings ready: chunks={chunk_emb.shape}, questions={question_emb.shape}, device={device}")

### Cross-Encoder Reranking

In [None]:
from sentence_transformers import CrossEncoder

# Cross-encoder that re-scores (question, chunk) pairs; loaded onto the same
# device as the bi-encoder.
cross_encoder = CrossEncoder(CROSS_ENCODER_MODEL, device=device)

def answer_for_company(cid, q_idx, top_k=None):
    """Pick the best-matching chunk of one company for one question.

    The company's chunks are first ranked by bi-encoder similarity to the
    question; the ``top_k`` best candidates are then re-scored with the
    cross-encoder and the highest-scoring one is returned.

    Args:
        cid: Company ID (a key of ``company_to_global_indices``).
        q_idx: Index of the question in ``QUESTIONS`` / ``question_emb``.
        top_k: Number of bi-encoder candidates passed to the cross-encoder.
            ``None`` (default) uses the current value of
            ``TOP_K_BI_PER_COMPANY``.

    Returns:
        ``(best_text, best_score, best_global_idx)`` where ``best_score`` is
        the cross-encoder score and ``best_global_idx`` indexes
        ``all_chunks``; ``(None, None, None)`` when the company has no chunks.
    """
    # Resolve the default at call time: a default bound in the signature
    # would freeze the value seen when this cell was executed and ignore any
    # later re-assignment of TOP_K_BI_PER_COMPANY.
    if top_k is None:
        top_k = TOP_K_BI_PER_COMPANY

    # Candidates via Bi-Encoder cosine similarity within the company
    gidx = company_to_global_indices.get(cid, [])
    if not gidx:
        return None, None, None

    q_vec = question_emb[q_idx]               # (d,)
    cand_emb = chunk_emb[gidx]                # (m, d)
    scores = torch.matmul(cand_emb, q_vec)    # (m,)
    _, topi = torch.topk(scores, k=min(top_k, len(gidx)))
    top_global_indices = [gidx[i] for i in topi.tolist()]

    # Cross-Encoder reranking
    question = QUESTIONS[q_idx]
    pairs = [(question, all_chunks[idx]) for idx in top_global_indices]
    rerank_scores = cross_encoder.predict(pairs, batch_size=BATCH_SIZE_CROSS, convert_to_tensor=True)
    # as_tensor is a no-op for tensors (no copy, no copy-construct warning)
    # and still accepts array-like results.
    rerank_scores = torch.as_tensor(rerank_scores)
    best_idx = int(torch.argmax(rerank_scores))
    best_global_idx = top_global_indices[best_idx]
    best_text = all_chunks[best_global_idx]
    best_score = float(rerank_scores[best_idx])
    return best_text, best_score, best_global_idx

# Example test (optional)
# sample_cid = next(iter(company_corpus.keys()))
# print(answer_for_company(sample_cid, 0))

## Build Answers per Company & Save

In [None]:
import pandas as pd
from tqdm.auto import tqdm

# To re-rank a different number of candidates, re-assign the setting here,
# e.g. TOP_K_BI_PER_COMPANY = 5. The value is passed explicitly to
# answer_for_company() below, so the override takes effect immediately.

answer_cols = [f"answer_q{i+1}" for i in range(len(QUESTIONS))]
# Cross-encoder score of each chosen answer, kept for debugging/auditing.
prov_cols = [f"prov_q{i+1}" for i in range(len(QUESTIONS))]

rows = []
for cid in tqdm(company_corpus.keys(), desc="Building answers"):
    row = {COMPANY_ID_COL: cid}
    for q_idx, (answer_col, prov_col) in enumerate(zip(answer_cols, prov_cols)):
        best_text, best_score, _ = answer_for_company(cid, q_idx, top_k=TOP_K_BI_PER_COMPANY)
        # No chunk for this company -> empty answer and a missing score.
        row[answer_col] = best_text if best_text is not None else ""
        row[prov_col] = best_score
    rows.append(row)

# Explicit column order: ID, all answers, then all scores.
df_answers = pd.DataFrame(rows, columns=[COMPANY_ID_COL] + answer_cols + prov_cols)
display(df_answers.head())

# Save
df_answers.to_csv(OUTPUT_CSV, index=False)
print(f"Saved to: {OUTPUT_CSV.resolve()}")

# Parquet is best-effort (it needs an optional engine such as pyarrow);
# report success only when the file was actually written.
try:
    df_answers.to_parquet(OUTPUT_PARQUET, index=False)
except Exception as e:
    print("Skipped Parquet save:", e)
else:
    print(f"Saved to: {OUTPUT_PARQUET.resolve()}")