In [None]:
!pip -q install -U sentence-transformers faiss-cpu rapidfuzz


In [None]:
# =========================================
# 1) 기본 설정/유틸
# =========================================
import os, re, glob, json, ast, uuid, hashlib, shelve
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
import numpy as np, faiss, pandas as pd
from sentence_transformers import SentenceTransformer
from rapidfuzz import fuzz
from tqdm.auto import tqdm

os.environ["TOKENIZERS_PARALLELISM"] = "false"  # avoid excessive tokenizer parallelism (keeps speed/memory stable)

# Input data directory and output directory for the generated artifacts.
DATA_ROOT = "/content/data"
OUT_DIR   = "/content/artifacts"
os.makedirs(OUT_DIR, exist_ok=True)

# Sentence-embedding model name and the batch size used when encoding.
EMB_MODEL = "intfloat/multilingual-e5-small"
EMB_BATCH = 64  # 32-64 on CPU, 64-128 on GPU

def md5(s: str) -> str:
    """Return the hexadecimal MD5 digest of ``s`` encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()

@dataclass
class Doc:
    """One indexable text unit plus the metadata attached to it."""
    # Identifier of the unit; the loaders in this file pass a fresh UUID4 string.
    id: str
    # Name of the corpus the unit belongs to.
    corpus: str   # resume|question|jd|values
    # The text that gets embedded and searched.
    text: str
    # Free-form, corpus-specific attributes (e.g. user, company, section).
    meta: Dict[str, Any]

def read_text(p):
    """Return the full contents of the file at ``p``, decoded as UTF-8."""
    with open(p, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def literal_dict_from_txt(p):
    """Parse the file at ``p`` as a Python literal and return the resulting object.

    Parsing uses ``ast.literal_eval``, so only literal syntax is accepted.
    """
    source = read_text(p)
    return ast.literal_eval(source)

def sent_simplify(s):
    """Collapse every whitespace run in ``s`` into one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()

def split_bullets(text: str) -> List[str]:
    """Split ``text`` into cleaned, non-empty fragments.

    Fragments are separated by runs of bullet characters, hyphens or
    newlines, or by a comma followed by one whitespace character. Each
    fragment is whitespace-normalised; fragments that end up empty are dropped.
    """
    cleaned = (sent_simplify(piece) for piece in re.split(r"[•\-\u2022\n]+|,\s", text))
    return [piece for piece in cleaned if piece]

def find_values_path(root: str) -> Optional[str]:
    """Locate the corporate-values text file directly under ``root``.

    A file qualifies when its name ends with ``.txt`` and contains the
    keyword "인재상". Names are compared after Unicode NFC normalisation so
    that decomposed (NFD) file names, as produced by some file systems,
    match as well.

    Args:
        root: Directory whose immediate children are inspected.

    Returns:
        The shortest matching path (ties broken alphabetically so the result
        is deterministic), or ``None`` when nothing matches.
    """
    import unicodedata  # local import keeps this block self-contained

    keyword = unicodedata.normalize("NFC", "인재상")
    candidates = []
    for path in glob.glob(os.path.join(root, "*")):
        base = unicodedata.normalize("NFC", os.path.basename(path))
        if base.endswith(".txt") and keyword in base:
            candidates.append(path)
    if not candidates:
        return None
    return min(candidates, key=lambda path: (len(path), path))


In [None]:
# =========================================
# 2) 로더들 (자소서는 *_masked.txt 원문에서 유닛화)
# =========================================
# (section name, regex) pairs used to cut a resume into sections.
# Group 1 is the section header, group 2 the section body. The body ends
# where another section header starts at the beginning of a line, or at the
# end of the text. The alternatives in the lookahead are wrapped in (?:...)
# so that the leading "\n\s*" applies to every header; without the group it
# bound only to the first alternative and a body was cut at any mid-sentence
# mention of another header word.
SECTION_PATTERNS = [
    ("지원동기", r"(지원동기)(.*?)(?=\n\s*(?:본인의 강점|역량기술서|자기소개서)|$)"),
    ("본인의 강점", r"(본인의 강점[^\n]*)(.*?)(?=\n\s*(?:역량기술서|자기소개서|지원동기)|$)"),
    ("자기소개서", r"(자기소개서)(.*?)(?=\n\s*(?:역량기술서|지원동기|본인의 강점)|$)"),
    ("역량기술서", r"(역량기술서)(.*?)(?=\n\s*(?:자기소개서|지원동기|본인의 강점)|$)"),
]
# Splits on bullet characters / hyphens and on "1. "-style numbering at line start.
BULLET_SPLIT = re.compile(r"[•\-\u2022]|^\s*[0-9]+\.\s*", re.MULTILINE)
CHUNK_TRIG = 140   # cut a unit once the joined fragments reach this length
MIN_LEN    = 100   # minimum length of a final unit

def _normalize_txt(s):
    s=re.sub(r"\r","",s); s=re.sub(r"[ \t]+"," ",s); s=re.sub(r"\n{2,}","\n\n",s); return s.strip()

def _extract_sections(text):
    """Return ``(section name, body)`` pairs found in ``text``.

    Each pattern in ``SECTION_PATTERNS`` is searched once; a section is kept
    only when its normalised body is non-empty. When no section is kept, the
    whole normalised text is returned as a single "자기소개서" section.
    """
    sections = []
    for name, pat in SECTION_PATTERNS:
        match = re.search(pat, text, flags=re.DOTALL | re.IGNORECASE)
        if match is None:
            continue
        body = _normalize_txt(match.group(2))
        if body:
            sections.append((name, body))
    if sections:
        return sections
    return [("자기소개서", _normalize_txt(text))]

def _to_units(body: str) -> List[str]:
    """Chunk a section body into retrieval units.

    The body is split into paragraphs on blank lines and each paragraph into
    fragments with ``BULLET_SPLIT``. Fragments are accumulated until the
    space-joined text reaches ``CHUNK_TRIG`` characters, at which point a unit
    is emitted; leftovers at the end of a paragraph form one more unit. Units
    shorter than ``MIN_LEN`` are discarded and the rest are truncated to 800
    characters.
    """
    units = []
    for para in body.split("\n\n"):
        para = para.strip()
        if not para:
            continue
        pieces = [frag.strip() for frag in BULLET_SPLIT.split(para) if frag and frag.strip()]
        if not pieces:
            # Nothing survived the split: treat the paragraph as one fragment.
            pieces = [para]
        pending = []
        for piece in pieces:
            pending.append(piece)
            merged = " ".join(pending)
            if len(merged) >= CHUNK_TRIG:
                units.append(merged)
                pending = []
        if pending:
            units.append(" ".join(pending))
    # Drop units that are too short, then cap overly long ones.
    return [unit[:800] for unit in units if len(unit) >= MIN_LEN]

def load_resumes_raw(resume_dir: str) -> List[Doc]:
    """Load every ``*_masked.txt`` file in ``resume_dir`` as resume units.

    Each file is normalised, cut into sections and then into units; every unit
    becomes a ``Doc`` in the "resume" corpus whose metadata records the user
    tag (file name without the ``_masked.txt`` suffix), the type "raw" and the
    section name.
    """
    docs = []
    pattern = os.path.join(resume_dir, "*_masked.txt")
    for path in sorted(glob.glob(pattern)):
        user_tag = os.path.basename(path).replace("_masked.txt", "")
        text = _normalize_txt(read_text(path))
        sections = _extract_sections(text) or [("ALL", text)]
        for section, body in sections:
            docs.extend(
                Doc(str(uuid.uuid4()), "resume", sent_simplify(unit),
                    {"user": user_tag, "type": "raw", "section": section})
                for unit in _to_units(body)
            )
    return docs

def load_questions(q_dir: str) -> List[Doc]:
    """Load interview questions from every CSV file in ``q_dir``.

    The first column whose name contains "question" (case-insensitive) is
    used; files without such a column are skipped. The category is the file
    name with ``_unique_questions.csv`` / ``.csv`` removed. Empty questions
    are ignored.
    """
    docs = []
    for path in sorted(glob.glob(os.path.join(q_dir, "*.csv"))):
        frame = pd.read_csv(path)
        matching = [c for c in frame.columns if "question" in c.lower()]
        if not matching:
            continue
        category = os.path.basename(path)
        for suffix in ("_unique_questions.csv", ".csv"):
            category = category.replace(suffix, "")
        for raw in frame[matching[0]].fillna("").astype(str):
            question = sent_simplify(raw)
            if not question:
                continue
            docs.append(Doc(str(uuid.uuid4()), "question", question, {"category": category}))
    return docs

def load_values_safe(root: str) -> List[Doc]:
    """Build "values" documents from the corporate-values file under ``root``.

    The file is parsed as a Python literal mapping each company to a dict
    that may hold "키워드" (keywords) and "인재상" (slogan -> description).
    For every slogan two prompt-like sentences are generated and stored as
    ``Doc`` objects with company, slogan and keywords in the metadata.

    Args:
        root: Directory searched for the values file.

    Returns:
        The generated documents.

    Raises:
        FileNotFoundError: If no values file is found under ``root``.
    """
    p = find_values_path(root)
    if not p:
        # Report the directory that was actually searched, not a fixed path.
        raise FileNotFoundError(f"인재상.txt를 {root} 루트에 두세요.")
    d = literal_dict_from_txt(p)
    docs = []
    for company, blob in d.items():
        kws = blob.get("키워드", [])
        for slogan, desc in blob.get("인재상", {}).items():
            prompts = [
                f"{company}의 '{slogan}'을 보여준 사례를 말해달라: {desc}",
                f"{company} 가치 '{slogan}'에 부합하는 행동 기준을 충족한 경험(주도성/협업/문제해결 등)을 구체적으로 설명",
            ]
            for b in prompts:
                docs.append(Doc(str(uuid.uuid4()), "values", sent_simplify(b),
                                {"company": company, "slogan": slogan, "keywords": kws}))
    return docs

def load_jd(jd_dir: str) -> List[Doc]:
    """Load job-description lines from every ``*.txt`` file in ``jd_dir``.

    Each file is parsed as a Python literal mapping role -> spec. For the
    sections "담당 업무", "자격요건" and "우대사항" the text is split into
    bullet lines, and every line becomes a ``Doc`` in the "jd" corpus. The
    company hint is the file-name part before the first underscore
    (``Carrot_JD.txt`` -> ``Carrot``).
    """
    sections = ["담당 업무", "자격요건", "우대사항"]
    docs = []
    for path in sorted(glob.glob(os.path.join(jd_dir, "*.txt"))):
        company = os.path.basename(path).split("_")[0]
        roles = literal_dict_from_txt(path)
        for role, spec in roles.items():
            for section in sections:
                lines = split_bullets(spec.get(section, ""))
                docs.extend(
                    Doc(str(uuid.uuid4()), "jd", sent_simplify(line),
                        {"company_hint": company, "role": role, "section": section})
                    for line in lines
                    if line
                )
    return docs


In [None]:
# =========================================
# 3) VectorStore (임베딩 캐시 포함)
# =========================================
class VectorStore:
    """Embeds documents and keeps one FAISS inner-product index per corpus.

    Embeddings are cached on disk (one ``shelve`` file per corpus under
    ``OUT_DIR/emb_cache``) so unchanged texts are not re-encoded.
    """

    def __init__(self, model_name=EMB_MODEL, batch_size=EMB_BATCH):
        """Load the embedding model and prepare the cache directory.

        Args:
            model_name: SentenceTransformer model identifier.
            batch_size: Number of texts encoded per batch.
        """
        # Kept so cache keys can be tied to the model that produced the vectors.
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        self.batch = batch_size
        # A corpus built from zero documents is stored as None.
        self.indexes: Dict[str, Optional[faiss.Index]] = {}
        self.metadatas: Dict[str, List[Dict[str, Any]]] = {}
        self.cache_dir = os.path.join(OUT_DIR, "emb_cache")
        os.makedirs(self.cache_dir, exist_ok=True)

    def _cache_path(self, name):
        """Return the path of the shelve cache file for corpus ``name``."""
        return os.path.join(self.cache_dir, f"{name}.db")

    def _cache_key(self, text):
        """Return the cache key for ``text`` under the current model.

        The model name is part of the key so that switching models never
        reuses vectors produced by a different model.
        """
        return md5(f"{self.model_name}\n{text}")

    def _embed_cached(self, name: str, texts: List[str]) -> np.ndarray:
        """Return normalised float32 embeddings for ``texts``, one row per text.

        Vectors already present in the persistent cache of corpus ``name``
        are reused; the rest are encoded in batches and written back.

        Args:
            name: Corpus name, selects the cache file.
            texts: Texts to embed.

        Returns:
            Array of shape ``(len(texts), dim)``.
        """
        dim = self.model.get_sentence_embedding_dimension()
        vecs = np.zeros((len(texts), dim), dtype="float32")
        keys = [self._cache_key(t) for t in texts]
        # The context manager closes the shelf even if encoding raises.
        with shelve.open(self._cache_path(name)) as db:
            missing = []
            for i, key in enumerate(keys):
                if key in db:
                    vecs[i] = db[key]
                else:
                    missing.append(i)
            if missing:
                for start in tqdm(range(0, len(missing), self.batch), desc=f"embed[{name}]"):
                    chunk = missing[start:start + self.batch]
                    emb = self.model.encode([texts[i] for i in chunk], batch_size=self.batch,
                                            normalize_embeddings=True, show_progress_bar=False)
                    emb = np.asarray(emb, dtype="float32")
                    for i, v in zip(chunk, emb):
                        vecs[i] = v
                        db[keys[i]] = v
        return vecs

    def build(self, name: str, docs: List[Doc]):
        """Embed ``docs`` and store an inner-product index plus metadata under ``name``.

        With no documents, the index is recorded as ``None`` and the metadata
        as an empty list.
        """
        if not docs:
            print(f"[{name}] 0문서 → 건너뜀")
            self.indexes[name] = None
            self.metadatas[name] = []
            return
        texts = [d.text for d in docs]
        metas = [{"id": d.id, **d.meta, "corpus": d.corpus, "text": d.text} for d in docs]
        vecs = self._embed_cached(name, texts)
        index = faiss.IndexFlatIP(vecs.shape[1])
        index.add(vecs)
        self.indexes[name] = index
        self.metadatas[name] = metas
        print(f"[{name}] vectors:", index.ntotal)

    def save(self, name: str, out_dir=OUT_DIR):
        """Write ``<name>.faiss`` and ``<name>.meta.json`` into ``out_dir``.

        Nothing is written when the corpus has no index.
        """
        if self.indexes.get(name) is None:
            print(f"[{name}] 저장 스킵(인덱스 없음)")
            return
        # Allow saving into a directory that does not exist yet.
        os.makedirs(out_dir, exist_ok=True)
        faiss.write_index(self.indexes[name], os.path.join(out_dir, f"{name}.faiss"))
        with open(os.path.join(out_dir, f"{name}.meta.json"), "w", encoding="utf-8") as f:
            json.dump(self.metadatas[name], f, ensure_ascii=False, indent=2)


In [None]:
# =========================================
# 4) 데이터 로드 → 인덱스 빌드/저장
# =========================================
# Load each corpus from its location under DATA_ROOT.
resume_docs = load_resumes_raw(os.path.join(DATA_ROOT, "resumes"))
question_docs = load_questions(os.path.join(DATA_ROOT, "questions"))
values_docs = load_values_safe(DATA_ROOT)
jd_docs = load_jd(os.path.join(DATA_ROOT, "jd"))

# Corpus name -> documents, in the order they are built and saved.
corpora = {
    "resume": resume_docs,
    "question": question_docs,
    "values": values_docs,
    "jd": jd_docs,
}
print("counts:", *(len(corpus_docs) for corpus_docs in corpora.values()))

# Build every index first, then persist them all.
vs = VectorStore()
for n, corpus_docs in corpora.items():
    vs.build(n, corpus_docs)
for n in corpora:
    vs.save(n)
print("저장 완료 ->", OUT_DIR)


counts: 5 3708 116 5903


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

embed[resume]:   0%|          | 0/1 [00:00<?, ?it/s]

[resume] vectors: 5


embed[question]:   0%|          | 0/58 [00:00<?, ?it/s]

[question] vectors: 3708


embed[values]:   0%|          | 0/2 [00:00<?, ?it/s]

[values] vectors: 116


embed[jd]:   0%|          | 0/93 [00:00<?, ?it/s]

[jd] vectors: 5903
저장 완료 -> /content/artifacts


In [None]:
# =========================================
# 5) 검색 (리랭커 없이: 메타 필터 + 코사인 점수 임계치)
# =========================================
def _filter_ids(metas, filters):
    if not filters: return list(range(len(metas)))
    keep=[]
    for i,m in enumerate(metas):
        ok=True
        for k,v in filters.items():
            if k not in m or fuzz.partial_ratio(str(v), str(m.get(k,""))) < 80:
                ok=False; break
        if ok: keep.append(i)
    return keep

# Loaded embedding models, keyed by model name, so repeated searches do not
# reload the model from disk.
_MODEL_CACHE = {}


def _get_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it only once."""
    if model_name not in _MODEL_CACHE:
        _MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _MODEL_CACHE[model_name]


def _stored_vectors(idx, n_metas, dim):
    """Return all vectors stored in ``idx``, or None when they cannot be reused.

    Reuse requires the index to have one vector per metadata entry, the same
    dimensionality as the query embedding, and support for reconstruction.
    """
    if idx.ntotal != n_metas or idx.d != dim:
        return None
    try:
        return idx.reconstruct_n(0, idx.ntotal)
    except RuntimeError:
        # Index type without reconstruction support.
        return None


def search_simple(corpus_name, query, filters=None, topk=5, min_cos=0.0):
    """Search a saved corpus by cosine similarity with optional metadata filters.

    Loads ``<corpus_name>.faiss`` and ``<corpus_name>.meta.json`` from
    ``OUT_DIR``, keeps the entries that pass ``filters`` and ranks them by
    inner product against the normalised query embedding.

    Args:
        corpus_name: Name the corpus was saved under.
        query: Query text.
        filters: Optional mapping of metadata key -> wanted value (fuzzy match).
        topk: Maximum number of rows returned.
        min_cos: Rows scoring below this value are dropped.

    Returns:
        Metadata dicts (copies) with an added "score" key, best first; an
        empty list when the corpus files are missing, nothing passes the
        filters, or ``topk`` is not positive.
    """
    if topk <= 0:
        return []
    idx_path = os.path.join(OUT_DIR, f"{corpus_name}.faiss")
    meta_path = os.path.join(OUT_DIR, f"{corpus_name}.meta.json")
    if not (os.path.exists(idx_path) and os.path.exists(meta_path)):
        return []
    idx = faiss.read_index(idx_path)
    with open(meta_path, encoding="utf-8") as f:
        metas = json.load(f)
    if idx.ntotal == 0 or not metas:
        return []

    ids = _filter_ids(metas, filters)
    if not ids:
        return []
    sub = [metas[i] for i in ids]

    model = _get_model(EMB_MODEL)
    qv = np.asarray(model.encode([query], normalize_embeddings=True, show_progress_bar=False),
                    dtype="float32")

    # Prefer the vectors already stored in the index; re-encode the filtered
    # texts only when the stored vectors cannot be trusted to line up.
    stored = _stored_vectors(idx, len(metas), qv.shape[1])
    if stored is not None:
        sub_vecs = np.ascontiguousarray(stored[ids], dtype="float32")
    else:
        sub_vecs = np.asarray(
            model.encode([m["text"] for m in sub], normalize_embeddings=True,
                         batch_size=EMB_BATCH, show_progress_bar=False),
            dtype="float32",
        )
    sub_idx = faiss.IndexFlatIP(sub_vecs.shape[1])
    sub_idx.add(sub_vecs)

    # Fetch extra candidates before applying the score threshold.
    D, I = sub_idx.search(qv, min(topk * 5, len(sub)))
    rows = []
    for j, sc in zip(I[0], D[0]):
        if j == -1 or sc < min_cos:
            continue
        m = sub[int(j)].copy()
        m["score"] = float(sc)
        rows.append(m)
    rows.sort(key=lambda x: x["score"], reverse=True)
    return rows[:topk]

def pretty(rows, head):
    """Print a heading followed by one line per result row.

    Each line shows the score (0.0 when absent), the metadata tags present in
    the row joined by " | " (or "-" when that is empty), and the first 120
    characters of the text followed by "...".
    """
    print("\n▶", head)
    if not rows:
        print("- (결과 없음)")
        return
    tag_keys = ["company_hint", "role", "section", "category", "company", "slogan", "user", "type"]
    for row in rows:
        present = [str(row[key]) for key in tag_keys if key in row]
        tag = " | ".join(present) or "-"
        score = row.get("score", 0.0)
        snippet = row["text"][:120]
        print(f"- ({score:.3f}) [{tag}] {snippet}...")


In [None]:
# =========================================
# 6) 예시 쿼리
# =========================================
# (A) Job descriptions, restricted to the company hint "Carrot"
q1 = "광고 도메인에서 SQL과 대시보드로 지표 설계/분석했던 경험아 있습니다."
jd_rows = search_simple("jd", q1, filters={"company_hint": "Carrot"}, topk=5, min_cos=0.30)
pretty(jd_rows, "JD 매칭 (Carrot)")

# (B) Interview questions, restricted to the "ICT" category
q2 = "머신러닝 프로젝트에서 모델 선택과 하이퍼파라미터 튜닝 경험"
question_rows = search_simple("question", q2, filters={"category": "ICT"}, topk=5, min_cos=0.30)
pretty(question_rows, "질문 후보 (ICT)")

# (C) Corporate values, restricted to the company "당근마켓"
q3 = "주도적으로 개선안을 만들고 빠르게 실험한 경험"
values_rows = search_simple("values", q3, filters={"company": "당근마켓"}, topk=5, min_cos=0.30)
pretty(values_rows, "인재상 근거 (Carrot)")
