In [57]:
import pandas as pd
import sys
import numpy as np
import os

sys.path.append('../../system/')
# from parser import run_parser, convert_pdf_to_jpg     #for image preprocess
from langchain.text_splitter import RecursiveCharacterTextSplitter
from get_similarity.utils.preprocess import preprocess
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain_chroma import Chroma
from configs import JD_PATH, COLLECTION, DB_PATH

from insert_chunks import *
from tqdm import tqdm
from uuid import uuid4

import pickle
from langchain_community.retrievers import BM25Retriever
from pathlib import Path
from nltk.tokenize import word_tokenize


from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

# 파인콘 키 설정
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Preprocess to make chunk

In [2]:
def preprocess(df):
    df.dropna(subset=['description', 'is_remote'], inplace=True)
    df = df.reset_index(drop=True)
    return df

In [3]:
JD_PATH = "updated_jd"
jd_folder = JD_PATH
full_paths = []
for jd_path in os.listdir(jd_folder):
    full_paths.append(os.path.join(jd_folder, jd_path))

### vectorDB에 들어갈 파일들
for path in full_paths:
    print(path)

updated_jd/USA_Back-End_jobs_total_filtered.csv
updated_jd/Germany_marketing_jobs_total_filtered.csv
updated_jd/Germany_Back-End_jobs_total_filtered.csv
updated_jd/UK_Machine Learning_jobs_total_filtered.csv
updated_jd/UK_mechanical engineer_jobs_total_filtered.csv
updated_jd/USA_mechanical engineer_jobs_total_filtered.csv
updated_jd/Germany_Machine Learning_jobs_total_filtered.csv
updated_jd/Germany_Front-End_jobs_total_filtered.csv
updated_jd/Germany_mechanical engineer_jobs_total_filtered.csv
updated_jd/UK_marketing_jobs_total_filtered.csv
updated_jd/UK_Front-End_jobs_total_filtered.csv
updated_jd/USA_marketing_jobs_total_filtered.csv
updated_jd/USA_Machine Learning_jobs_total_filtered.csv
updated_jd/USA_Front-End_jobs_total_filtered.csv
updated_jd/UK_Back-End_jobs_total_filtered.csv


In [4]:
all_dfs = []

for path in full_paths:
    df = pd.read_csv(path)
    df["location"] = path.split("/")[-1].split("_")[0]
    # print(df.shape)
    all_dfs.append(df)
# 하나의 DataFrame으로 병합
merged_df = pd.concat(all_dfs, ignore_index=True)
# 확인
print(merged_df.shape)

(1184, 8)


In [5]:
#description 기준 중복값 제거
print(len(merged_df["description"].unique()))
merged_df_dedup = merged_df.drop_duplicates(subset="description")
# 중복 제거 후 행 수 출력
print(f"중복 제거 후 description 개수: {len(merged_df_dedup)}")

704
중복 제거 후 description 개수: 704


In [6]:
def classify_jobpype(df):
    remove_index = []
    for idx, data in df.iterrows():
        job_type = data["job_type"]
        if "fulltime" in job_type:
            label="fulltime"
        elif "parttime" in job_type:
            label="parttime"
        elif "contract" in job_type:
            label="fulltime"
        elif "internship" in job_type:
            label="fulltime"
        else:
            remove_index.append(idx)

        df.at[idx, "job_type"] = label

    for index in remove_index:
        print(f"Removing index: {index}")
        df.drop(index, inplace=True)
        
    return df

In [7]:
merged_df_dedup = classify_jobpype(merged_df_dedup)
print(len(merged_df_dedup))
max_len = 10000
merged_df_dedup = merged_df_dedup[merged_df_dedup['description'].apply(lambda x: len(x) <= max_len if isinstance(x, str) else True)]
print(len(merged_df_dedup))

Removing index: 566
703
691


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(index, inplace=True)


In [8]:
print(merged_df_dedup["location"].unique())
print(merged_df_dedup["is_remote"].unique())
print(merged_df_dedup["job_type"].unique())

['USA' 'Germany' 'UK']
[ True False]
['fulltime' 'parttime']


In [9]:
final_df = preprocess(merged_df_dedup)

# Make vectorDB

In [12]:
def set_splitter(emb_model):
    """splitter를 셋업하는 함수입니다."""
    ### 아래 splitter를 사용해 기존 phase1의 vectorDB와 동일한 전처리 가능
    # text_splitter = SemanticChunker(emb_model, breakpoint_threshold_type="percentile", breakpoint_threshold_amount=90)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100, separators=["#","##,""###","####","**","---","\r\n","\n\n", "\n", "\t", " ", ""])
    return text_splitter

In [14]:
def get_chunks(df, text_splitter):
    total_chunks = []
    for i, desciption in enumerate(df["description"]):
        meta_data = [df.iloc[i].to_dict()]
        unique_id = str(uuid4())    #랜덤 id 생성
        ## RRF를 위해 id를 메타데이터로 추가
        ## 한 문서에 여러 개의 chunk가 생기기 때문에 vectorDB의 id로 추가하면 중복되서 사라진다
        meta_data[0]["id"] = unique_id

        chunks = text_splitter.create_documents([desciption], meta_data)
        total_chunks.extend(chunks)
    return total_chunks

In [15]:
emb_model = load_emb_model()
total_chunks = get_chunks(final_df, set_splitter(emb_model))   # semantic chunking, langchain document list 반환

In [16]:
len(total_chunks)

5174

In [65]:
print(total_chunks[0].metadata["description"])

Full Stack Developer, Senior**The Opportunity:**


As a full stack developer, you can resolve a problem with a complete end\\-to\\-end solution in a fast, agile environment. If you’re looking for the chance to not just develop software, but to help create a system that will make a difference, we need you on our team. We’re looking for a developer like you with the skills needed to develop software and systems from vision to production ready.


This role is more than just coding. As a full stack developer at Booz Allen, you’ll use your passion to learn new tools and techniques and identify needed system improvements. You’ll help clients overcome their most difficult challenges using the latest architectural approaches, tools, and technologies. You’ll help make sure the solution developed by the team considers the current architecture and operating environment, as well as future functionality and enhancements.


You will develop codes, test, and debug new software or enhancements to exis

In [17]:
for t in total_chunks[:5]:
    print(t.page_content[:50])
    print("==="*20)

Full Stack Developer, Senior**The Opportunity:
**


As a full stack developer, you can resolve a 
You will develop codes, test, and debug new softwa
**You Have:**

* 10\\+ years of experience as a ba
**Nice If You Have:**

* Experience with developme


In [52]:
#만약 인덱
[i["name"] for i in pc.list_indexes()]

['quickstart', 'temp', 'previous-chunking', 'jd-dataset']

In [55]:
index_name = "temp"       #DB의 collection명으로 보면 됨
# index_name = "jd-dataset"       #DB의 collection명으로 보면 됨

# index = pc.Index(index_name)

In [56]:
if pc.has_index(index_name):
    print("인덱스가 이미 존재합니다")
    ## 데이터가 남아있을때 데이터 제거
    index = pc.Index(index_name)
    if len(index.describe_index_stats()["namespaces"]) > 0:
        index.delete(delete_all=True, namespace="")
else:
    ## openAI의 embedding dimension과 동일
    ## dimension은 embedding model을 변경한다면 설정하기
    pc.create_index(index_name, dimension=1536, metric="cosine",
        spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) ) #서버리스 인덱스 생성, 무료는 us-east-1만 가능

인덱스가 이미 존재합니다


In [None]:
vector_store = PineconeVectorStore(index=index, embedding=emb_model)

vector_store.add_documents(documents=total_chunks)
print("Pinecone DB 세팅 완료")

In [None]:
#### 같은 index를 공유하기 때문에 반드시 Pinecone 세팅이후 바로 돌려야함
# bm25retriever = BM25Retriever.from_documents(total_chunks, preprocess_func=clean_tokens, k=10)
bm25retriever = BM25Retriever.from_documents(total_chunks, preprocess_func=word_tokenize, k=10)

SAVE_PATH = Path("bm25_retriever_final.pkl")
with SAVE_PATH.open("wb") as f:
    pickle.dump(bm25retriever, f)
print("BM25 retriever 세팅 완료")

BM25 retriever 세팅 완료


In [162]:
bm25retriever.get_relevant_documents("data engineer") # query에 대한 top-k 문서 반환

  bm25retriever.get_relevant_documents("data engineer") # query에 대한 top-k 문서 반환


[Document(metadata={'job_url': 'https://www.indeed.com/viewjob?jk=75e354ef2304575e', 'title': 'Data Engineer', 'company': 'Booz Allen Hamilton', 'location': 'USA', 'date_posted': '2025-05-08', 'job_type': 'fulltime', 'is_remote': True, 'description': "Data Engineer**The Opportunity:**\n\n\nEver\\\\-expanding technology like IoT, machine learning, and artificial intelligence means that there’s more structured and unstructured data available today than ever before. As a data engineer, you know that organizing big data can yield pivotal insights when it’s gathered from disparate sources. We need an experienced data engineer like you to help our clients find answers in their big data to impact important missions—from fraud detection to cancer research to national intelligence.\n\n  \n\nAs a data engineer at Booz Allen, you’ll implement data engineering activities on some of the most mission\\\\-driven projects in the industry. You’ll deploy and develop pipelines and platforms that organize

# 메타데이터 필터링

In [75]:
vector_store.similarity_search("I want to find a job as a data scientist", k=5, namespace="")    #유사도 검색
# 중복 제거 한번 하고 들어가야할듯

[Document(id='1b4af814-611c-4b62-a79a-21c15f8a75f1', metadata={'company': 'Booz Allen Hamilton', 'date_posted': '2025-05-08', 'description': 'Data Scientist**The Opportunity:**\n\nAs a data scientist, you’re excited at the prospect of unlocking the secrets held by a data set, and you’re fascinated by the possibilities presented by IoT, machine learning, and artificial intelligence. In an increasingly connected world, massive amounts of structured and unstructured data open new opportunities. As a data scientist at Booz Allen, you can help turn these complex data sets into useful information to solve global challenges. Across private and public sectors from fraud detection to cancer research, to national intelligence, we need you to help find the answers in the data.\n\n\nOn our team, you’ll use your leadership skills and data science expertise to create real\\\\-world impact. You’ll work closely with clients to understand their questions and needs, and then dig into their data\\\\-rich

In [76]:
vector_store.similarity_search(
    "I want to find a job as a data scientist",
    k=5,
    filter={"job_type": "temp"},
)

[]

In [89]:
vector_store.similarity_search(
    "I want to find a job as a data scientist",
    k=10,
    filter={"job_type": "fulltime",
            "is_remote":True,
            "location": "USA"},
)

[Document(id='1b4af814-611c-4b62-a79a-21c15f8a75f1', metadata={'company': 'Booz Allen Hamilton', 'date_posted': '2025-05-08', 'description': 'Data Scientist**The Opportunity:**\n\nAs a data scientist, you’re excited at the prospect of unlocking the secrets held by a data set, and you’re fascinated by the possibilities presented by IoT, machine learning, and artificial intelligence. In an increasingly connected world, massive amounts of structured and unstructured data open new opportunities. As a data scientist at Booz Allen, you can help turn these complex data sets into useful information to solve global challenges. Across private and public sectors from fraud detection to cancer research, to national intelligence, we need you to help find the answers in the data.\n\n\nOn our team, you’ll use your leadership skills and data science expertise to create real\\\\-world impact. You’ll work closely with clients to understand their questions and needs, and then dig into their data\\\\-rich

In [90]:
vector_store.similarity_search(
    "I want to find a job as a data scientist",
    k=10,
    filter={},
)

[Document(id='1b4af814-611c-4b62-a79a-21c15f8a75f1', metadata={'company': 'Booz Allen Hamilton', 'date_posted': '2025-05-08', 'description': 'Data Scientist**The Opportunity:**\n\nAs a data scientist, you’re excited at the prospect of unlocking the secrets held by a data set, and you’re fascinated by the possibilities presented by IoT, machine learning, and artificial intelligence. In an increasingly connected world, massive amounts of structured and unstructured data open new opportunities. As a data scientist at Booz Allen, you can help turn these complex data sets into useful information to solve global challenges. Across private and public sectors from fraud detection to cancer research, to national intelligence, we need you to help find the answers in the data.\n\n\nOn our team, you’ll use your leadership skills and data science expertise to create real\\\\-world impact. You’ll work closely with clients to understand their questions and needs, and then dig into their data\\\\-rich

In [82]:
retriever.invoke("temp")

[Document(id='b0d9cd9b-dd12-4d8a-bf28-666fe089094a', metadata={'company': 'Stellar Trust', 'date_posted': '2025-04-23', 'description': 'Join Stellar, as we are building the first 360 degree trust platform and embedded insurance product for the short\\\\-term rental industry.\n  \n  \n\nFounded by serial entrepreneur and investor Philipp Reuter (Co\\\\-Founder of STR management software Smoobu) and Milan Plogsties (ex Project Leader at BCG and MD at HomeToGo), we are solving one of the most pressing problems in the industry: building trust between guests and hosts. With over 1 billion nights booked in short\\\\-term rentals in the EU alone, the opportunity is huge and we have everything we need to succeed: financial backing from some of the most experienced founders \\\\& investors in travel tech and the short\\\\-term rental industry, a great team and a number of pilot customers.\n  \n  \n\nHaving made significant progress in developing the product with our pilot customers and freelanc

In [None]:
retriever = vector_store.as_retriever(embedding_function=emb_model, search_kwargs={"k": 3,
                                      "filter":{"job_type": "fulltime",
            "is_remote":True,
            "location": "USA"} })  # top-3 문서 검색  # 임베딩 모델 명시적 지정


In [84]:
retriever.invoke("I want to find a job as a data scientist",)

[Document(id='1b4af814-611c-4b62-a79a-21c15f8a75f1', metadata={'company': 'Booz Allen Hamilton', 'date_posted': '2025-05-08', 'description': 'Data Scientist**The Opportunity:**\n\nAs a data scientist, you’re excited at the prospect of unlocking the secrets held by a data set, and you’re fascinated by the possibilities presented by IoT, machine learning, and artificial intelligence. In an increasingly connected world, massive amounts of structured and unstructured data open new opportunities. As a data scientist at Booz Allen, you can help turn these complex data sets into useful information to solve global challenges. Across private and public sectors from fraud detection to cancer research, to national intelligence, we need you to help find the answers in the data.\n\n\nOn our team, you’ll use your leadership skills and data science expertise to create real\\\\-world impact. You’ll work closely with clients to understand their questions and needs, and then dig into their data\\\\-rich

In [78]:
print(merged_df_dedup["location"].unique())
print(merged_df_dedup["is_remote"].unique())
print(merged_df_dedup["job_type"].unique())

['USA' 'Germany' 'UK']
[ True False]
['fulltime' 'parttime']


## BM25 test

In [10]:
from langchain_community.retrievers import BM25Retriever

In [11]:
import string

def clean_tokens(text: str):
    """공백 기준 토큰화 후 특수문자 제거·소문자 변환"""
    tokens = text.split()                       # ① 공백 기준 분리
    cleaned = []
    for tok in tokens:
        # ② 토큰 앞뒤 특수문자 제거  ( ###Job**  →  Job )
        tok = tok.strip(string.punctuation)
        # ③ 소문자 변환
        tok = tok.lower()
        # ④ 빈 토큰·순수 특수문자 토큰은 건너뛰기
        if tok and not all(ch in string.punctuation for ch in tok):
            cleaned.append(tok)
    return cleaned

# sample = """##### **Job Type: Contract** ##### **Job Category: IT** #### **Job Description**
# Job Title: Azure GenAI Engineer ..."""
# print(clean_tokens(sample)[:40])   # 앞 40개 토큰만 확인

In [12]:
#기본 tokenizer는 기본 sentence.split()으로 되어있음, markdown 불용어 처리만 더하여 토크나이징
retriever = BM25Retriever.from_documents(total_chunks, preprocess_func=clean_tokens)

# BM25 저장

In [118]:
import string
from langchain_community.retrievers import BM25Retriever

def clean_tokens(text: str):
    """공백 기준 토큰화 후 특수문자 제거·소문자 변환"""
    tokens = text.split()                       # ① 공백 기준 분리
    cleaned = []
    for tok in tokens:
        # ② 토큰 앞뒤 특수문자 제거  ( ###Job**  →  Job )
        tok = tok.strip(string.punctuation)
        # ③ 소문자 변환
        tok = tok.lower()
        # ④ 빈 토큰·순수 특수문자 토큰은 건너뛰기
        if tok and not all(ch in string.punctuation for ch in tok):
            cleaned.append(tok)
    return cleaned


lexical_db = BM25Retriever.from_documents(total_chunks, preprocess_func=clean_tokens, k=10)

In [121]:
import pickle
from pathlib import Path

SAVE_PATH = Path("bm25_retriever.pkl")

# ── 저장 ─────────────────────────────────────────────
with SAVE_PATH.open("wb") as f:
    pickle.dump(lexical_db, f)

# ── 로드 ─────────────────────────────────────────────
with SAVE_PATH.open("rb") as f:
    lexical_db_loaded = pickle.load(f)

# 확인
docs = lexical_db_loaded.get_relevant_documents("quick test query")
print(docs[:3])


[Document(metadata={'job_url': 'https://uk.indeed.com/viewjob?jk=31cba2efecd20416', 'title': 'React Developer - Front End', 'company': 'Xelix', 'location': 'London, ENG, GB', 'date_posted': '2025-05-01', 'job_type': 'fulltime', 'is_remote': False, 'description': "**About us**\n\n\nWe’re Xelix, an AI\\\\-powered Control Centre for Accounts Payable teams. We work with some of the largest global companies to automate and enhance their financial control processes. At the heart of our product, we leverage machine learning techniques developed by our AI Engineering team to provide a more sophisticated offering than existing solutions.\n\n\nThings are going really well for us \\\\- we raised our Series A funding round from top investors, we’ve grown our team to almost 100, and we’ve won industry awards for our products.\n\n**About the role**\n\n\nWe’re looking for an experienced and motivated Senior React Developer to join our growing development team. In this role, you will implement user in

In [119]:
lexical_db.get_relevant_documents("temp")

  lexical_db.get_relevant_documents("temp")


[Document(metadata={'job_url': 'https://www.linkedin.com/jobs/view/4224181634', 'title': 'Web Developer', 'company': 'Synergy ECP', 'location': 'Annapolis Junction, MD', 'date_posted': '2025-04-11', 'job_type': 'fulltime', 'is_remote': False, 'description': "SoftTech Solutions, LLC., a wholly owned subsidiary of Synergy ECP, is a custom software development firm that has specialized in serving Department of Defense customers for more than 15 years. Our highly skilled staff of Software Engineers provide expertise that strengthens and advances our clients projects, ensuring the delivery of solid, successful results. Well\\\\-versed in the latest technologies and knowledgeable in a broad spectrum of skills and programming languages, our engineers and computer sciences professionals offer scalable technology solutions for enterprises of all sizes.\n   \n\n  \n\n We are looking for a\n **Web Developer Level 1** \n with experience with:\n   \n\n  \n\n**User Experience (UX) Design**\n Adobe I