In [9]:
import os
import json
from pathlib import Path

# json 디렉토리 경로
json_dir = Path("/home/food/people/minju/make_data/final_data")

# 결과 저장할 리스트
merged_data = []

# 디렉토리 내 모든 .json 파일 읽기
for file in sorted(json_dir.glob("*.json")):
    with open(file, "r", encoding="utf-8-sig") as f:
        try:
            data = json.load(f)
            # JSON이 리스트일 경우
            if isinstance(data, list):
                merged_data.extend(data)
            # JSON이 객체일 경우
            elif isinstance(data, dict):
                merged_data.append(data)
        except Exception as e:
            print(f"⚠️ {file.name} 읽는 중 오류: {e}")

# -----------------------------
# ID 자동 재정렬
# -----------------------------
for new_id, item in enumerate(merged_data, start=1):
    item["id"] = new_id

# 합친 결과 저장
out_path = "merged.json"
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(merged_data, f, ensure_ascii=False, indent=2)

print(f"총 {len(merged_data)}개 항목을 {out_path} 파일에 저장 완료 ✅")


총 1000개 항목을 merged.json 파일에 저장 완료 ✅


In [7]:
import os
import pickle
import numpy as np
from langchain_teddynote.retrievers import KiwiBM25Retriever

# ==============================
# 1) 경로 설정
# ==============================
bm25_dir = "/home/food/people/minju/embedding/bm25"
labse_dir = "/home/food/people/subin/data/embeddings/LaBSE"

bm25_paths = sorted([os.path.join(bm25_dir, f) for f in os.listdir(bm25_dir) if f.endswith(".pkl")])
labse_paths = sorted([os.path.join(labse_dir, f) for f in os.listdir(labse_dir) if f.endswith(".pkl")])

print(f"[INFO] BM25 파일 수: {len(bm25_paths)}")
print(f"[INFO] LaBSE 파일 수: {len(labse_paths)}")

# ==============================
# 2) BM25 retriever 통합
# ==============================
all_docs = []
for path in bm25_paths:
    with open(path, "rb") as f:
        bm25_tmp = pickle.load(f)
    all_docs.extend(bm25_tmp.docs)  # ✅ docs 사용
    print(f"[INFO] {path} → {len(bm25_tmp.docs)}개 추가")

merged_bm25 = KiwiBM25Retriever.from_documents(all_docs)
merged_bm25.k = 10

with open(os.path.join(bm25_dir, "bm25_all.pkl"), "wb") as f:
    pickle.dump(merged_bm25, f)

print(f"[INFO] ✅ BM25 전체 통합 완료 → {len(all_docs)}개 문서")

# ==============================
# 3) LaBSE 임베딩 통합
# ==============================
all_embeddings = []
for path in labse_paths:
    with open(path, "rb") as f:
        emb = pickle.load(f)
    all_embeddings.append(emb)
    print(f"[INFO] {path} → {emb.shape} 추가")

# 전체 임베딩 합치기
merged_embeddings = np.vstack(all_embeddings)
print(f"[INFO] ✅ LaBSE 전체 통합 완료 → {merged_embeddings.shape}")

# 새 파일로 저장
with open(os.path.join(labse_dir, "embeddings_all.pkl"), "wb") as f:
    pickle.dump(merged_embeddings, f)

print("[INFO] ✅ BM25 + LaBSE 통합 저장 완료")


[INFO] BM25 파일 수: 11
[INFO] LaBSE 파일 수: 11
[INFO] /home/food/people/minju/embedding/bm25/bm25_2014.pkl → 15693개 추가
[INFO] /home/food/people/minju/embedding/bm25/bm25_2015.pkl → 16245개 추가
[INFO] /home/food/people/minju/embedding/bm25/bm25_2016.pkl → 17321개 추가
[INFO] /home/food/people/minju/embedding/bm25/bm25_2017.pkl → 18711개 추가
[INFO] /home/food/people/minju/embedding/bm25/bm25_2018.pkl → 18950개 추가
[INFO] /home/food/people/minju/embedding/bm25/bm25_2019.pkl → 19457개 추가
[INFO] /home/food/people/minju/embedding/bm25/bm25_2020.pkl → 19607개 추가
[INFO] /home/food/people/minju/embedding/bm25/bm25_2021.pkl → 23135개 추가
[INFO] /home/food/people/minju/embedding/bm25/bm25_2022.pkl → 22233개 추가
[INFO] /home/food/people/minju/embedding/bm25/bm25_2023.pkl → 20775개 추가
[INFO] /home/food/people/minju/embedding/bm25/bm25_2024.pkl → 23590개 추가
[INFO] ✅ BM25 전체 통합 완료 → 215717개 문서
[INFO] /home/food/people/subin/data/embeddings/LaBSE/embeddings_2014.pkl → (15693, 768) 추가
[INFO] /home/food/people/subin/data/em

In [13]:
import numpy as np
import pickle
import os

labse_dir = '/home/food/people/minju/embedding/labse2'

# 연도 기준으로 정렬된 경로 가져오기
labse_paths = sorted(
    [os.path.join(labse_dir, f) for f in os.listdir(labse_dir) if f.endswith(".pkl")],
    key=lambda x: int(os.path.basename(x).split("_")[1].split(".")[0])
)

all_embeddings = []

for path in labse_paths:
    with open(path, "rb") as f:
        emb = pickle.load(f)
    print(f"[INFO] {path} → {emb.shape} 추가")
    all_embeddings.append(emb)

# 통합 임베딩 생성
merged_embeddings = np.vstack(all_embeddings)

merged_path = os.path.join(labse_dir, "embeddings_all.pkl")
with open(merged_path, "wb") as f:
    pickle.dump(merged_embeddings, f)

print(f"\n✅ 전체 임베딩 통합 저장 완료 → {merged_path}")
print(f"[INFO] 전체 shape: {merged_embeddings.shape}")


[INFO] /home/food/people/minju/embedding/labse2/embeddings_2014.pkl → (15693, 768) 추가
[INFO] /home/food/people/minju/embedding/labse2/embeddings_2015.pkl → (16245, 768) 추가
[INFO] /home/food/people/minju/embedding/labse2/embeddings_2016.pkl → (17321, 768) 추가
[INFO] /home/food/people/minju/embedding/labse2/embeddings_2017.pkl → (18711, 768) 추가
[INFO] /home/food/people/minju/embedding/labse2/embeddings_2018.pkl → (18950, 768) 추가
[INFO] /home/food/people/minju/embedding/labse2/embeddings_2019.pkl → (19457, 768) 추가
[INFO] /home/food/people/minju/embedding/labse2/embeddings_2020.pkl → (19607, 768) 추가
[INFO] /home/food/people/minju/embedding/labse2/embeddings_2021.pkl → (23135, 768) 추가
[INFO] /home/food/people/minju/embedding/labse2/embeddings_2022.pkl → (22233, 768) 추가
[INFO] /home/food/people/minju/embedding/labse2/embeddings_2023.pkl → (20775, 768) 추가
[INFO] /home/food/people/minju/embedding/labse2/embeddings_2024.pkl → (23590, 768) 추가

✅ 전체 임베딩 통합 저장 완료 → /home/food/people/minju/embedding

In [29]:
labse_dir = '/home/food/people/minju/embedding/labse2'

merged_path = os.path.join(labse_dir, "embeddings_all.pkl")
with open(merged_path, "rb") as f:
    merged_embeddings = pickle.load(f)

faiss.normalize_L2(merged_embeddings)
index = faiss.IndexFlatIP(merged_embeddings.shape[1])
index.add(merged_embeddings)
faiss.write_index(index, os.path.join(labse_dir, "faiss_index_all.index"))

In [32]:
index = faiss.read_index(os.path.join(labse_dir, "faiss_index_all.index"))
# 예시: 첫 번째 쿼리로 검색
q = merged_embeddings[2015].reshape(1, -1)
D, I = index.search(q, 5)

print("Distances:", D)
print("Indices:", I)


Distances: [[1.         0.85433745 0.85085535 0.83823645 0.8341411 ]]
Indices: [[ 2015 27816  2028 63629 21948]]


In [28]:
q = EMBED_MODEL.encode([DATA[0]], convert_to_numpy=True, normalize_embeddings=True)
D, I = index.search(q, 5)
for rank, idx in enumerate(I[0]):
    print(f"Top {rank+1}: {DATA[idx]}")


NameError: name 'EMBED_MODEL' is not defined

In [17]:
import pickle
import numpy as np

# 병합된 임베딩 로드
with open("/home/food/people/minju/embedding/labse2/embeddings_all.pkl", "rb") as f:
    merged_embeddings = pickle.load(f)

# 예시로 2014 첫 번째 문서 임베딩 로드
with open("/home/food/people/minju/embedding/labse2/embeddings_2024.pkl", "rb") as f:
    emb_2014 = pickle.load(f)

# 2014 첫 번째 벡터 vs 병합 벡터 비교
print(np.allclose(emb_2014[0], merged_embeddings[0]))  # True면 정상, False면 순서 꼬임


False


In [19]:
import pickle

labse_dir = "/home/food/people/minju/embedding/labse2"
labse_paths = sorted(
    [os.path.join(labse_dir, f) for f in os.listdir(labse_dir) if f.endswith(".pkl")],
    key=lambda x: int(os.path.basename(x).split("_")[1].split(".")[0])
)

for path in labse_paths:
    with open(path, "rb") as f:
        emb = pickle.load(f)
    print(path, emb.shape)


/home/food/people/minju/embedding/labse2/embeddings_2014.pkl (15693, 768)
/home/food/people/minju/embedding/labse2/embeddings_2015.pkl (16245, 768)
/home/food/people/minju/embedding/labse2/embeddings_2016.pkl (17321, 768)
/home/food/people/minju/embedding/labse2/embeddings_2017.pkl (18711, 768)
/home/food/people/minju/embedding/labse2/embeddings_2018.pkl (18950, 768)
/home/food/people/minju/embedding/labse2/embeddings_2019.pkl (19457, 768)
/home/food/people/minju/embedding/labse2/embeddings_2020.pkl (19607, 768)
/home/food/people/minju/embedding/labse2/embeddings_2021.pkl (23135, 768)
/home/food/people/minju/embedding/labse2/embeddings_2022.pkl (22233, 768)
/home/food/people/minju/embedding/labse2/embeddings_2023.pkl (20775, 768)
/home/food/people/minju/embedding/labse2/embeddings_2024.pkl (23590, 768)


In [22]:
import pickle

with open("/home/food/people/minju/embedding/labse2/embeddings_all.pkl", "rb") as f:
    emb = pickle.load(f)

# 2014 마지막 인덱스 ~ 2015 첫 번째 인덱스 비교
end_2014 = 15693
print("2014 끝 shape:", emb[end_2014 - 1][:5])
print("2015 시작 shape:", emb[end_2014][:5])


2014 끝 shape: [-0.06868254  0.00044297 -0.01582774 -0.02633454 -0.02802047]
2015 시작 shape: [-0.05458421  0.01762388  0.03520567 -0.0621579  -0.02756296]


In [23]:
import faiss
import numpy as np
import pickle

# 1) 임베딩 로드
with open("/home/food/people/minju/embedding/labse2/embeddings_all.pkl", "rb") as f:
    emb = pickle.load(f).astype(np.float32)

print("[INFO] embeddings shape:", emb.shape)  # (215717, 768)

# 2) FAISS 인덱스 생성 (L2 거리 or Inner Product)
d = emb.shape[1]
index = faiss.IndexFlatIP(d)  # Inner Product, 코사인 유사도 원하면 normalize 필요

# 3) 임베딩 추가
index.add(emb)
print("[INFO] FAISS index size:", index.ntotal)  # 215717 나와야 정상

# 4) 저장
faiss.write_index(index, "/home/food/people/minju/embedding/labse2/faiss_labse.index")
print("✅ FAISS 인덱스 저장 완료!")


[INFO] embeddings shape: (215717, 768)
[INFO] FAISS index size: 215717
✅ FAISS 인덱스 저장 완료!


In [25]:
# 인덱스 로드
index = faiss.read_index("/home/food/people/minju/embedding/labse2/faiss_labse.index")

# 특정 인덱스 검색
query_idx = 15693  # 2015 첫 번째 벡터
query_vec = emb[query_idx].reshape(1, -1)

D, I = index.search(query_vec, k=5)
print(f"Query idx={query_idx}")
print("Top-5 indices:", I)
print("Top-5 scores:", D)

# 매핑 체크
if query_idx in I[0]:
    print("✅ 매핑 정상")
else:
    print("❌ 매핑 깨짐")


Query idx=15693
Top-5 indices: [[ 15693  80029 161449  78890  24565]]
Top-5 scores: [[1.         0.71319234 0.7081472  0.70359325 0.7004778 ]]
✅ 매핑 정상


In [26]:
query_idx = 16000  # 2015 초반 샘플
D, I = index.search(embeddings[query_idx:query_idx+1], k=5)
print("Query index:", query_idx)
print("Retrieved indices:", I)
print("Distances:", D)


Query index: 16000
Retrieved indices: [[16000  2771 30559  3895  7390]]
Distances: [[0.99999994 0.8161235  0.8136665  0.81041515 0.80499923]]
