In [None]:
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Optional

import numpy as np
from sentence_transformers import SentenceTransformer
import torch

# Working directory that the relative "DATA/..." paths in this notebook are
# resolved against. Set the RAG_LAW_ROOT environment variable to point at the
# project on another machine; the original local path stays as the default.
PROJECT_ROOT = os.environ.get("RAG_LAW_ROOT", 'C:/Users/SAMSUNG/Desktop/Grad_School/RAG_LAW')
os.chdir(PROJECT_ROOT)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class LawEmbeddings:
    """Wrapper around a SentenceTransformer model for embedding law documents.

    The model is loaded lazily on first use. It is placed on CUDA when
    ``torch.cuda.is_available()`` is true, otherwise on the CPU.
    """

    def __init__(self, model_name : str = "BAAI/bge-m3", max_seq_length : int = 512):
        """Store the configuration; the model itself is not loaded here.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
            max_seq_length: Value assigned to the model's ``max_seq_length``
                attribute right after loading.
        """
        self.model_name = model_name
        self.max_seq_length = max_seq_length
        self.model = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def load_model(self):
        """Load the model once; later calls return immediately.

        Raises:
            Exception: Any error raised while constructing or configuring the
                model is printed and re-raised unchanged.
        """
        if self.model is not None:
            return
        try:
            model = SentenceTransformer(self.model_name, device = self.device)
            model.max_seq_length = self.max_seq_length
        except Exception as e:
            print(f"모델 로드 실패 : {e}")
            raise
        # Publish the model only after it is fully configured, so a failed
        # load never leaves a half-initialised model on the instance.
        self.model = model

    def create_embeddings(self, laws_parsed : List[Dict], batch_size : int = 128) -> np.ndarray:
        """Embed the ``embedding_text`` field of every document.

        Args:
            laws_parsed: Parsed documents. A document without an
                ``embedding_text`` key is embedded as the empty string.
            batch_size: Batch size forwarded to ``encode``.

        Returns:
            The array produced by ``encode`` with ``convert_to_numpy=True``
            and ``normalize_embeddings=True``.

        Raises:
            Exception: Any error raised by ``encode`` is printed and re-raised.
        """
        self.load_model()
        texts = [doc.get('embedding_text', "") for doc in laws_parsed]
        try:
            return self.model.encode(
                texts,
                batch_size = batch_size,
                show_progress_bar = True,
                convert_to_numpy = True,
                normalize_embeddings = True
            )
        except Exception as e:
            print(f"임베딩 실패 : {e}")
            raise

    def create_query_embedding(self, text : str) -> np.ndarray:
        """Embed a single query string.

        Args:
            text: Query text to embed.

        Returns:
            The first (and only) row of the array that ``encode`` returns for
            the one-element input ``[text]``.

        Raises:
            Exception: Any error raised by ``encode`` is printed and re-raised.
        """
        self.load_model()

        try:
            return self.model.encode(
                [text],
                convert_to_numpy = True,
                normalize_embeddings = True
            )[0]
        except Exception as e:
            print(f"쿼리 임베딩 실패 : {e}")
            raise

    def save_embeddings(self, embeddings : np.ndarray, filename : str):
        """Save the embeddings with ``np.save``.

        Args:
            embeddings: Array (or array-like) to store.
            filename: Destination passed to ``np.save``. When it is a path,
                missing parent directories are created first.
        """
        # Only path-like destinations have a parent directory to create;
        # anything else (e.g. an open file object) goes straight to np.save.
        if isinstance(filename, (str, os.PathLike)):
            Path(filename).parent.mkdir(parents = True, exist_ok = True)
        np.save(filename, embeddings)


In [None]:
# Kept for the .py version of this notebook.
if __name__ == "__main__":
    # Read the parsed documents, embed them, and save the float32 array.
    parsed_docs_json = Path("DATA/laws_parsed.json").read_text(encoding = 'utf-8')
    laws_parsed = json.loads(parsed_docs_json)
    law_emb = LawEmbeddings()
    laws_embedded = law_emb.create_embeddings(laws_parsed).astype(np.float32)
    law_emb.save_embeddings(laws_embedded, "DATA/laws_embedded.npy")

In [3]:
# Embed every parsed law document and store the result as float32.
law_emb = LawEmbeddings()
with open("DATA/laws_parsed.json", encoding = 'utf-8') as parsed_file:
    laws_parsed = json.load(parsed_file)
laws_embedded = law_emb.create_embeddings(laws_parsed).astype(np.float32)
np.save(file = "DATA/laws_embedded.npy", arr = laws_embedded)

Batches: 100%|██████████| 154/154 [1:35:08<00:00, 37.07s/it]


In [4]:
# Show the first document's source text, then the embedding array.
first_doc_text = laws_parsed[0]['embedding_text']
print("임베딩 전 : ", first_doc_text)
print("임베딩 후 : ", laws_embedded)

임베딩 전 :  개인정보 보호법 제1장 총칙 제1조(목적) 이 법은 개인정보의 처리 및 보호에 관한 사항을 정함으로써 개인의 자유와 권리를 보호하고, 나아가 개인의 존엄과 가치를 구현함을 목적으로 한다.
임베딩 후 :  [[ 0.01513317  0.00820284 -0.04576007 ...  0.03311426  0.02882182
  -0.03209966]
 [ 0.01860131  0.01382969 -0.05322434 ... -0.00870176  0.01544493
  -0.04907218]
 [ 0.01449217  0.0214253  -0.04383089 ... -0.01569305  0.01975073
  -0.03497569]
 ...
 [-0.0227614   0.05201114 -0.02038439 ...  0.00708049  0.01217214
   0.0032482 ]
 [ 0.00994824  0.02448714 -0.02236893 ...  0.03084159 -0.02370246
  -0.05230789]
 [ 0.0491998   0.04783437 -0.022372   ...  0.04004373 -0.01952593
   0.00499811]]


In [5]:
# Reload the saved embeddings from disk and display how many rows were stored.
embedded = np.load(file = "DATA/laws_embedded.npy")
embedded.shape[0]

19620

In [6]:
# Reload the parsed documents and display how many there are.
laws_parsed = json.loads(Path("DATA/laws_parsed.json").read_text(encoding = 'utf-8'))
len(laws_parsed)

19620