## KoBERT를 이용하여 판결문 데이터를 벡터 임베딩으로 변환

### Train, Validation

In [2]:
# KoBERT Github 이용하여 텍스트 데이터 임베딩 생성
import json
import tqdm
import torch
import pickle

from transformers import AutoTokenizer, AutoModel

# Load the train/validation data. The context manager closes the file handle,
# which a bare open() passed straight to json.load() would leave open.
with open('data/trainval.json', 'r', encoding='utf-8') as trainval_file:
    data_dict = json.load(trainval_file)

# Result container: one list of per-sample records for each split name.
emb_dict = {}

# Load the KoBERT tokenizer and encoder.
model_name = 'monologg/kobert'
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

# Use the first CUDA device when one is available; otherwise fall back to the
# CPU so the cell still runs on a machine without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
model.eval()


def embed_text(text):
    """Embed a single string with KoBERT.

    The text is tokenized as a batch of one (with truncation enabled), run
    through ``model`` with gradient tracking disabled, and the pooled output
    of that single item is returned.

    Args:
        text: The string to embed.

    Returns:
        ``pooler_output[0]`` of the model, moved to the CPU and converted to
        a NumPy array.
    """
    encoded_input = tokenizer([text], padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        model_output = model(**encoded_input)
        # Alternative pooling: model_output.last_hidden_state.mean(dim=1)
        return model_output.pooler_output[0].cpu().detach().numpy()


for domain in ['train', 'validation']:
    emb_dict[domain] = []

    for data in tqdm.tqdm(data_dict[domain]):
        first_party = data['The first party']
        second_party = data['The second party']
        # Newlines in the facts text are flattened to spaces before embedding.
        facts = data['facts'].replace('\n', ' ')

        emb_dict[domain].append(
            {
                'first_party': embed_text(first_party),
                'first_party_name': first_party,

                'second_party': embed_text(second_party),
                'second_party_name': second_party,

                'facts': embed_text(facts),
                'output': data['output']
            }
        )

# Save the embeddings once, after both splits are complete. Writing inside the
# split loop would first produce a file that contains only the 'train' key.
with open('embeddings/trainval_KoBERT.pkl', 'wb') as embeddings_file:
    pickle.dump(emb_dict, embeddings_file)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
100%|███████████████████████████████████████████████████████████████████████████████████████████| 2760/2760 [01:38<00:00, 27.90it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 1379/1379 [00:50<00:00, 27.28it/s]


### Test

In [3]:
# Load the test data. The context manager closes the file handle, which a bare
# open() passed straight to json.load() would leave open.
with open('./data/test.json', 'r', encoding='utf-8') as test_file:
    data_dict = json.load(test_file)

# NOTE: despite its name, emb_dict is a list in this cell (one record per test
# sample). The name is kept so later cells that read it keep working.
emb_dict = []

# tokenizer, model and device are the objects created in the Train/Validation
# cell; that cell must have been run first.
for data in tqdm.tqdm(data_dict):
    first_party = data['The first party']
    second_party = data['The second party']
    # Newlines in the facts text are flattened to spaces before embedding.
    facts = data['facts'].replace('\n', ' ')

    embeddings = []

    for input_data in [first_party, second_party, facts]:
        # Each text is encoded as a batch of one, with truncation enabled.
        encoded_input = tokenizer([input_data], padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            model_output = model(**encoded_input)
            # Alternative pooling: model_output.last_hidden_state.mean(dim=1)
            embedding = model_output.pooler_output[0].cpu().detach().numpy()

        embeddings.append(embedding)

    emb_dict.append(
        {
            'test_id': data['test_id'],

            'first_party': embeddings[0],
            'first_party_name': first_party,

            'second_party': embeddings[1],
            'second_party_name': second_party,

            'facts': embeddings[2],
            'output': data['output']
        }
    )

# Save the test embeddings; the context manager flushes and closes the file.
with open('./embeddings/test_KoBERT.pkl', 'wb') as embeddings_file:
    pickle.dump(emb_dict, embeddings_file)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 1035/1035 [00:35<00:00, 28.95it/s]
