## Import

In [20]:
import os
import re
import json
import pandas as pd
import numpy as np

import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import scipy.spatial

## Dataset & Embedding

In [None]:
# Load the tokenizer and the encoder weights from the HuggingFace Hub.
# Both are fetched from the same repository id, and are bound to the
# notebook-level names `tokenizer` and `model` used by the cells below.
tokenizer = AutoTokenizer.from_pretrained('Chaeyeon/SBERT-added-patent-vocab')
model = AutoModel.from_pretrained('Chaeyeon/SBERT-added-patent-vocab')

In [None]:
# Load an alternative tokenizer/encoder pair from the HuggingFace Hub.
# This rebinds the notebook-level names `tokenizer` and `model`, so whichever
# loading cell is executed last determines what the cells below use.
# NOTE(review): the tokenizer and the model come from two different repository
# ids — presumably their vocabularies match; confirm before relying on it.
tokenizer = AutoTokenizer.from_pretrained('zzzzzioni/added-patent-word-recog-tokenizer')
model = AutoModel.from_pretrained('zzzzzioni/added-patent-tokens-KR-SBERT-V40K-klueNLI-augSTS')

In [None]:
# GPU setup: use CUDA when it is available, otherwise fall back to the CPU.
# `device` is a notebook-level global; embed_cleaned_summary reads it to move
# the tokenized inputs onto the same device as the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f'Using device: {device}')

Using device: cuda


In [None]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked tokens.

    Args:
        model_output: Indexable model result whose element 0 holds the
            per-token embeddings.
        attention_mask: Mask tensor with one entry per token; it is expanded
            along a new trailing axis to the shape of the token embeddings
            and used as the averaging weight.

    Returns:
        Tensor holding, for each sequence, the mask-weighted mean of its
        token embeddings (dimension 1 is reduced away).
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the embedding dimension so it can weight
    # every component of every token vector.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(dim=1)
    # Clamp the denominator so a fully masked sequence does not divide by zero.
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts

In [None]:
def check_if_processed(file_path):
    """Return whether ``file_path`` names an already-processed file.

    A file counts as processed when its path ends with '_new.csv'.
    """
    processed_suffix = '_new.csv'
    is_processed = file_path.endswith(processed_suffix)
    return is_processed

In [None]:
from tqdm.auto import tqdm

def embed_cleaned_summary(df, model, tokenizer):
    """Embed each row's ``cleaned_summary`` text into a ``new_embeddings`` column.

    Every usable summary is stripped, lower-cased, tokenized (truncated to
    512 tokens), run through ``model`` without gradient tracking and
    mean-pooled into one vector, which is stored as a plain Python list.
    Rows without a usable summary get ``None``.

    Args:
        df: DataFrame with a ``cleaned_summary`` column. It is modified in
            place: a ``new_embeddings`` column is added or overwritten.
        model: Encoder called as ``model(**encoded_input)``.
        tokenizer: Callable producing the encoded inputs for ``model``.

    Returns:
        The same ``df`` object, with ``new_embeddings`` holding one entry per
        row (a list of floats, or ``None``).

    Note:
        Relies on the module-level ``device`` and on ``mean_pooling`` defined
        elsewhere in this file.
    """
    # Placeholder strings (compared after strip + lower) that mean "no summary".
    placeholders = {'none', '요약 없음', 'null'}

    embeddings = []
    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Embedding rows"):
        raw = row['cleaned_summary']

        # Missing cells arrive as NaN/None (pd.read_csv also parses tokens such
        # as "null" into NaN). str() would turn them into the text 'nan',
        # which is not a placeholder and would be embedded as if it were a
        # real summary, so they are rejected before any string conversion.
        if pd.api.types.is_scalar(raw) and pd.isna(raw):
            embeddings.append(None)
            continue

        text = str(raw).strip().lower()
        if not text or text in placeholders:
            embeddings.append(None)
            continue

        encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=512).to(device)
        with torch.no_grad():
            model_output = model(**encoded_input)
        sentence_embedding = mean_pooling(model_output, encoded_input['attention_mask'])
        # Drop the singleton batch axis and store the vector as a plain list.
        embeddings.append(sentence_embedding.squeeze().tolist())

    # Assign a list of per-row embeddings (list or None) rather than a 2D array.
    df['new_embeddings'] = embeddings
    return df

In [None]:
def update_csv_with_embeddings(input_folder, output_folder, model, tokenizer):
    """Embed every CSV in ``input_folder`` and write the result to ``output_folder``.

    Each ``<name>.csv`` is read, passed through ``embed_cleaned_summary`` and
    saved as ``<name>_new.csv``. Files whose output already exists are
    skipped, so an interrupted run can be resumed.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.csv`` files.
        output_folder: Directory receiving the ``_new.csv`` files; created
            if it does not exist.
        model: Encoder forwarded to ``embed_cleaned_summary``.
        tokenizer: Tokenizer forwarded to ``embed_cleaned_summary``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # When outputs are written next to the inputs, files ending in '_new.csv'
    # are earlier outputs; re-processing them would produce '_new_new.csv'.
    same_folder = os.path.samefile(input_folder, output_folder)

    files = [
        f for f in os.listdir(input_folder)
        if f.endswith('.csv') and not (same_folder and check_if_processed(f))
    ]
    for file_name in tqdm(files, desc="Processing files"):
        input_file_path = os.path.join(input_folder, file_name)
        # Strip only the trailing extension: str.replace would rewrite every
        # '.csv' occurrence inside the file name, not just the final one.
        stem = file_name[:-len('.csv')]
        output_file_path = os.path.join(output_folder, stem + '_new.csv')

        # Skip files that already have an output from a previous run.
        if os.path.exists(output_file_path):
            print(f"Skipping already processed file: {output_file_path}")
            continue

        df = pd.read_csv(input_file_path)
        updated_df = embed_cleaned_summary(df, model, tokenizer)
        updated_df.to_csv(output_file_path, index=False)
        print(f'Updated file saved as: {output_file_path}')

In [21]:
# Path configuration: `folder` is the directory holding the input CSV files,
# `output_folder` is where the '_new.csv' files with embeddings are written.
folder = '/content/drive/MyDrive/kpmg/kr_patent'
output_folder = '/content/drive/MyDrive/kpmg/processed_kr_patent'

In [None]:
# Run the embedding pass over every CSV in `folder`, writing to `output_folder`.
update_csv_with_embeddings(folder, output_folder, model, tokenizer)