In [2]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m90.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m83.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence_transformers)
  Downloading huggingface_

### 모듈 임포트

In [84]:
import os
from zipfile import ZipFile
import re
import pickle
import random
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import math
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sklearn.metrics.pairwise import cosine_similarity

### 데이터 압축 해제

In [8]:
data_path = '/content/drive/MyDrive/data/json_seperatly-saved_in_csv1/json_data.zip'
extract_path = '/content/data'

with ZipFile(data_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

### json_list 변환

In [12]:
folder_path = '/content/data/json_data'

file_names = os.listdir(folder_path)

data_list = []
for file_name in file_names:
  if file_name.startswith("processed_") and file_name.endswith(".pickle"):
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'rb') as file:
      data = pickle.load(file)
      data_list.append(data)

### text_list 변환

In [23]:
text_list = []
for i in range(len(data_list)):
  text = ""
  for j in range(len(data_list[i]['form'])):
    if type(data_list[i]['form'][j]['text']) == str:
      text += " " + data_list[i]['form'][j]['text']
  text_list.append(text)

### keyword_list 변환

In [42]:
def is_valid_keyword(keyword):
  if keyword.isdigit():
    return False
  return True

keyword_list = []
keyword_vocab = {}
for i in range(len(data_list)):
  keyword = data_list[i]['keyword']
  keyword = [word for word in keyword if is_valid_keyword(word)]
  keyword_list.append(keyword)
  for j in range(len(keyword)):
    if keyword[j] in keyword_vocab:
      keyword_vocab[keyword[j]] += 1
    else:
      keyword_vocab[keyword[j]] = 1

### pair 생성 함수

In [48]:
def get_positive_pairs(text, keyword):
  positive_pairs = []
  for i in range(len(keyword)):
    positive_pairs.append((text, keyword[i]))
  return positive_pairs

keywords = list(keyword_vocab.keys())
frequencies = list(keyword_vocab.values())

def get_negative_pairs(text, keyword):
  negative_pairs = []
  for i in range(len(keyword)):
    keyword = random.choices(keywords, weights=frequencies, k=1)[0]
    while keyword in keyword_list:
      keyword = random.choices(keywords, weights=frequencies, k=1)[0]
    negative_pairs.append((text, keyword))
  return negative_pairs

### positive pairs, negative pairs 정의

In [54]:
positive_pairs = []
negative_pairs = []

for i in tqdm(range(len(data_list))):
  text = text_list[i]
  keyword = keyword_list[i]
  positive_pairs.extend(get_positive_pairs(text, keyword))
  negative_pairs.extend(get_negative_pairs(text, keyword))

100%|██████████| 29590/29590 [07:53<00:00, 62.43it/s]


### Concatenate하여 pairs, labels 정의

In [57]:
pairs = positive_pairs + negative_pairs
labels = [1]*len(positive_pairs) + [0]*len(negative_pairs)

In [126]:
# model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# sample_data = list(zip(pairs, labels))

# train_data, test_data = train_test_split(sample_data, test_size=0.2, random_state=42)

train_examples = [InputExample(texts=[data[0][0], data[0][1]], label=float(data[1])) for data in train_data]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=128)

train_loss = losses.CosineSimilarityLoss(model)

num_epochs = 5
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

In [None]:
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=num_epochs, warmup_steps=warmup_steps)
model.save('sbert_keyword_extractor_2023_06_30')

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7015 [00:00<?, ?it/s]

In [98]:
keyword_embedding_list = []

for keyword in tqdm(keyword_vocab.keys()):
    keyword_embedding_list.append((keyword, model.encode(keyword)))

100%|██████████| 14287/14287 [01:58<00:00, 120.22it/s]


In [96]:
def get_top_keyword(sentence, top_k = 3):
    sentence_embedding = model.encode(sentence)
    keyword_score_list = []
    for keyword, keyword_embedding in keyword_embedding_list:
        score = cosine_similarity([sentence_embedding], [keyword_embedding])[0][0]
        keyword_score_list.append((keyword, score))
    keyword_score_list.sort(key=lambda x: x[1], reverse=True)
    return keyword_score_list[:top_k]

In [116]:
sentence = '''
UDOP 발표
UDOP이란 무엇인가
UDOP의 역사와 성질
발표자: 김상현
'''
get_top_keyword(sentence, 10)

[('history', 0.8400927),
 ('시대', 0.8330672),
 ('정치인', 0.8019314),
 ('정치', 0.7957271),
 ('사건', 0.7913714),
 ('왕조', 0.79060066),
 ('근대', 0.7896094),
 ('현대의', 0.7847676),
 ('사건현장', 0.78446245),
 ('병력', 0.78344625)]

In [121]:
%cd /content/drive/MyDrive/model

/content/drive/MyDrive/model
