In [2]:
import json

with open('./data/products.json', 'r') as file :
    data = json.load(file)

data[0]

{'crawled_info': {'spider': 'gs25web',
  'id': 'GD_8809038475901_001',
  'url': 'http://gs25.gsretail.com/products/youus-freshfoodDetail-search?CSRFToken=04e9fb45-f509-43df-8ade-c86665b4d18d'},
 'created_at': 1693965848,
 'description': None,
 'events': [{'brand': 1, 'id': 2},
  {'brand': 1, 'id': 4},
  {'brand': 1, 'id': 5}],
 'image': {'thumb': 'https://image.woodongs.com/imgsvr/item/GD_8809038475901_001.jpg',
  'others': []},
 'name': 'Y(P)육즙팡팡미니치즈비엔나140G',
 'price': {'value': 4200.0, 'currency': 1},
 'updated_at': 1694460503,
 'category': 4,
 'tags': [],
 'discounted_price': None}

In [3]:
item_name = []
for instance in data :
    item_name.append(instance['name'])

In [6]:
from sentence_transformers import SentenceTransformer
sroberta_multitask = SentenceTransformer('jhgan/ko-sroberta-multitask')
#sbert_nli = SentenceTransformer('jhgan/ko-sbert-nli')
#sroberta_nli = SentenceTransformer('jhgan/ko-sroberta-nli')
#sroberta_sts = SentenceTransformer('jhgan/ko-sroberta-sts')

sroberta_multitask._save_checkpoint('./models/sroberta_multitask.pt', checkpoint_save_total_limit=None, step='0')

In [3]:
import numpy as np
from tqdm import tqdm

## model : sentence embedding model
## item_list : item names
## return : [{embedding, name}, ..]
def make_embedding(model, item_list) :
    embeddings = []
    for name in tqdm(item_list) :
        embeddings.append({'embedding': model.encode(name), 'name': name})
    return embeddings

def find_item(model, query, embedding_list) :
    query_embedding = model.encode(query)

    score_name = []
    for instance in embedding_list :
        score = np.dot(query_embedding, instance['embedding'])
        score_name.append({'score': score, 'name': instance['name']})

    score_name = sorted(score_name, key=(lambda x: x['score']), reverse=True)

    return score_name[:10]

In [6]:
sroberta_multitask_embedding = make_embedding(sroberta_multitask, item_name)
sbert_nli_embedding = make_embedding(sbert_nli, item_name)
sroberta_nli_embedding = make_embedding(sroberta_nli, item_name)
sroberta_sts_embedding = make_embedding(sroberta_sts, item_name)

100%|██████████| 3272/3272 [02:56<00:00, 18.58it/s]
100%|██████████| 3272/3272 [03:22<00:00, 16.15it/s]
100%|██████████| 3272/3272 [03:25<00:00, 15.94it/s]
100%|██████████| 3272/3272 [03:25<00:00, 15.94it/s]


In [12]:
import numpy as np
np.save('./data/sroberta_multitask.npy', sroberta_multitask_embedding)
np.save('./data/sbert_nli.npy', sbert_nli_embedding)
np.save('./data/sroberta_nli.npy', sroberta_nli_embedding)
np.save('./data/sroberta_sts_embedding.npy', sroberta_sts_embedding)


In [11]:
sroberta_multitask_embedding = np.load('./data/sroberta_multitask.npy', allow_pickle=True)
sbert_nli_embedding = np.load('./data/sbert_nli.npy', allow_pickle=True)
sroberta_nli_embedding = np.load('./data/sroberta_nli.npy', allow_pickle=True)
sroberta_sts_embedding = np.load('./data/sroberta_sts_embedding.npy', allow_pickle=True)


In [12]:
query = "스팸이 들어간 김밥"
model_list = [sroberta_multitask, sbert_nli, sroberta_nli, sroberta_sts]
embedding_list = [sroberta_multitask_embedding, sbert_nli_embedding, sroberta_nli_embedding, sroberta_sts_embedding]
for model, embeddings in zip(model_list, embedding_list) :
    print(find_item(model, query=query, embedding_list=embeddings))

[{'score': 119.61357, 'name': '푸드)스팸간장덮밥'}, {'score': 112.52594, 'name': '그린)스팸김치볶음김밥'}, {'score': 105.87346, 'name': '풀)스팸김치볶음밥1편'}, {'score': 105.18835, 'name': 'CJ)스팸김치볶음밥220g'}, {'score': 104.25098, 'name': '스팸매일함)볶음김치김밥1'}, {'score': 103.165924, 'name': 'CJ)스팸김치볶음밥220G'}, {'score': 100.99123, 'name': '스팸무스비)볶음김치참치'}, {'score': 99.23768, 'name': 'CJ)스팸김치덮밥(컵밥)'}, {'score': 97.01791, 'name': '스팸무스비)강황밥참치마요'}, {'score': 96.73009, 'name': 'CJ)스팸340g'}]
[{'score': 408.2221, 'name': '푸드)스팸간장덮밥'}, {'score': 391.71826, 'name': '스팸매일함)볶음김치김밥1'}, {'score': 383.2522, 'name': 'CJ)스팸200g'}, {'score': 382.23895, 'name': '그린)스팸김치볶음김밥'}, {'score': 378.60867, 'name': 'CJ)스팸340g'}, {'score': 376.7536, 'name': '스팸무스비)볶음김치참치'}, {'score': 369.0416, 'name': 'CJ)스팸김치덮밥(컵밥)'}, {'score': 366.92133, 'name': '풀)스팸김치볶음밥1편'}, {'score': 364.7212, 'name': '스팸무스비)강황밥참치마요'}, {'score': 363.9527, 'name': 'CJ)스팸김치볶음밥220g'}]
[{'score': 114.28873, 'name': '푸드)스팸간장덮밥'}, {'score': 112.78137, 'name': '스팸매일함)볶음김치김밥1'}, {'

In [4]:
from rank_bm25 import BM25Okapi
import json

with open('./data/products.json', 'r') as file :
    data = json.load(file)

corpus = []
for instance in data :
    corpus.append(instance['name'])

def tokenizer(sent):
  return sent.split(" ")

tokenized_corpus = [tokenizer(doc) for doc in corpus]

bm25 = BM25Okapi(tokenized_corpus)

In [5]:
bm25.get_top_n('삼각김밥', corpus,n=10)

['뉴트리)비비랩콜라겐3입',
 '롯데)의성마늘후랑크70G',
 '대림)크라비아90G',
 '동원)가쓰오크랩스144G',
 '삼립)육즙가득로테부어스트70G',
 '롯데)키스틱55G',
 '롯데)빅팜60G',
 '롯데)의성마늘프랑크70G(2입)',
 '롯데)에센뽀득프랑크70G(2입)',
 '롯데)마늘앤뽀득프랑크140G']

In [1]:
from sentence_transformers import CrossEncoder
model = CrossEncoder('bongsoo/kpf-cross-encoder-v1')
scores = model.predict([('오늘 날씨가 좋다', '오늘 등산을 한다'), ('오늘 날씨가 흐리다', '오늘 비가 내린다')])
print(scores)

  from .autonotebook import tqdm as notebook_tqdm
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[0.29856136 0.708414  ]
