In [1]:
import json

# Load the raw product records. The encoding is given explicitly so the
# Korean text decodes the same way on every platform instead of depending on
# the locale's default codec (this also matches the UTF-8 used when this
# notebook writes its own JSON output).
with open('./data/products.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [2]:
from pykospacing import Spacing
from tqdm import tqdm

# Split every raw name of the form "<company>)<product>" into its two parts
# and run the product part through the pykospacing `Spacing` model.
spacing = Spacing()
product_list = []  # list of {'company': str, 'product': str} dicts, one per entry in `data`
for instance in tqdm(data):
    name = instance['name']

    # partition() splits on the FIRST ')', exactly like the former
    # find(')') + slicing, but reports a missing delimiter explicitly.
    # NOTE(review): a company prefix that itself contains ')' is still cut at
    # the first one (the sample output shows 'Y(P') -- confirm the naming
    # convention of the data before relying on `company`.
    company, delimiter, remainder = name.partition(')')
    if not delimiter:
        # No ')' in the name: previously find() returned -1, which made
        # `company` the whole name minus its last character. Treat the whole
        # name as the product and leave the company empty instead.
        company, remainder = '', name

    product = spacing(remainder, ignore='none')

    product_list.append({'company': company, 'product': product})



100%|██████████| 3272/3272 [05:26<00:00, 10.02it/s]


In [3]:
product_list[0]

{'company': 'Y(P', 'product': '육즙 팡팡 미니치즈비엔나 140G'}

In [4]:
from rank_bm25 import BM25Okapi
import json


# The BM25 corpus is the product text of every entry, in `product_list` order.
corpus = [entry['product'] for entry in product_list]

def tokenizer(sent):
    """Split a sentence into whitespace-delimited tokens.

    Args:
        sent: The text to tokenize.

    Returns:
        A list of the non-empty tokens in ``sent``. Runs of whitespace
        (including tabs/newlines and leading/trailing spaces) act as a single
        separator, so no empty-string tokens are produced.
    """
    # split() with no argument collapses consecutive whitespace; split(" ")
    # would emit '' for every extra space and pollute the token vocabulary.
    return sent.split()

# Tokenize every document with the same tokenizer that is applied to queries,
# then build the BM25 index over the tokenized documents.
tokenized_corpus = list(map(tokenizer, corpus))
bm25 = BM25Okapi(tokenized_corpus)

In [7]:
from sentence_transformers import SentenceTransformer

# Stage 1 -- lexical retrieval: fetch the best BM25 matches for the query.
query = "도시락"
tokenized_query = tokenizer(query)

top_n = 10  # number of candidates handed to the next stage
results = bm25.get_top_n(tokenized_query, corpus, n=top_n)
print(results)


from sentence_transformers import CrossEncoder
# Stage 2 -- rerank the BM25 candidates with a cross-encoder.
model = CrossEncoder('bongsoo/kpf-cross-encoder-v1')

# NOTE(review): the rerank query below is hard-coded and differs from the
# BM25 `query` used to fetch `results` -- confirm this is intentional.
rerank_pairs = [('제육 도시락', candidate) for candidate in results]
scores = model.predict(rerank_pairs)

# Pair each candidate with its score (scores come back in `results` order).
l = [{'score': score, 'name': candidate}
     for score, candidate in zip(scores, results)]

# Highest score first.
score_name = sorted(l, key=lambda entry: entry['score'], reverse=True)

print(score_name)

['도시락', '돈까스&불백 5찬 도시락', 'NEW의성마늘햄 쌈 도시락', '멘츠카츠&제육 5찬 도시락', '함박&부대 볶음 5찬 도시락', '한 끼 7찬 도시락', '가득 담은 5찬 도시락', '매콤닭다리 7찬 도시락 2.0', '한 끼 경양식 도시락', '한 끼 11찬 도시락']


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'score': 0.66687495, 'name': '멘츠카츠&제육 5찬 도시락'}, {'score': 0.5455696, 'name': '도시락'}, {'score': 0.5074965, 'name': '가득 담은 5찬 도시락'}, {'score': 0.48636338, 'name': '한 끼 경양식 도시락'}, {'score': 0.40255937, 'name': '한 끼 7찬 도시락'}, {'score': 0.38565454, 'name': '한 끼 11찬 도시락'}, {'score': 0.2665481, 'name': '함박&부대 볶음 5찬 도시락'}, {'score': 0.24097888, 'name': '매콤닭다리 7찬 도시락 2.0'}, {'score': 0.24039179, 'name': '돈까스&불백 5찬 도시락'}, {'score': 0.16403937, 'name': 'NEW의성마늘햄 쌈 도시락'}]


### Part of Speech indexing

In [3]:
from konlpy.tag import Okt

# Tag a single product name and keep only the tokens tagged Noun or Adverb.
test_idx = 4

okt = Okt()
malist = okt.pos(data[test_idx]['name'], norm=True, stem=True)

# Each element of `malist` is a (token, tag) pair.
processed_list = [
    value
    for value, word_type in malist
    if word_type in ('Noun', 'Adverb')
]

print(data[test_idx]['name'])
print(malist)
print(processed_list)

뉴트리플랜)디쉬(참치&닭가슴살)
[('뉴', 'Noun'), ('트리', 'Noun'), ('플랜', 'Noun'), (')', 'Punctuation'), ('디쉬', 'Noun'), ('(', 'Punctuation'), ('참치', 'Noun'), ('&', 'Punctuation'), ('닭', 'Noun'), ('가슴', 'Noun'), ('살', 'Noun'), (')', 'Punctuation')]
['뉴', '트리', '플랜', '디쉬', '참치', '닭', '가슴', '살']


In [4]:
from konlpy.tag import Okt

# NOTE(review): this cell looks unfinished. `okt` is instantiated but never
# used here, and the loop below only rebinds `name` on every iteration --
# nothing is tagged, collected, or displayed.
# TODO: either complete it or delete the cell.
okt = Okt()

for instance in data :
    # After the loop, `name` holds the name of the last entry in `data`.
    name = instance['name']

### Spacing Example

In [9]:
from pykospacing import Spacing
from tqdm import tqdm

# Run every full product name (company prefix included) through the
# pykospacing model; tqdm reports progress over `data`.
spacing = Spacing()

spaced_names = [
    spacing(entry['name'], ignore='none')
    for entry in tqdm(data)
]

100%|██████████| 3272/3272 [02:23<00:00, 22.83it/s]


In [17]:
import os

# Create the output directory first: open(..., 'w') does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs('./results', exist_ok=True)

# ensure_ascii=False keeps the Korean text human-readable in the file instead
# of writing \uXXXX escapes; the file is encoded as UTF-8.
with open('./results/spaced_names.json', 'w', encoding='UTF-8') as file:
    json.dump(spaced_names, file, indent='\t', ensure_ascii=False)