In [2]:
import os

import gensim
import numpy as np
from gensim.models.doc2vec import TaggedDocument

In [2]:
!cat "../data/d2v.hscode.sample"

0100|Live,animals	
0200|Meat,meat,offal	
0300|Fish,crustaceans,molluscs,aquatic,invertebrates	
0400|Dairy,eggs,honey,animal	
0500|animal,origin
0600|trees,plants,bulbs,roots,flowers,ornamental,foliage.	
0700|vegetables,roots,tubers	
0800|fruit,nuts,peel,citrus,fruit,melons	
0900|Coffee,tea,mate and spices	
1000|Cereals

## Train Mockup

In [3]:
# Build the mock training corpus: every "HSCODE|word,word,..." line of the sample
# file is replicated 10000 times, each copy tagged "<copy index><hscode>".
tagged_docs = []
with open("../data/d2v.hscode.sample", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines instead of failing on the two-way unpack below.
            continue
        # Parse once per line: the result is the same for every duplicate.
        hscode, words = line.split("|")
        # Lower-case and de-duplicate the comma-separated words.
        words = list(set([word.lower() for word in words.split(",")]))
        for dup in range(10000):
            # Each document gets its own copy of the word list.
            tagged_docs.append(TaggedDocument(words=list(words), tags=[str(dup)+hscode]))

In [4]:
len(tagged_docs)

100000

In [5]:
# Make sure the output directory exists before the long training run, so that
# save() cannot fail at the very end because of a missing folder.
os.makedirs("../models/test_model_0628", exist_ok=True)

# Train a 50-dimensional Doc2Vec model on the mock corpus and persist it.
DVmodel = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=1)
DVmodel.build_vocab(tagged_docs)
DVmodel.train(tagged_docs, total_examples=len(tagged_docs), epochs=500)
DVmodel.save("../models/test_model_0628/my_model")

### Load Model

In [4]:
from gensim.models.doc2vec import Doc2Vec
def load_model(path):
    """Load a saved Doc2Vec model from ``path`` and return it."""
    model = Doc2Vec.load(path)
    return model


# Model instance shared by the cells that follow.
pretrained_d2v = load_model("../models/test_model_0628/my_model")

### Get Vectors

In [5]:
def get_vector(model, key):
    """Look up the document vector stored under ``key`` in ``model.docvecs``."""
    doc_vectors = model.docvecs
    return doc_vectors[key]

def get_doctag_and_vectors(model, check=False):
    """Return all document tags of ``model`` together with their vector matrix.

    Args:
        model: Object exposing ``docvecs.index2entity`` and
            ``docvecs.vectors_docs`` (presumably a trained gensim Doc2Vec
            model -- confirm against the caller).
        check: When truthy, verify for every tag that ``model.docvecs[tag]``
            equals the row of ``vectors_docs`` at the tag's position.

    Returns:
        tuple: ``(doctags, vectors)`` exactly as stored on ``model.docvecs``.

    Raises:
        AssertionError: If ``check`` is truthy and a tag's vector differs from
            the corresponding row.
    """
    doctags = model.docvecs.index2entity
    vectors = model.docvecs.vectors_docs
    if check:
        for _index, _id in enumerate(doctags):
            # Raised explicitly (not via `assert`) so the check also runs
            # when Python is started with -O.
            if not np.array_equal(model.docvecs[_id], vectors[_index]):
                raise AssertionError(
                    f"docvecs[{_id!r}] does not match vectors_docs[{_index}]"
                )
    return doctags, vectors


In [6]:
# Fetch every doctag with its vector matrix; check=True asks the helper to verify
# that per-tag lookup and the matrix rows agree.
doctags, all_vectors = get_doctag_and_vectors(pretrained_d2v, check = True)

## Split Tags into Two Sets

In [7]:
def split_tags_with_two_set(model, a_tags, b_tags=None):
    """Split a model's document vectors into two groups selected by tag.

    Args:
        model: Object exposing ``docvecs.index2entity`` (sequence of tags) and
            ``docvecs.vectors_docs`` (array indexable by a list of row
            positions).
        a_tags: Tags of the first group.
        b_tags: Tags of the second group. When ``None``, every tag of the
            model that is not in ``a_tags`` is used, in the model's own order.

    Returns:
        tuple: ``(a_tags, a_vectors, b_tags, b_vectors)`` where each vectors
        entry holds the rows of ``vectors_docs`` for the matching tags.

    Raises:
        ValueError: If a requested tag is not among the model's tags.
    """
    all_tags = model.docvecs.index2entity
    all_vectors = model.docvecs.vectors_docs

    # Build tag -> row position once. setdefault keeps the first occurrence,
    # which is what list.index() would return.
    position_of = {}
    for position, tag in enumerate(all_tags):
        position_of.setdefault(tag, position)

    def _get_doctag_and_vector(tags):
        # Map tags to row positions through the dict instead of a linear
        # list.index() scan per tag.
        try:
            positions = [position_of[tag] for tag in tags]
        except KeyError as missing:
            # Keep the exception type that list.index() raised.
            raise ValueError(f"{missing.args[0]!r} is not a tag of this model") from None
        return tags, all_vectors[positions]

    a_doctags, a_vectors = _get_doctag_and_vector(a_tags)

    # `is None` rather than `== None`: the latter is ambiguous for array input.
    if b_tags is None:
        excluded = set(a_doctags)
        b_tags = [tag for tag in all_tags if tag not in excluded]

    b_doctags, b_vectors = _get_doctag_and_vector(b_tags)

    return a_doctags, a_vectors, b_doctags, b_vectors

In [170]:
# Query tags. The name q_tag is rebound on the next line to the tag list
# returned by the split; v_tag / v_vec hold every remaining tag and its vector.
q_tag = ['0100','0400']
q_tag, q_vec, v_tag, v_vec = split_tags_with_two_set(pretrained_d2v, q_tag)

In [171]:
q_tag, v_tag

(['0100', '0400'],
 ['0200', '0300', '0500', '0600', '0700', '0800', '0900', '1000'])

In [172]:
# Similarity of each query vector (rows) against each remaining vector (columns).
# NOTE(review): get_cosine_matrix is defined in a later cell of this notebook,
# so on a fresh kernel that cell has to run first.
get_cosine_matrix(q_vec, v_vec)

array([[0.9571379 , 0.97786653, 0.9517958 , 0.97486556, 0.95984334,
        0.96753204, 0.9672515 , 0.9509969 ],
       [0.9750438 , 0.98059654, 0.9755604 , 0.98962146, 0.97363627,
        0.9829849 , 0.97819394, 0.94849193]], dtype=float32)

## Get cosine Matrix
- The result is exactly the same as what `docvecs.most_similar` computes.

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
def get_cosine_matrix(a, b):
    """Return the pairwise cosine similarity between the rows of ``a`` and ``b``.

    Thin wrapper around ``sklearn.metrics.pairwise.cosine_similarity``.
    """
    similarity = cosine_similarity(a, b)
    return similarity

In [9]:
import time
# Time the all-pairs cosine similarity over every document vector and print
# the elapsed wall-clock seconds.
st = time.time()
cos_mat = get_cosine_matrix(all_vectors, all_vectors)
print(time.time() - st)

13.091769933700562


In [10]:
cos_mat.shape

(100000, 100000)

In [18]:
# Time argsort on a single row, then multiply by the number of rows to
# extrapolate the cost of sorting every row of the matrix (in seconds).
st = time.time()
np.argsort(cos_mat[0], axis=-1)
print((time.time()-st)*len(cos_mat))

958.1089019775391


## Transfer Learning

In [48]:
from gensim.models import KeyedVectors
# Load pretrained word vectors stored in binary word2vec format.
loaded_model = KeyedVectors.load_word2vec_format("../models/pretrained_w2v/GoogleNews-vectors-negative300-SLIM.bin", binary=True) 

## Plain

In [66]:
import copy
# Baseline: 300-dimensional Doc2Vec trained on the mock corpus with no
# pretrained word vectors injected.
d2v_plain = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=1)
d2v_plain.build_vocab(tagged_docs)
d2v_plain.train(tagged_docs, total_examples=len(tagged_docs), epochs=5000)

## Transfer

In [74]:
# Transfer variant: same constructor arguments as the plain model. The
# vocabulary is built here, before any word vectors are overwritten.
transfer_d2v = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=1)
transfer_d2v.build_vocab(tagged_docs)

def init_w2v_to_d2v(d2v_model, w2v_model):
    """Seed the word vectors of ``d2v_model`` with pretrained ones from ``w2v_model``.

    Every word in ``d2v_model.wv.index2word`` that ``w2v_model`` can look up
    has its entry in ``d2v_model.wv`` overwritten with the pretrained vector.
    Words that ``w2v_model`` does not know are reported with ``print`` and
    left untouched.

    Args:
        d2v_model: Model whose ``wv`` exposes ``index2word`` and supports item
            assignment by word.
        w2v_model: Source of pretrained vectors, indexable by word and raising
            ``KeyError`` for unknown words.
    """
    for k in d2v_model.wv.index2word:
        try:
            pretrained = w2v_model[k]
        except KeyError as e:
            # Word absent from the pretrained vocabulary: report it and keep
            # the vector the model already has.
            print(e)
            continue
        # Write into the model that was passed in, not a notebook-level global.
        d2v_model.wv[k] = pretrained

# Overwrite the transfer model's word vectors with the pretrained ones, then train.
init_w2v_to_d2v(transfer_d2v, loaded_model)
# Bare expression in the middle of a cell: evaluated, but its value is not displayed.
transfer_d2v.wv['animal']
transfer_d2v.train(tagged_docs, total_examples=len(tagged_docs), epochs=5000)

"word 'foliage.' not in vocabulary"
"word 'mate and spices' not in vocabulary"


In [62]:
!cat "../data/d2v.hscode.sample"

0100|Live,animals	
0200|Meat,meat,offal	
0300|Fish,crustaceans,molluscs,aquatic,invertebrates	
0400|Dairy,eggs,honey,animal	
0500|animal,origin
0600|trees,plants,bulbs,roots,flowers,ornamental,foliage.	
0700|vegetables,roots,tubers	
0800|fruit,nuts,peel,citrus,fruit,melons	
0900|Coffee,tea,mate and spices	
1000|Cereals

In [75]:
# For each model, order all other tags by their cosine similarity to tag '0800'.
# np.argsort sorts ascending, so the printed list runs from the least similar
# tag to the most similar one.
for model in [d2v_plain, transfer_d2v]:
    a_tag, a_vec, b_tag, b_vec = split_tags_with_two_set(model, ['0800'])
    # Single query row -> take row 0 of the argsort result.
    _index = np.argsort(get_cosine_matrix(a_vec, b_vec), axis=-1)[0]
    print(a_tag)
    print([b_tag[_i] for _i in _index])

['0800']
['0600', '0700', '0300', '0500', '0900', '0400', '0200', '0100', '1000']
['0800']
['0300', '0500', '0600', '0700', '0900', '0400', '0200', '0100', '1000']


In [17]:
import numpy as np
# Toy array of twelve ones, used to try out the chunking helper defined in this cell.
t = np.array([1]*12)
print(t)
def __split_indices(a, n):
    """Split ``a`` into ``n`` consecutive slices whose lengths differ by at most one.

    The first ``len(a) % n`` slices receive one extra element. Returns a
    generator that yields the slices of ``a`` in order.
    """
    base, extra = divmod(len(a), n)
    parts = range(n)

    def _chunks():
        begin = 0
        for part in parts:
            # Leading chunks absorb the remainder, one element each.
            end = begin + base + (1 if part < extra else 0)
            yield a[begin:end]
            begin = end

    return _chunks()

[1 1 1 1 1 1 1 1 1 1 1 1]


In [18]:
# Split the index range of t into 10 chunks and print the elements of t
# selected by each chunk.
for i in __split_indices(range(len(t)), 10):
    print(t[i])

[1 1]
[1 1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]


In [38]:
import random
# Print the length of the first line, then draw 5 random lines from the rest
# of the file.
with open("/home/jack/.bashrc", "r") as f:
    print(len(f.readline()))
    # readline() above already consumed the first line, so it is never sampled.
    # random.sample raises ValueError if fewer than 5 lines remain.
    lines = random.sample(f.readlines(),5)

55


In [37]:
!wc -l "/home/jack/.bashrc"

136 /home/jack/.bashrc


In [30]:
[_lines.strip() for _lines in lines]

['데모 영상으로 제작된 버전은 다음 레파지토리에 있습니다. <br> https://github.com/BM-K/KoSentenceBERT_V2',
 "'치타가 들판을 가로 질러 먹이를 쫓는다.']",
 'ETRI KorBERT는 transformers 2.4.1 ~ 2.8.0에서만 동작하고 Sentence-BERT는 3.1.0 버전 이상에서 동작하여 라이브러리를 수정하였습니다. <br>',
 '```',
 '']

In [39]:
# Count the lines of the file; the context manager closes the handle as soon
# as counting is done.
with open("/home/jack/.bashrc") as f:
    count = sum(1 for _line in f)

In [40]:
count

136