In [None]:
## Korean Word2Vec finetuning
import gensim
import numpy as np
# Load the pre-trained Korean Word2Vec model, then print three basic facts
# about it, one per line: vector size, stored corpus count, vocabulary size.
ko_wv_model = gensim.models.Word2Vec.load("./ko-w2v/ko.bin")
print(
    ko_wv_model.wv.vector_size,
    ko_wv_model.corpus_count,
    len(ko_wv_model.wv.vocab),
    sep="\n",
)


In [None]:
# Collect the per-word frequency counts of the pre-trained vocabulary,
# ordered by each entry's vocabulary index.
vocabs = ko_wv_model.wv.vocab
entries_by_index = sorted(vocabs.values(), key=lambda entry: entry.index)
idx_to_count = np.asarray([entry.count for entry in entries_by_index])
vocabs

In [None]:
# Look up the nearest neighbours of a probe word in the pre-trained model.
word = "도배"
try:
    similar_words = ko_wv_model.wv.most_similar(word)
except KeyError:
    # Only a missing word should produce the "absent" message; any other
    # error is left to surface instead of being silently swallowed.
    print("{}: 없음".format(word))
else:
    # An expression inside a `try` block is not auto-displayed by the
    # notebook, so the result has to be printed explicitly.
    print(similar_words)

In [None]:
import numpy as np
import re
import pandas as pd
import scipy.sparse as ss
import matplotlib.pyplot as plt
%matplotlib inline
#Preprocessing 
import MeCab
import nltk
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
from nltk.tokenize import word_tokenize
from nltk import FreqDist 
from pprint import pprint
from tqdm import tqdm

from konlpy.tag import Mecab 
from konlpy.tag import *

# Module-level MeCab handles.
# The dictionary path is a raw string: the original literal contained the
# invalid escape sequence "\m", which Python only tolerates with a warning.
# The resulting path value is unchanged.
mc = Mecab(dicpath=r'C:\mecab\mecab-ko-dic') # The path of the MeCab-ko dictionary.
m = MeCab.Tagger("-O wakati")

class MyTokenizer:
    """Callable tokenizer that keeps only selected parts of speech.

    The wrapped ``tagger`` must provide ``pos(sentence)`` returning a flat
    list of ``(word, tag)`` pairs (as KoNLPy's ``Mecab.pos`` does by default).
    """

    # POS tags that are kept; every other tag is dropped.
    KEPT_POS_TAGS = ('NNP', 'NNG', 'VV', 'VA', 'SL')

    def __init__(self, tagger):
        """Store the POS tagger used by ``__call__``."""
        self.tagger = tagger

    def __call__(self, sent):
        """Return the words of ``sent`` whose tag is kept and whose length is > 1.

        The sentence is tagged exactly once with the tagger passed to the
        constructor (previously a module-level tagger was used instead and
        the sentence was tagged twice).
        """
        return [
            word
            for word, tag in self.tagger.pos(sent)
            if tag in self.KEPT_POS_TAGS and len(word) > 1
        ]

# Tokenizer instance used for the complaint texts. The dictionary path is a
# raw string because the plain literal contained the invalid escape "\m";
# the path value itself is unchanged.
my_tokenizer = MyTokenizer(Mecab(dicpath=r'C:\mecab\mecab-ko-dic'))

# Defect Data load
dldefect_df = pd.read_excel('defect dataset file ')

# Normalise every complaint: replace each run of characters other than
# ASCII letters / Hangul syllables with one space, then collapse whitespace.
# Missing or non-string cells are converted to text first so that a single
# bad row does not abort the whole load with a TypeError.
complaint_texts = dldefect_df["complaint"].fillna("").astype(str)
rawComplaints = [
    " ".join(re.sub("[^a-zA-Z가-힣]+", " ", text).split())
    for text in complaint_texts
]
print(len(rawComplaints))
dldefect_df

In [None]:
# Drop duplicate complaints while keeping first-occurrence order.
# A set would also deduplicate, but its iteration order changes between
# kernel sessions, which makes the training corpus order non-reproducible.
rawComplaints = list(dict.fromkeys(rawComplaints))
print("Remove duplicates: ",len(rawComplaints))

# Tokenize every unique complaint into a list of kept words.
tokenized_text = [my_tokenizer(line) for line in rawComplaints]
print(len(tokenized_text))


In [None]:
from gensim.models import Word2Vec, KeyedVectors

# 1) Load the pre-trained vectors from the word2vec-format file.
ko_model_keyVector = KeyedVectors.load_word2vec_format(
    "./ko-w2v/ko.bin.gz",
    binary=False,
)

# 2) Create an empty skip-gram model (sg=1) whose vector size matches the
#    pre-trained vectors; min_count=1 keeps every word that occurs at all.
model_2 = Word2Vec(
    size=ko_model_keyVector.vector_size,
    min_count=1,
    sg=1,
)

In [None]:
# 3) Build the new model's vocabulary from the tokenized complaints first.
# (The call's return value was previously bound to an unused name.)
model_2.build_vocab(tokenized_text)

# Capture the corpus size now; it is passed to train() as total_examples.
total_examples = model_2.corpus_count
print(total_examples)

In [None]:
# Number of words currently in model_2's vocabulary (shown as the cell output).
len(model_2.wv.vocab)

In [None]:
# Number of words in the pre-trained vectors' vocabulary.
# ko_model_keyVector already is a KeyedVectors object, so its vocab is read
# directly (no deprecated `.wv` hop, no intermediate list of keys).
len(ko_model_keyVector.vocab)

In [None]:
# 4) Extend the vocabulary with every word of the pre-trained vectors.
# build_vocab expects an iterable of sentences (list of lists), so all
# pre-trained words are passed as a single pseudo-sentence.
# ko_model_keyVector is itself a KeyedVectors, hence `.vocab` is read directly.
model_2.build_vocab([list(ko_model_keyVector.vocab)], update=True)


In [None]:
# 5) Initialise the vectors of words shared with the pre-trained file.
# The file must be decoded with the same encoding used when it was loaded
# via KeyedVectors.load_word2vec_format (default UTF-8). Decoding it as
# ISO-8859-1 would turn the Korean words into different strings that never
# match the vocabulary, so no pre-trained vector would be copied.
# lockf=1.0 leaves the imported vectors trainable.
model_2.intersect_word2vec_format("./ko-w2v/ko.bin.gz", binary=False, lockf=1.0, encoding='utf8')


In [None]:
# Number of words currently in model_2's vocabulary (shown as the cell output).
len(model_2.wv.vocab)

In [None]:
# 6) Fine-tune on the complaint corpus.
# The first five components of one word vector are printed before and after
# training so the effect of the update can be seen.
probe_word = '도배'
print(model_2.wv[probe_word][:5])
model_2.train(
    tokenized_text,
    total_examples=total_examples,
    epochs=model_2.iter,
)
print(model_2.wv[probe_word][:5])

In [None]:
# Nearest neighbours after fine-tuning. The query goes through `.wv`, like
# the query on the pre-trained model, instead of the deprecated model-level
# most_similar.
model_2.wv.most_similar("도배")

In [None]:
# Nearest neighbours after fine-tuning. The query goes through `.wv`, like
# the query on the pre-trained model, instead of the deprecated model-level
# most_similar.
model_2.wv.most_similar("미장")

In [None]:
# Print the fine-tuned model's vector size, stored corpus count and
# vocabulary size, one value per line.
# NOTE(review): corpus_count may reflect the most recent build_vocab call
# rather than the complaint corpus - confirm before relying on it.
print(
    model_2.wv.vector_size,
    model_2.corpus_count,
    len(model_2.wv.vocab),
    sep="\n",
)

In [None]:
# Vocabulary size of the fine-tuned model, followed by the per-word
# frequency counts ordered by each entry's vocabulary index.
vocabs = model_2.wv.vocab
print(len(vocabs))

entries_by_index = sorted(vocabs.values(), key=lambda entry: entry.index)
idx_to_count = np.asarray([entry.count for entry in entries_by_index])
idx_to_count

In [None]:
# Count (and list, comma-separated) the words that exist in the fine-tuned
# vocabulary but not in the originally loaded pre-trained model.
pretrained_vocab = ko_wv_model.wv.vocab
addedWordNum = 0
for k in model_2.wv.vocab:
    # Plain membership test instead of a .get() with a "0" sentinel string.
    if k not in pretrained_vocab:
        addedWordNum += 1
        print(k, end=",")
print("number of added words:", addedWordNum)

In [None]:
# Save the fine-tuned Word2Vec model to disk.
model_2.save("./ko-w2v/ko_w2v_defect-0710-v2.bin")