### Import Modules

In [1]:
import torch
import torch.nn as nn

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

import json
import numpy as np

### Load Model & Vocab

In [2]:
def loadVocab(save_root):
    """Load the vocabulary stored as ``vocab.json`` inside ``save_root``.

    Args:
        save_root: Directory (path string) that contains ``vocab.json``.

    Returns:
        The parsed JSON content, exactly as ``json.load`` produces it.

    Raises:
        FileNotFoundError: If ``vocab.json`` does not exist under ``save_root``.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8: without it, open() falls back to the
    # platform locale encoding and non-ASCII words may be mis-decoded.
    with open(f'{save_root}/vocab.json', 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

class Word2VecNS(nn.Module):
    """Word2vec-style model with separate "in" and "out" embedding tables.

    ``forward`` scores (center, target) index pairs as the sigmoid of the dot
    product between the center word's "in" vector and the target word's "out"
    vector. The "NS" suffix presumably refers to negative sampling; the
    training loop is not part of this notebook.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Attribute names and registration order are part of the checkpoint
        # format (state-dict keys), so they must stay as they are.
        self.in_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.sigmoid = nn.Sigmoid()

    def forward(self, center_word_idx, target_word_idx):
        """Return sigmoid(<in[center], out[target]>), reducing over dim 1."""
        center_vectors = self.in_embeddings(center_word_idx)
        target_vectors = self.out_embeddings(target_word_idx)

        # Element-wise product summed over dim 1 == row-wise dot product
        # for inputs that are 1-D batches of indices.
        pair_scores = (center_vectors * target_vectors).sum(dim=1)
        return self.sigmoid(pair_scores)

    def get_embedding(self, word_idx):
        """Look up the "in" embedding vector(s) for ``word_idx``."""
        return self.in_embeddings(word_idx)

# Locations of the saved vocabulary and the trained weights.
DATASET_ROOT = './dataset'
CHECKPOINT_PATH = f'{DATASET_ROOT}/(20240929)word2vec_NS(128h_4epoch).pt'

vocab = loadVocab(DATASET_ROOT)
vocab_size = len(vocab)
print(f'Vocab Length: {vocab_size}')

# Embedding width; load_state_dict below fails if it does not match the
# shapes stored in the checkpoint.
embed_size = 128
word2vec = Word2VecNS(vocab_size, embed_size)

# map_location='cpu': the notebook later converts embeddings with .numpy(),
# which needs CPU tensors, and this also lets a checkpoint saved from a GPU
# load on a CPU-only machine.
# weights_only=True: restrict unpickling to tensors/plain containers, which is
# all a state dict needs, instead of executing arbitrary pickled code.
word2vec.load_state_dict(
    torch.load(CHECKPOINT_PATH, map_location='cpu', weights_only=True)
)
word2vec.eval()
print(word2vec)

Vocab Length: 37690
Word2VecNS(
  (in_embeddings): Embedding(37690, 128)
  (out_embeddings): Embedding(37690, 128)
  (sigmoid): Sigmoid()
)


In [3]:
def extract_embeddings(model, vocab_size):
    """Collect the embedding vector of every word index into one NumPy array.

    Args:
        model: Object exposing ``get_embedding(word_idx)`` that accepts a
            tensor of indices and returns the matching embedding tensor.
        vocab_size: Number of word indices; indices ``0 .. vocab_size - 1``
            are looked up.

    Returns:
        ``numpy.ndarray`` whose row ``i`` is the embedding of word index ``i``.
    """
    word_indices = torch.arange(vocab_size)

    # One batched lookup instead of one Python-level call per word, and no
    # autograd bookkeeping since the vectors are only read.
    with torch.no_grad():
        embeddings = model.get_embedding(word_indices)

    return embeddings.detach().cpu().numpy()

# Materialise the input-embedding matrix and the labels for its rows.
# NOTE(review): assumes iterating `vocab` yields words in the same order as
# their embedding indices — confirm against how vocab.json was written.
embeddings = extract_embeddings(model=word2vec, vocab_size=vocab_size)
word_labels = list(vocab)

In [12]:
def find_top_k_similar(embeddings, vocab, k=5, batch_size=1024):
    """Find, for every word, the ``k`` other words with the highest cosine similarity.

    Args:
        embeddings: Array-like of shape ``(n_words, dim)``; row ``i`` is the
            vector of ``vocab[i]``.
        vocab: Sequence of words, indexable by row position.
        k: Number of neighbours to return per word. Capped at ``n_words - 1``;
            values below 1 yield empty neighbour lists.
        batch_size: Number of query rows scored at once. Only one
            ``(batch_size, n_words)`` similarity block is held in memory at a
            time instead of the full ``n_words x n_words`` matrix.

    Returns:
        Dict mapping each word to a list of its most similar words, most
        similar first. A word is never listed as its own neighbour. Ties are
        broken by lower row index.

    Raises:
        ValueError: If ``batch_size`` is below 1, ``embeddings`` is not 2-D,
            or its row count differs from ``len(vocab)``.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')
    if len(vocab) == 0:
        return {}

    vectors = np.asarray(embeddings)
    if vectors.ndim != 2:
        raise ValueError(f'embeddings must be 2-D, got shape {vectors.shape}')
    if vectors.shape[0] != len(vocab):
        raise ValueError(
            f'embeddings has {vectors.shape[0]} rows but vocab has {len(vocab)} words'
        )
    if not np.issubdtype(vectors.dtype, np.floating):
        vectors = vectors.astype(np.float64)

    n_words = vectors.shape[0]
    n_neighbors = max(0, min(k, n_words - 1))

    # Cosine similarity == dot product of L2-normalised rows. All-zero rows
    # keep a divisor of 1 so they stay zero rather than becoming NaN.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit_vectors = vectors / norms

    top_k_similar_words = {}
    for start in range(0, n_words, batch_size):
        stop = min(start + batch_size, n_words)
        similarities = unit_vectors[start:stop] @ unit_vectors.T

        # Exclude each word from its own ranking explicitly. Relying on
        # "rank 0 is always the word itself" breaks when another row has an
        # identical (or tied) vector.
        local_rows = np.arange(stop - start)
        similarities[local_rows, local_rows + start] = -np.inf

        # Stable sort on negated scores: descending similarity with a
        # deterministic order among ties.
        ranked = np.argsort(-similarities, axis=1, kind='stable')[:, :n_neighbors]

        for offset, neighbor_ids in enumerate(ranked):
            top_k_similar_words[vocab[start + offset]] = [
                vocab[idx] for idx in neighbor_ids
            ]

    return top_k_similar_words

# Precompute the five nearest neighbours of every vocabulary word.
top_k_similar = find_top_k_similar(
    embeddings=embeddings,
    vocab=word_labels,
    k=5,
)

In [40]:
# Spot-check a few query words against the precomputed neighbour table.
# The printed neighbours suggest the vocabulary holds stemmed forms, so a
# query must use the stored form or the lookup raises KeyError.
test_words = ['drink', 'food', 'boy', 'girl', 'tool']

for test_word in test_words:
    print(f'{test_word:7}\'s top k similar: {top_k_similar[test_word]}')

drink  's top k similar: ['bottl', 'alcohol', 'coffe', 'wine', 'drank']
food   's top k similar: ['meal', 'suppli', 'eat', 'meat', 'drink']
boy    's top k similar: ['girl', 'youngster', 'ager', 'someon', 'babi']
girl   's top k similar: ['boy', 'ager', 'pregnant', 'child', 'babi']
tool   's top k similar: ['techniqu', 'invent', 'fit', 'method', 'len']
