In [2]:
import torch
import torch.nn.functional as F

In [5]:
# Load the checkpoint and place it on `device`.
# NOTE(review): weights_only=False unpickles arbitrary Python objects and can
# execute code embedded in the file -- only load checkpoints you trust.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to a configurable models directory.
model_path = '/root/autodl-tmp/Word2Vec-Implementation-with-PyTorch/models/skipgram-5e.pth'
device = 'cpu'
# map_location follows `device`, so the model and the index tensors built with
# `.to(device)` in later cells always end up on the same device.
model = torch.load(model_path, weights_only=False, map_location=torch.device(device))

In [6]:
from utils.dataloader import get_dataloader_and_tokenizer

# Build the dataloader and tokenizer through the project helper.
# NOTE(review): the positional arguments 1, 4, 50 are unnamed here -- check
# utils.dataloader for their meaning and consider passing them as keywords.
dataloader, tokenizer = get_dataloader_and_tokenizer(1, 4, 50)

Load dataset


In [7]:
# Vocabulary size reported by the tokenizer (used as the loop bound below).
tokenizer.get_vocab_size()

4122

In [9]:
# Mean cosine similarity between each token's V embedding and its own W
# embedding, taken over the whole vocabulary.
length = tokenizer.get_vocab_size()
with torch.no_grad():  # pure inspection, no autograd graph needed
    token_ids = torch.arange(length).to(device)
    # One batched lookup per table instead of one lookup per token.
    # assumes model.V / model.W accept a 1-D index tensor (as nn.Embedding does) -- TODO confirm
    per_token_sim = F.cosine_similarity(model.V(token_ids), model.W(token_ids), dim=1)
# `avg` keeps its original meaning (the SUM of the similarities); summing in
# float64 mirrors the Python-float accumulation of the previous loop.
avg = per_token_sim.double().sum().item()
avg / length

0.2127327043557479

In [None]:
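# Conclusion 1: V[a] !~ W[a] -- a token's V embedding and its own W embedding are not similar (mean cosine similarity is only about 0.21).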

In [16]:
def _embedding_cosine(embed1, embed2, token1, token2):
    """Cosine similarity between ``embed1[token1]`` and ``embed2[token2]``.

    Shared implementation behind ``similarityVW``, ``similarityVV`` and
    ``similarityWW``; only the choice of embedding tables differs between them.

    Args:
        embed1: Callable embedding table applied to ``token1``'s index
            (``model.V`` or ``model.W``).
        embed2: Callable embedding table applied to ``token2``'s index.
        token1: Token string, resolved with the global ``tokenizer``.
        token2: Token string, resolved with the global ``tokenizer``.

    Returns:
        The cosine similarity as a Python float.
    """
    idx1 = torch.tensor(tokenizer.token_to_idx(token1)).to(device)
    idx2 = torch.tensor(tokenizer.token_to_idx(token2)).to(device)
    # Pure inspection: no autograd graph is needed for a similarity lookup.
    with torch.no_grad():
        # unsqueeze(0) adds a leading axis so that cosine_similarity's default
        # dim=1 reduces over the embedding axis.
        # assumes each lookup returns a 1-D vector -- TODO confirm
        vec1 = embed1(idx1).unsqueeze(0)
        vec2 = embed2(idx2).unsqueeze(0)
        return F.cosine_similarity(vec1, vec2).item()


def similarityVW(token1, token2):
    """Cosine similarity between token1's V embedding and token2's W embedding.

    NOTE(review): V and W are presumably the input (centre-word) and output
    (context) tables of the model -- confirm against the model definition.
    """
    return _embedding_cosine(model.V, model.W, token1, token2)


def similarityVV(token1, token2):
    """Cosine similarity between the V embeddings of token1 and token2."""
    return _embedding_cosine(model.V, model.V, token1, token2)


def similarityWW(token1, token2):
    """Cosine similarity between the W embeddings of token1 and token2."""
    return _embedding_cosine(model.W, model.W, token1, token2)

In [17]:
def most_similar_tokens(token, mode=0, topn=15):
    """Return the ``topn`` vocabulary tokens most similar to ``token``.

    Args:
        token: Query token string, resolved with the global ``tokenizer``.
        mode: Which embedding tables are compared.
            ``0``: V[token] against V[candidate];
            ``1``: V[token] against W[candidate];
            any other value: W[token] against W[candidate].
        topn: Maximum number of neighbours to return.

    Returns:
        A list of ``(token_string, cosine_similarity)`` tuples sorted by
        decreasing similarity. The query token itself is excluded.
    """
    query_idx = tokenizer.token_to_idx(token)

    # Same dispatch as the VV / VW / WW helpers: the query side uses V for
    # modes 0 and 1, the candidate side uses V only for mode 0.
    query_table = model.V if mode in (0, 1) else model.W
    candidate_table = model.V if mode == 0 else model.W

    with torch.no_grad():  # pure inspection, no autograd graph needed
        query_vec = query_table(torch.tensor(query_idx).to(device)).unsqueeze(0)
        all_ids = torch.arange(tokenizer.get_vocab_size()).to(device)
        # One batched computation instead of one tensor round trip per token.
        # assumes the tables accept a 1-D index tensor (as nn.Embedding does) -- TODO confirm
        scores = F.cosine_similarity(query_vec, candidate_table(all_ids), dim=1).tolist()

    # A dict keyed by token string, filled in index order, so a repeated token
    # string keeps its first position and last score.
    sims = {
        tokenizer.idx_to_token(idx): score
        for idx, score in enumerate(scores)
        if idx != query_idx
    }
    # sorted() is stable, so ties keep vocabulary order.
    return sorted(sims.items(), key=lambda item: item[1], reverse=True)[:topn]

In [18]:
# Neighbours of 'king' under each comparison mode: 0 (V vs V), 1 (V vs W), 2 (W vs W).
tuple(most_similar_tokens('king', mode=m) for m in (0, 1, 2))

([('emperor', 0.7604710459709167),
  ('monarch', 0.7218049168586731),
  ('reign', 0.7002407908439636),
  ('lord', 0.6984412670135498),
  ('son', 0.6885751485824585),
  ('successor', 0.6871017813682556),
  ('queen', 0.6797590255737305),
  ('frederick', 0.6781752705574036),
  ('brother', 0.6703252196311951),
  ('iv', 0.660991907119751),
  ('odaenathus', 0.6607653498649597),
  ('pope', 0.6564509868621826),
  ('archbishop', 0.6424189209938049),
  ('bishop', 0.6360291242599487),
  ('founder', 0.6351824998855591)],
 [("'s", 0.6255903244018555),
  ('of', 0.6003551483154297),
  ('the', 0.5648436546325684),
  ('to', 0.5593847632408142),
  ('his', 0.5588068962097168),
  ('<unk>', 0.5560933351516724),
  (',', 0.55524080991745),
  ('was', 0.5482189655303955),
  ('and', 0.5375607013702393),
  ('"', 0.5242669582366943),
  ('as', 0.5165198445320129),
  ('.', 0.5160472989082336),
  ('in', 0.5139936208724976),
  ('who', 0.5129943490028381),
  ('he', 0.5060199499130249)],
 [('henry', 0.5892033576965332)

In [30]:
# How many of the top-n V-vs-W neighbours (mode=1) do 'king' and 'monarch' share?
n = 300
a = {tok for tok, _ in most_similar_tokens('king', mode=1, topn=n)}
b = {tok for tok, _ in most_similar_tokens('monarch', mode=1, topn=n)}
len(a.intersection(b))

231

In [44]:
# Top-15 neighbours of 'time' with the default mode=0 (V embeddings compared with V embeddings).
most_similar_tokens('time')

[('plane', 0.612949013710022),
 ('remainder', 0.5949611663818359),
 ('wearing', 0.5851904153823853),
 ('point', 0.5847347378730774),
 ('1929', 0.5814406871795654),
 ('excavations', 0.5722891688346863),
 ('start', 0.5668984651565552),
 ('battle', 0.5657597184181213),
 ('pitched', 0.5612479448318481),
 ('appearing', 0.54886794090271),
 ('chorus', 0.5463493466377258),
 ('magadheera', 0.544417142868042),
 ('job', 0.542961061000824),
 ('period', 0.5426279902458191),
 ('loss', 0.5369955897331238)]

In [43]:
# Cosine similarity between the V embeddings of 'england' and 'english'.
similarityVV('england', 'english')

0.3902110457420349