In [1]:
# read in all needed data

def get_test_data(filename):
    test_data = []
    with open(filename) as f:
        for line in f:
            line = line.split('\t')
            test_word = line[0]
            sentence_id = line[1]
            test_data.append([test_word, sentence_id])
    return(test_data)

In [2]:
def get_candidates(filename):
    with open(filename) as f:
        candidate_dict = {}
        for line in f:
            line = line.split('::')
            word = line[0]
            candidates = line[1].split(';')
            candidate_dict[word] = candidates
        return candidate_dict

In [3]:
import torch.nn as nn
# needs to be here to properly load the model - same as in Practical2_skipgram.ipynb
class skipgram(nn.Module):
    def __init__(self, vocab_size, emb_dimension):
        super(skipgram, self).__init__()
        self.vocab_size = vocab_size
        self.emb_dimension = emb_dimension
        # start with random embeddings
#         self.W_embeddings = torch.randn((vocab_size, emb_dimension), requires_grad=True)
#         self.C_embeddings = torch.randn((vocab_size, emb_dimension), requires_grad=True)
        self.W_embeddings = nn.Embedding(vocab_size, emb_dimension)
        self.C_embeddings = nn.Embedding(vocab_size, emb_dimension)
        
    def forward(self, word, pos_context, neg_contexts):
        # word, pos_context and neg_contexts are integers, so can just pick that row from W and C matrices
        word_embed = self.W_embeddings(word)
        pos_embed = self.C_embeddings(pos_context)
        neg_embeds = self.C_embeddings(neg_contexts)
        
        pos_similarity = torch.mul(word_embed, pos_embed).squeeze()
        pos_sum = torch.sum(pos_similarity, dim=1)
        pos_logsig = nn.functional.logsigmoid(pos_sum)
        pos_score = sum(pos_logsig)
        
        neg_similarity = torch.bmm(neg_embeds, word_embed.unsqueeze(2)).squeeze()
        neg_sum = torch.sum(neg_similarity, dim=1)
        neg_logsig = nn.functional.logsigmoid(-1 * neg_sum)
        neg_score = sum(neg_logsig)
        
        loss = -(pos_score+neg_score)
        
        return loss

In [4]:
import pickle
import torch

test_data = get_test_data('lst/lst_test.preprocessed')

candidates = get_candidates('lst/lst.gold.candidates')

with open('w2i_skipgram.pkl', 'rb') as f:
    w2i = pickle.load(f)

with open('i2w_skipgram.pkl', 'rb') as f:
    i2w = pickle.load(f)
    


In [5]:
use_cuda = torch.cuda.is_available()

if use_cuda: # load model with gpu (as it was trained)
    skipgram_model = torch.load('skipgram.pt')
else: # convert to cpu:
    skipgram_model = torch.load('skipgram.pt',  map_location='cpu')

In [6]:
# preprocessing - take out any words/candidates aren't present in original corpus (w2i check)  
orig_test_data = test_data[:]
for [word, sentence_id] in test_data[:]:
    word_nopos = word[:-2]
    if word_nopos not in w2i:
        test_data.remove([word, sentence_id])
    for candidate in candidates[word][:]:
        if candidate not in w2i:
            candidates[word].remove(candidate)

In [7]:
# convert words to indexes

word_is = [w2i[word[:-2]] for word, _ in test_data]
word_is = torch.LongTensor(word_is)

can_is = {}
for [word, _] in test_data[:]:
    can_i = [w2i[can] for can in candidates[word]]
    word_i = w2i[word[:-2]]
    can_is[word_i] = torch.LongTensor(can_i)

In [8]:
def get_ranking(word, candidates):

    cos = torch.nn.CosineSimilarity(dim=0)
    
    if use_cuda:
        word = word.cuda()

    word_embedding = skipgram_model.W_embeddings(word)
    can_sims = []
    for can in candidates:
        if use_cuda:
            can = can.cuda()
        can_embedding = skipgram_model.W_embeddings(can)
        sim = cos(word_embedding, can_embedding)
        can_sims.append([can, sim])
            
    can_sims = sorted(can_sims, key = lambda x: x[1])
    return(can_sims)

In [9]:
results = {}
for i, [word, sentence_id] in enumerate(test_data):
    word_i = word_is[i]
    cans = can_is[word_i.item()]
    rank = get_ranking(word_i, cans)
    words_scores = []
    for [candidate, score] in rank:
        can_word = i2w[candidate.item()]
        score = score.item()
        words_scores.append([can_word, score])
    results[word] = words_scores

In [10]:
with open('skipgram_predictions', 'w') as f:
    for [word, sentence_id] in orig_test_data: # lst_gap needs all words, also once we do not have data for
        f.write('#RANKED\t')
        f.write(word + ' ')
        f.write(sentence_id)
        if word in results:
            for [candidate, score] in results[word]:
                f.write('\t' + candidate + ' ' + str(score))
        f.write('\n')

In [11]:
%run lst/lst_gap.py lst/lst_test.gold skipgram_predictions skipgram_out no-mwe


MEAN_GAP	0.2577320636314508

