# Make predictions and get results

## Predict on test set

In [1]:
from allennlp.models.archival import archive_model
import pickle
import numpy as np
#archive_model("models/bigram_embedder_feedforw_l1_tanh")

  from ._conv import register_converters as _register_converters


In [2]:
from allennlp.models.archival import load_archive
from allennlp.service.predictors import Predictor

# required so that our custom model + predictor + dataset reader
# will be registered by name
import bigram_embedder

# with open("data/step5_holduot_splits/train_all_shuffled.pkl", 'rb') as f:
#     train_zip = pickle.load(f)
# with open("data/step5_holduot_splits/dev_all_shuffled.pkl", 'rb') as f:
#     dev_zip = pickle.load(f)
with open("data/step5_holduot_splits/test_all_shuffled.pkl", 'rb') as f:
    test_zip = pickle.load(f)
    

archive = load_archive('models/bigram_embedder_feedforw_l1_tanh/model.tar.gz')
predictor = Predictor.from_archive(archive, 'bigram-embedder')


In [3]:
with open("data/step5_holduot_splits/test_all_shuffled.tsv") as f:
    X_test = f.read().split('\n')

In [4]:
bigram_types_test = []
bigram_vecs_test = []
for l in X_test:
    bt, bv, _, _ = l.split('\t')
    bigram_types_test.append(bt)
    bigram_vecs_test.append([float(v) for v in bv.split()])

In [5]:
bigram_vecs_test_hat = []
for ex in X_test:
    ex = predictor.load_line(ex)
    bigram_vecs_test_hat.append(predictor.predict_json(ex).get("bigram_vecs_hat"))

## Compute error

In [6]:
import torch

In [7]:
def compute_smooth_l1_error(y, y_hat):
    y_var = torch.autograd.Variable(torch.FloatTensor(y))
    y_hat_var = torch.autograd.Variable(torch.FloatTensor(y_hat))
    
    if y_var.size() != y_hat_var.size():
        smooth_l1_error = torch.nn.SmoothL1Loss(reduce=False)
        y_var = y_var.expand(y_hat_var.size())
        return smooth_l1_error.forward(y_var, y_hat_var).sum(1).data

    else:
        smooth_l1_error = torch.nn.SmoothL1Loss()
        return smooth_l1_error.forward(y_var, y_hat_var).data[0]    

## Comput accuracy

In [8]:
def get_closest(target, vecs, n):
    sim_matrix = compute_smooth_l1_error(target, vecs)
    nvecs, nids = sim_matrix.sort(0)
    return nvecs[0:n], nids[0:n]

In [9]:
# vars
PATH_DATA_STEP3_FOLDER = "data/step3_fasttext_vectors/"
PATH_FASTTEXT_VECTORS = PATH_DATA_STEP3_FOLDER + "vectors_likelihood_ratio-100-0.05.vec"
SUFFIX = PATH_FASTTEXT_VECTORS.split('vectors_')[1][0:-4]
PATH_DATA_OUTPUT_FOLDER = "data/step4_wordpairs_bigrams_vec_data/" 

In [10]:
def get_all_ngram_vecs():
    # load fasttext .vec file containing unigrams and bigrams
    with open(PATH_FASTTEXT_VECTORS, 'r') as f:
        vectors_all = f.readlines()
        
    num_vectors_all, dim = vectors_all[0].split()
    num_vectors_all, dim = int(num_vectors_all), int(dim)
    
    print(num_vectors_all)
    print(dim)
    
    del vectors_all[0]
    
    ngram_types_all = []
    ngram_vectors_all = []
    for l in vectors_all:
        l = l.split()
        ngram_types_all.append(l[0])
        ngram_vectors_all.append([float(v) for v in l[1:]])
    return ngram_types_all, ngram_vectors_all

In [11]:
%%time
#all_bigrams, all_vecs, _ = zip(*train_zip + dev_zip + test_zip)
all_bigrams, all_vecs = get_all_ngram_vecs()

808968
100
CPU times: user 31 s, sys: 1.63 s, total: 32.6 s
Wall time: 32.8 s


In [12]:
%%time
NUM_NEIGHBOURS = 100
nearest_vecs_test = []
nearest_ids_test = []
for i, vec_hat in enumerate(bigram_vecs_test_hat):
    nvecs, nids = get_closest(vec_hat, all_vecs, NUM_NEIGHBOURS)
    nearest_vecs_test.append(nvecs)
    nearest_ids_test.append(nids)
    if i % 100 == 0:
        print(i)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
CPU times: user 46min 51s, sys: 8min 1s, total: 54min 53s
Wall time: 54min 57s


In [13]:
predicted_bigrams = []
for i, ids in enumerate(nearest_ids_test):
    predicted_bigram_list = [all_bigrams[i] for i in ids]
    predicted_bigrams.append(predicted_bigram_list)

In [43]:
N = 5
corrects = []
corrects_groundtruth = []
errors = []
errors_groundtruth = [] 
for i, predicted_bigrams_list in enumerate(predicted_bigrams):
    if bigram_types_test[i] in predicted_bigrams_list[0:N]:
        corrects.append(predicted_bigrams_list)
        corrects_groundtruth.append(bigram_types_test[i])
    else:
        errors.append(predicted_bigrams_list)
        errors_groundtruth.append(bigram_types_test[i])
        
print('Top', N, 'accuaracy on test set:', len(corrects) / len(predicted_bigrams))

Top 5 accuaracy on test set: 0.7836


## Error analysis

In [15]:
import random

In [16]:
n = 10

In [17]:
random_n_ids = [random.randrange(0, len(errors)) for i in range(n)]

for ind in random_n_ids:
    ngrams = errors[ind]
    print(ind, '|', errors_groundtruth[ind], '->', ngrams[0:5])

318 | the_aqua -> ['the_ole', 'the_ea', 'the_pandemonium', 'the_brat', 'the_splash']
298 | page_don -> ['page_and', 'page_said', 'whatlinkshere', 'taylor_page', 'page']
66 | now_made -> ['now', 'now_making', 'previously_made', 'already_included', 'made_also']
255 | january_is -> ['december_is', 'february_is', 'november_is', 'september_is', 'october_is']
432 | of_accepted -> ['immediately_accepted', 'finally_accepted', 'following_accepted', 'which_accepted', 'accepted_with']
402 | party_six -> ['party_three', 'party_seven', 'party_five', 'three_party', 'party_two']
154 | by_mikhail -> ['and_leonid', 'chervenkov', 'prutkov', 'vladimir_and', 'morozevich']
201 | kb_to -> ['four_kib', 'seven_kb', 'mib_of', 'kb_and', 'mb_and']
152 | produced_when -> ['eventually_produced', 'produced_before', 'produced_that', 'produced_although', 'produced_after']
459 | supplies_a -> ['supplying_a', 'supplying_the', 'which_supplies', 'the_supplies', 'supplies_the']


In [19]:
#whos

In [19]:
# with open('data/nearest_vecs_test' + SUFFIX + '.pkl', 'wb') as f:
#     pickle.dump(nearest_vecs_test, f)
    
# with open('data/nearest_ids_test' + SUFFIX + '.pkl', 'wb') as f:
#     pickle.dump(nearest_ids_test, f)

MemoryError: 

## Make predictions

In [64]:
def pred_example(w1, w2, N=5, return_vec=False):
    vec1 = all_vecs[all_bigrams.index(w1)]
    vec2 = all_vecs[all_bigrams.index(w2)]
    x = {'w1_vec_str': ' '.join([str(v) for v in vec1]), 'w2_vec_str': ' '.join([str(v) for v in vec2])}
    bi_vec = predictor.predict_json(x)['bigram_vecs_hat']
    if return_vec:
        return bi_vec
    _, ids_hat = get_closest(bi_vec, all_vecs, N)
    bigrams_hat = [all_bigrams[i] for i in ids_hat]
    return bigrams_hat

In [65]:
pair = 'contemporary methods'
w1, w2 = pair.split(' ')
pred_example(w1, w2)

['modern_methods',
 'traditional_methods',
 'conventional_methods',
 'historical_method',
 'modern_techniques']

In [66]:
pair = 'rephrase me'
w1, w2 = pair.split(' ')
pred_example(w1, w2)

['rephrase', 'you_something', 'damn_thing', 'says_something', 'say_something']

In [67]:
pair = 'say something'
w1, w2 = pair.split(' ')
pred_example(w1, w2)

['say_something',
 'know_something',
 'us_something',
 'that_something',
 'says_something']

In [68]:
pair = 'domain name'
w1, w2 = pair.split(' ')
pred_example(w1, w2)

['name', 'three_domain', 'temporary_name', 'appropriate_name', 'name_now']

In [69]:
pair = 'thug guy'
w1, w2 = pair.split(' ')
pred_example(w1, w2)

['thug', 'dirty_bastard', 'dumb_blonde', 'marty_and', 'little_guy']

In [70]:
pair = 'state_of the_art'
w1, w2 = pair.split(' ')
pred_example(w1, w2)

['trance_state', 'the_art', 'great_state', 'state_history', 'state_historical']

In [71]:
pair = 'state_of art'
w1, w2 = pair.split(' ')
pred_example(w1, w2)

['trance_state',
 'art',
 'western_art',
 'intellectual_history',
 'contemporary_culture']

In [72]:
pair = 'state of_art'
w1, w2 = pair.split(' ')
pred_example(w1, w2)

['western_art',
 'cultural_institutions',
 'that_institution',
 'national_art',
 'institution_from']

## Perform Ngram Arithmetics

In [76]:
def substract_add(n1, n2, n3, N=5):
    try:
        vec1 = np.array(all_vecs[all_bigrams.index(n1)])
    except(ValueError):
        print('OOV ngram; estimating vec for it')
        vec1 = pred_example(n1.split('_')[0], n1.split('_')[1], return_vec=True)
        
    vec2 = np.array(all_vecs[all_bigrams.index(n2)])
    vec3 = np.array(all_vecs[all_bigrams.index(n3)])
    #x = {'w1_vec_str': ' '.join([str(v) for v in vec1]), 'w2_vec_str': ' '.join([str(v) for v in vec2])}
    v_res = vec1 - vec2 + vec3
    # bi_vec = predictor.predict_json(x)['bigram_vecs_hat']
    _, ids_hat = get_closest(v_res, all_vecs, N)
    bigrams_hat = [all_bigrams[i] for i in ids_hat]
    return bigrams_hat

In [79]:
substract_add('hard_job', 'hard', 'easy', 10)

OOV ngram; estimating vec for it


['easy',
 'quick_way',
 'good_job',
 'easy_way',
 'getting_ready',
 'extra_effort',
 'little_need',
 'easygoing',
 'work_just',
 'lot_easier']

In [82]:
substract_add('like_it', 'like', 'hate', 10)

['hate_you',
 'hate',
 'hate_me',
 'fuckin',
 'i_hate',
 'hate_it',
 'wrong_you',
 'me_feel',
 'me_wrong',
 'you_something']

In [83]:
substract_add('like_you', 'you', 'me', 10)

['me',
 'like_me',
 'like_you',
 'me_now',
 'me_and',
 'like_i',
 'me_like',
 'brant_me',
 'me_away',
 'and_me']

In [94]:
substract_add('rap_music', 'rhymes', 'guitar', 10)

['guitar_music',
 'rock_guitar',
 'guitar',
 's_guitar',
 'guitar_acoustic',
 'guitar_producer',
 'guitar_dave',
 'guitar_guitar',
 'guitar_steve',
 'guitar_percussion']

In [95]:
substract_add('rock_music', 'guitar', 'rhymes', 10)

['rhymes',
 'rhymes_and',
 'rhymes_in',
 'it_rhymes',
 'pop_cultural',
 'punk_culture',
 'hip_hop',
 'alcopop',
 'hop_culture',
 'rhyme']

In [118]:
substract_add('london_city', 'city', 'country', 10)

['country_australia',
 'kingdom_london',
 'london_most',
 'london_all',
 'countrylink',
 'country',
 'london_also',
 'london_region',
 'london_free',
 'england_international']

In [119]:
substract_add('london_city', 'london', 'paris', 10)

['city_paris',
 'paris',
 'paris_paris',
 'paris_metro',
 'paris_commune',
 'paris_l',
 'paris_la',
 'paris_dakar',
 'ra_paris',
 'paris_new']

In [120]:
substract_add('london_city', 'england', 'france', 10)

['paris_metro',
 'city_paris',
 'paris_m',
 'paris_commune',
 'paris_cdg',
 'paris_l',
 'centrale_paris',
 'paris',
 'paris_dakar',
 'paris_lyon']