# Loading word embeddings

1. Run the first cell. When the upload button appears, upload `ngrams.txt`, `both_64.txt`, `lstm_64.txt`, and `img_aug_64.txt`.
2. Run the remaining cells in order. After about a minute of computing distances, interactive controls will appear at the bottom of the page.

The sliders control which pairs of signs are displayed. You can adjust the minimum and maximum similarity allowed for each model, and filter out rare signs by setting a minimum frequency. The default settings show signs with high LM similarity and relatively low image similarity (signs that seem to function similarly but don't look like variants).

In [None]:
import io
import numpy as np
from ipywidgets import interact, FloatSlider, IntSlider, widgets

# File picker used to bring the embedding and n-gram files into the notebook.
# `accept=''` leaves the accepted-type filter empty (it could hold values such
# as '.txt', '.pdf', 'image/*' or 'image/*,.pdf'); `multiple=True` lets all of
# the files be selected in a single upload.
upload = widgets.FileUpload(accept='', multiple=True)
display(upload)

In [None]:
def load_vec(emb_path, nmax=50000, uploads=None):
    """Parse an uploaded text embedding file into a matrix and lookup tables.

    The first line of the file is skipped (presumably a header line); every
    following non-blank line is read as ``<word> <v1> <v2> ...`` with
    space-separated fields.

    Args:
        emb_path: Name of the uploaded file to read.
        nmax: Maximum number of words to load; reading stops once reached.
        uploads: Uploaded files to read from. Defaults to ``upload.value`` of
            the notebook's upload widget. Accepts either a mapping
            ``{name: {"content": bytes}}`` (ipywidgets 7 layout) or a
            sequence of ``{"name": ..., "content": ...}`` entries
            (ipywidgets 8 layout).

    Returns:
        ``(embeddings, id2word, word2id)`` where ``embeddings`` stacks one row
        per word in file order and the two dicts map row index <-> word.

    Raises:
        KeyError: If no uploaded file is named ``emb_path``.
        AssertionError: If a word appears twice in the file.
        ValueError: If a line has no vector part, or the file contains no
            vectors at all (raised by ``np.vstack``).
    """
    if uploads is None:
        uploads = upload.value

    if isinstance(uploads, dict):
        content = uploads[emb_path]["content"]
    else:
        for entry in uploads:
            if entry["name"] == emb_path:
                content = entry["content"]
                break
        else:
            raise KeyError(emb_path)

    vectors = []
    word2id = {}
    with io.BytesIO(content) as f:
        # Skip the first line; the default keeps an empty file from raising
        # a bare StopIteration here.
        next(f, None)
        for raw_line in f:
            line = raw_line.decode('utf-8').rstrip()
            if not line:
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            word, vect = line.split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice : %s' % (word,)
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break

    id2word = {idx: word for word, idx in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

import json
from collections import defaultdict

# Names of the uploaded embedding files, one per model.
both_path = 'both_64.txt'  # LM with image inputs
img_path = 'img_aug_64.txt'  # Image classification
# (alternative image-classification embeddings: './embeddings/img_pre_64.txt')
lm_path = 'lstm_64.txt'  # LM with sign names

nmax = 50000  # maximum number of word embeddings to load

both_embeddings, both_id2word, both_word2id = load_vec(both_path, nmax)
img_embeddings, img_id2word, img_word2id = load_vec(img_path, nmax)
lm_embeddings, lm_id2word, lm_word2id = load_vec(lm_path, nmax)


def _uploaded_content(filename):
    """Return the raw content of the uploaded file called `filename`.

    ipywidgets 7 exposes `upload.value` as a dict keyed by file name, while
    ipywidgets 8 exposes a tuple of entries carrying a 'name' key; both
    layouts are handled. Raises KeyError if the file was not uploaded.
    """
    uploaded = upload.value
    if isinstance(uploaded, dict):
        return uploaded[filename]['content']
    for entry in uploaded:
        if entry['name'] == filename:
            return entry['content']
    raise KeyError(filename)


# Frequency table loaded from JSON; keys that are absent count as 0.
ngrams = defaultdict(int)
with io.BytesIO(_uploaded_content('ngrams.txt')) as fp:
    ngrams.update(json.load(fp))

print("Setup complete.")

# Comparing Embeddings

In [None]:
def cosine(word_emb, word2_emb):
    """Return the cosine similarity of two embedding vectors.

    Each vector is scaled to unit length and the dot product of the two unit
    vectors is returned.
    """
    unit_second = word2_emb / np.linalg.norm(word2_emb)
    unit_first = word_emb / np.linalg.norm(word_emb)
    return unit_second.dot(unit_first)

def get_similarity(word, word2, embeddings, word2id):
    """Return the cosine similarity between the embeddings of two words.

    `word2id` maps each word to its row index in `embeddings`; a word missing
    from `word2id` raises KeyError.
    """
    first_row = embeddings[word2id[word]]
    second_row = embeddings[word2id[word2]]
    return cosine(first_row, second_row)

# Words present in all three embedding vocabularies, in sorted order.
all_words = sorted(
    word for word in both_word2id
    if word in img_word2id and word in lm_word2id
)


def _unit_rows(embeddings, word2id):
    """Return the unit-length embedding rows for `all_words`, in that order."""
    row_ids = np.array([word2id[word] for word in all_words], dtype=np.intp)
    rows = embeddings[row_ids]
    return rows / np.linalg.norm(rows, axis=1, keepdims=True)


# Normalize every vector once up front instead of once per pair; the cosine
# similarity of two words is then the dot product of their unit rows.
both_unit = _unit_rows(both_embeddings, both_word2id)
img_unit = _unit_rows(img_embeddings, img_word2id)
lm_unit = _unit_rows(lm_embeddings, lm_word2id)

# One (both_similarity, lm_similarity, img_similarity, word1, word2) tuple per
# unordered pair of distinct words, with word1 sorting before word2.
pairs = []
for i, word1 in enumerate(all_words):
    print('Computing distances %.02f%%'%(100*i/len(all_words)),end='\r')
    later_words = all_words[i + 1:]

    # Similarities of word1 against every later word, one product per model.
    both_sims = both_unit[i + 1:].dot(both_unit[i])
    lm_sims = lm_unit[i + 1:].dot(lm_unit[i])
    img_sims = img_unit[i + 1:].dot(img_unit[i])

    pairs.extend(
        (both_similarity, lm_similarity, img_similarity, word1, word2)
        for both_similarity, lm_similarity, img_similarity, word2
        in zip(both_sims, lm_sims, img_sims, later_words)
    )
print('Finished computing distances')

In [None]:
def compare_signs(
    min_sim_lm, max_sim_lm,
    min_sim_img, max_sim_img,
    min_sim_both, max_sim_both,
    min_freq,
    sort_key):
    """Print the sign pairs whose similarities fall inside the given ranges.

    Reads the module-level `pairs` list and `ngrams` frequency table.

    Args:
        min_sim_lm, max_sim_lm: Inclusive bounds on the LM similarity.
        min_sim_img, max_sim_img: Inclusive bounds on the image similarity.
        min_sim_both, max_sim_both: Inclusive bounds on the combined-model
            similarity.
        min_freq: Both signs of a pair must have an n-gram count of at least
            this value.
        sort_key: Index of the similarity to sort by, descending
            (0 = combined, 1 = LM, 2 = image).

    At most `output_size` rows are printed; the limit is fixed here rather
    than exposed as a slider so that updates stay responsive.
    """
    output_size = 500 # Limit this so updates aren't terribly slow when user adjusts sliders

    print("{0}\t{1}\t\t{2}".format(" COMBINED", "    LM", "  IMAGE"))

    # Keep the pairs that satisfy the frequency and similarity constraints,
    # carrying the counts along so they are looked up only once per pair.
    result = []
    for both_sim, lm_sim, img_sim, word1, word2 in pairs:
        # Counts are looked up with "-" replaced by "~" in the sign name
        # (presumably the spelling used by the n-gram keys).
        count1 = ngrams[word1.replace("-","~")]
        count2 = ngrams[word2.replace("-","~")]
        if count1 < min_freq or count2 < min_freq:
            continue
        if (min_sim_img <= img_sim <= max_sim_img
                and min_sim_lm <= lm_sim <= max_sim_lm
                and min_sim_both <= both_sim <= max_sim_both):
            result.append( (both_sim, lm_sim, img_sim, word1, word2, count1, count2) )

    # Sort by the chosen similarity and show exactly the top `output_size`.
    result.sort(key=lambda row: row[sort_key], reverse=True)
    for both_sim, lm_sim, img_sim, word1, word2, count1, count2 in result[:output_size]:
        print("{0:+f}\t{1:+f}\t{2:+f}\t{3} ({5}) \t{4} ({6})".format(
            both_sim, lm_sim, img_sim, word1, word2, count1, count2))
            
def _similarity_slider(initial):
    """Build a slider over the similarity range [-1, 1] in steps of 0.01.

    `continuous_update=False` is set so the value is only pushed on release.
    """
    return FloatSlider(min=-1, max=1, step=0.01, value=initial, continuous_update=False)


# Wire the sliders to compare_signs. The defaults select pairs with high LM
# similarity (>= 0.8) and image similarity of at most 0.4; the combined-model
# range is left fully open. The output_size slider is disabled because
# compare_signs uses a fixed output size.
interact(
    compare_signs,
    min_sim_lm=_similarity_slider(0.8),
    max_sim_lm=_similarity_slider(1),
    min_sim_img=_similarity_slider(-1),
    max_sim_img=_similarity_slider(0.4),
    min_sim_both=_similarity_slider(-1),
    max_sim_both=_similarity_slider(1),
    min_freq=IntSlider(min=0, max=1000, step=1, value=5, continuous_update=False),
    sort_key=[('both',0), ('lm',1), ('img',2)],
)