# Interesting properties of embeddings

* We will look at some of the properties w2v embeddings have
* Test them ourselves in code

In [1]:
#in case you forgot, this is how one can load the embeddings

from gensim.models import KeyedVectors

# load embedding model from a file
# binary: True if saved as binary file (.bin), False if saved as text file (.vectors or .txt for example)
# limit: How many words to read from the model
model_english=KeyedVectors.load_word2vec_format("/home/bio/gigaword-and-wikipedia.bin", binary=True, limit=100000) #koulun koneella home/bio
model_finnish=KeyedVectors.load_word2vec_format("/home/bio/pb34_wf_200_v2_skgram.bin", binary=True, limit=100000)

In [None]:
print("Most similar words for 'man':")
print(model_english.most_similar("man",topn=10))
print()
print("Most similar words for 'woman':")
print(model_finnish.most_similar("woman",topn=10))
print()

# Mapping spaces

* One of the more famous properties of the embeddings
* Learn a **linear** mapping from one language to another
* Can we replicate this?
* Learn a network with a single dense output layer
* English vector in -- Finnish vector out

## Training data

* Need English-Finnish pairs of words to train on
* ...google translate, maybe?
* Googling around finds this https://github.com/ssut/py-googletrans
* ...unofficial API, will get your IP banned if overused, so let's be careful!
* official API needs registration, etc.

`pip3 install --user googletrans` #install the library

In [None]:
from googletrans import Translator
translator=Translator()
translations=translator.translate(["locomotive","milk"],src="en",dest="fi")

for t in translations:
    print("origin=",t.origin,"text=",t.text)


* Seems to work fine!
* Let's grab some translations
* The docs say "max 16000 characters per request"
* We need to translate some hundreds of words at a time

In [2]:
print("English vocab",len(model_english.vocab))
print(model_english.vocab.__class__)
print(model_english.vocab["car"])
#We need a list, in order of frequency
words=sorted(model_english.vocab.items(),key=lambda word_dim:word_dim[1].count,reverse=True)
print(words[:5])
words_freq_sorted=[w for w,_ in words]
print("Freq sorted",words_freq_sorted[:5])

English vocab 100000
<class 'dict'>
Vocab(count:99365, index:635)
[('</s>', <gensim.models.keyedvectors.Vocab object at 0x7f9e983e7da0>), (',', <gensim.models.keyedvectors.Vocab object at 0x7f9e983e7e80>), ('the', <gensim.models.keyedvectors.Vocab object at 0x7f9e98404198>), ('.', <gensim.models.keyedvectors.Vocab object at 0x7f9e984041d0>), ('of', <gensim.models.keyedvectors.Vocab object at 0x7f9e6fe60c18>)]
Freq sorted ['</s>', ',', 'the', '.', 'of']


Now we have things like `</s>` and `.` in the vocabulary, those we don't want to translate

In [3]:
import re
english_word_re=re.compile("^[a-zA-Z]+$") #about as stupid simplification as you can get! #^=after[x]
final_word_list=[]
for w in words_freq_sorted:
    if english_word_re.match(w):
        final_word_list.append(w)
print(final_word_list[:20])

['the', 'of', 'to', 'and', 'in', 'a', 'for', 'The', 'is', 'that', 'was', 'on', 'with', 'said', 'as', 'by', 'at', 'from', 'he', 'his']


In [4]:
import re

#same thing as above, nicely packed into a function
def clean_vocab(gensim_model,regexp):
    words=sorted(gensim_model.vocab.items(),key=lambda word_dim:word_dim[1].count,reverse=True)
    words_freq_sorted=[w for w,_ in words]
    word_re=re.compile(regexp)
    final_word_list=[]
    for w in words_freq_sorted:
        if word_re.match(w):
            final_word_list.append(w)
    return final_word_list

finnish_vocab=clean_vocab(model_finnish,'^[a-zA-ZäöåÖÄÅ]+$')
english_vocab=clean_vocab(model_english,'^[a-zA-Z]+$')
print("Final Finnish",finnish_vocab[:15],"...",finnish_vocab[2000:2015])
print("Final English",english_vocab[:15],"...",english_vocab[2000:2015])


Final Finnish ['ja', 'on', 'ei', 'että', 'se', 'oli', 'mutta', 'tai', 'kun', 'myös', 'ovat', 'ole', 'niin', 'jos', 'kuin'] ... ['seurakunnan', 'selvä', 'tulleet', 'seuraavaan', 'sijasta', 'kuollut', 'I', 'asioihin', 'loput', 'luona', 'talven', 'per', 'ihanan', 'palvelua', 'tietokoneen']
Final English ['the', 'of', 'to', 'and', 'in', 'a', 'for', 'The', 'is', 'that', 'was', 'on', 'with', 'said', 'as'] ... ['Greece', 'rally', 'democracy', 'revenue', 'add', 'criticism', 'offices', 'Hussein', 'kids', 'relief', 'promised', 'advance', 'talking', 'boost', 'dispute']


In [None]:
#Little test (run through the google translate)
import time
def translate(words,src,dest,batch_size=1000):
    result=[] #[("dog","koira"),....]
    translator=Translator()
    for idx in range(0,len(words),batch_size):
        batch=words[idx:idx+batch_size]
        try:
            translations=translator.translate(batch,src=src,dest=dest)
            for t in translations:
                result.append((t.origin,t.text))
            time.sleep(0.2) #sleep between batches
            print(src,"->",dest,"batch at",idx,"....OK")
        except: #we end here, if the lines between try ... except throw an error
            print(src,"->",dest,"batch at",idx,"....FAILED")
            time.sleep(61) #sleep a little longer so Google is not angry
            print(src,"->",dest,"...RESTARTING")
            
    return result

x=translate(english_vocab[:50],"en","fi",20) # a small test

In [None]:
print(x)

* looks okay
* let's run this and save the result for later use, so we don't get banned

In [None]:
import json
en_fi=translate(english_vocab,"en","fi",batch_size=150)
with open("en_fi_transl.json","wt") as f:
    json.dump(en_fi,f)
fi_en=translate(finnish_vocab,"fi","en",batch_size=150)
with open("fi_en_transl.json","wt") as f:
    json.dump(fi_en,f)


* well, we got banned :D
* Let's just translate as text files, in the google translate interface
* (quality time manually feeding these into Google translate --- I could have used the official API :)
* ...but now it's done, so who cares

In [5]:
#dump 10K words at a time into a file, which can be fed to google translate
def build_files(words,fname,batch_size):
    for idx in range(0,len(words),batch_size):
        batch=words[idx:idx+batch_size]
        with open("trdata/{}_batch_{}.txt".format(fname,idx),"wt") as f:
            print("\n".join(batch),file=f)

build_files(english_vocab,"en-fi-source",10000)
build_files(finnish_vocab,"fi-en-source",10000)


* I built manually four files like this:

In [6]:
%%bash

ls trdata/fien_* trdata/enfi_*
wc -l trdata/fien_* trdata/enfi_*
echo "FI -> EN"
paste trdata/fien_source_all.txt trdata/fien_target_all.txt  | head -n 10 #(linuxissa) paste laittaa rivit vierekkäin
echo "EN -> FI"
paste trdata/enfi_source_all.txt trdata/enfi_target_all.txt  | head -n 10



trdata/enfi_source_all.txt
trdata/enfi_target_all.txt
trdata/fien_source_all.txt
trdata/fien_target_all.txt
  95275 trdata/fien_source_all.txt
  95275 trdata/fien_target_all.txt
  83618 trdata/enfi_source_all.txt
  83618 trdata/enfi_target_all.txt
 357786 total
FI -> EN
ja	and
on	is
ei	No
että	that
se	it
oli	was
mutta	but
tai	or
kun	when
myös	also
EN -> FI
the	
of	of
to	että
and	ja
in	sisään
a	
for	varten
The	
is	on
that	että


* Read in and filter
* To make sure we get high-quality stuff, we will look for same pairs in fin-eng and eng-fin direction
* That way we will also make sure our translations are among the top 100K words in both languages

In [7]:
fien=[] #list of (fin,eng) pairs obtained from the fin -> eng direction
enfi=[] #list of (fin,eng) pairs, this time obtained from  the eng->fin direction
with open("trdata/fien_source_all.txt") as fi_file, open("trdata/fien_target_all.txt") as en_file:
    for fi,en in zip(fi_file,en_file):
        fi=fi.strip()
        en=en.strip()
        if fi and en:
            fien.append((fi,en))

with open("trdata/enfi_target_all.txt") as fi_file, open("trdata/enfi_source_all.txt") as en_file:
    for fi,en in zip(fi_file,en_file):
        fi=fi.strip()
        en=en.strip()
        if fi and en:
            enfi.append((fi,en))

fien_set=set(fien)
enfi_set=set(enfi)
common=fien_set&enfi_set #keep only pairs which are shared (&)
print("Len fien",len(fien_set))
print("Len enfi",len(enfi_set))
print("Len common",len(common))
print(list(common)[:300])

Len fien 95275
Len enfi 83610
Len common 7100
[('Selma', 'Selma'), ('kiiltävä', 'shiny'), ('yhdistynyt', 'united'), ('laajennettu', 'extended'), ('Jens', 'Jens'), ('Miguel', 'Miguel'), ('odottamaton', 'unexpected'), ('toimitukset', 'deliveries'), ('hyppysellinen', 'pinch'), ('puuha', 'chore'), ('syystä', 'justly'), ('psykologit', 'psychologists'), ('turkoosi', 'turquoise'), ('oikeudenmukainen', 'fair'), ('tekijä', 'factor'), ('sopivasti', 'suitably'), ('säilytyspaikka', 'repository'), ('Burton', 'Burton'), ('Sherman', 'Sherman'), ('Barry', 'Barry'), ('Carlson', 'Carlson'), ('ab', 'ab'), ('PG', 'PG'), ('ein', 'ein'), ('hukkaan', 'wasted'), ('säännöllisyys', 'regularity'), ('G', 'G'), ('koulu', 'school'), ('elinten', 'bodies'), ('raidallinen', 'striped'), ('taistelee', 'fights'), ('teollisesti', 'industrially'), ('leijonat', 'lions'), ('etana', 'snail'), ('talousarvio', 'budget'), ('des', 'des'), ('hetkiä', 'moments'), ('neuvonantaja', 'adviser'), ('sekaannus', 'confusion'), ('suitset', 

* ouch - we lost most of the stuff, but such is life
* what we got looks good, though :)
* Let us yet filter away pairs like Ivan - Ivan


In [None]:
#Making sure all we found is in the top 100K - just crosschecking really
print(len(set(finnish_vocab)&set(fi for fi,en in common)))
print(len(set(english_vocab)&set(en for fi,en in common)))

#Making sure all words are there exactly once - no risk of mixing train and validation
print(len(set(fi for fi,en in common)))
print(len(set(en for fi,en in common)))
print("...all these four numbers should be the same")

In [8]:
import random
pairs=[(fi,en) for fi,en in common if fi!=en] #Only keep pairs where source does not equal target
print("Left with",len(pairs),"after removing identical pairs")
random.shuffle(pairs) #always, always make sure to shuffle!

print("Shuffled pairs",pairs[:20])

#Now we need to grab the vectors for the words in question
en_indices=[model_english.vocab[en].index for fi,en in pairs] #English
fi_indices=[model_finnish.vocab[fi].index for fi,en in pairs] #Finnish
print("Indices:",en_indices[:10],fi_indices[:10])
#...and the vectors are hidden in the models
print("English model.vectors shape:",model_english.vectors.shape)
print("Finnish model.vectors shape:",model_finnish.vectors.shape)
en_vectors=model_english.vectors[en_indices] #Selects the rows in just the correct order
fi_vectors=model_finnish.vectors[fi_indices] #Selects the rows in just the correct order
print("English selected vectors shape:",en_vectors.shape)
print("Finnish selected vectors shape:",fi_vectors.shape)





Left with 4624 after removing identical pairs
Shuffled pairs [('rivit', 'ranks'), ('mökki', 'cottage'), ('kuljettajat', 'drivers'), ('heittää', 'throw'), ('liikuttunut', 'touched'), ('salaisuudet', 'secrets'), ('kurja', 'wretched'), ('arvovaltainen', 'authoritative'), ('kirkkaus', 'brightness'), ('yksi', 'one'), ('muodostettu', 'formed'), ('kynttilä', 'candle'), ('nostot', 'withdrawals'), ('yhdiste', 'compound'), ('käyttämätön', 'unused'), ('yhdistys', 'association'), ('hätä', 'emergency'), ('ihailijoita', 'admirers'), ('innoissaan', 'excited'), ('jäätyy', 'freezes')]
Indices: [4000, 15918, 3366, 3812, 6611, 8645, 40806, 22247, 32016, 53] [13168, 11185, 21928, 1751, 86387, 23331, 20705, 63390, 22397, 91]
English model.vectors shape: (100000, 200)
Finnish model.vectors shape: (100000, 200)
English selected vectors shape: (4624, 200)
Finnish selected vectors shape: (4624, 200)


* Now `en_vectors` is vectors for the 4624 English words in our translation pairs
* `fi_vectors` is same for Finnish
* ...our training data is done - we have the pairs of input--desired output

## Learning transformation from English to Finnish (fi->en)


* 200-dim vector in, 200-dim vector out
* Loss needs to be different, this is not classification!
* `mse` stands for mean square error

In [9]:
import tensorflow as tf
### Only needed for me, not to block the whole GPU, you don't need this stuff
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
set_session(tf.Session(config=config))
### ---end of weird stuff

from keras.models import Model
from keras.layers import Input, Dense



inp=Input(shape=(fi_vectors.shape[1],)) #input is 200-dim (eng to fin) (en->fi ja vice versa)
outp=Dense(en_vectors.shape[1])(inp) #Simple linear transformation of the input (en->fi ja vice versa)

model=Model(inputs=[inp], outputs=[outp])
model.summary()

model.compile(optimizer="adam",loss="mse")
hist=model.fit(fi_vectors,en_vectors,batch_size=100,verbose=1,epochs=30,validation_split=0.1) (en->fi ja vice versa)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 200)               40200     
Total params: 40,200
Trainable params: 40,200
Non-trainable params: 0
_________________________________________________________________
Train on 4161 samples, validate on 463 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


## Checking for accuracy

In [16]:
val_fi,val_en,_=hist.validation_data #This we saw before - the validation data
predicted_en=model.predict(val_fi) #Transform the Finnish vectors in the validation data
for fi,en,pred_en in list(zip(val_fi,val_en,predicted_en))[:30]:
    print(model_finnish.similar_by_vector(fi,topn=1)) #This is the target English word
    print(model_english.similar_by_vector(en,topn=1)) #This is the original Finnish word
    print(model_english.similar_by_vector(pred_en,topn=5)) # Top five closest hits to the transformed vector
    print("\n")
    #en->fi ja vice versa

[('reititin', 1.0000001192092896)]
[('router', 1.0)]
[('router', 0.8483683466911316), ('modem', 0.8167172074317932), ('adapter', 0.8042836785316467), ('Bluetooth', 0.7913797497749329), ('Ethernet', 0.7832474708557129)]


[('indeksi', 1.0)]
[('index', 1.0000001192092896)]
[('ratio', 0.6710296273231506), ('weighting', 0.6368964910507202), ('value', 0.6021862030029297), ('P/E', 0.5989536643028259), ('calculated', 0.5913202166557312)]


[('ihastunut', 0.9999997615814209)]
[('enamored', 0.9999999403953552)]
[('infatuated', 0.7519022822380066), ('smitten', 0.7447301149368286), ('envious', 0.731027364730835), ('obsessed', 0.7230865955352783), ('fascinated', 0.7076319456100464)]


[('liikkua', 1.0000001192092896)]
[('move', 1.0000001192092896)]
[('keep', 0.6287741661071777), ('crawl', 0.6268678307533264), ('walk', 0.6221586465835571), ('roam', 0.6202652454376221), ('wander', 0.6198495030403137)]


[('nälkä', 0.9999998211860657)]
[('hunger', 1.0)]
[('eating', 0.7361111640930176), ('eat', 0.6766

## Evaluating more formally

* Eyeballing the data is a moving target
* Ideally, we'd have a more solid metric
* Let us try top-1, top-5, and top-10 for the proportion of words which got the correct translation among top-N candidates

In [17]:
def eval(src_model,tgt_model,src_vecs,tgt_vecs,predicted_vecs):
    top1,top5,top10,total=0,0,0,0
    for src_v,tgt_v,pred_v in zip(src_vecs,tgt_vecs,predicted_vecs):
        src_word=src_model.similar_by_vector(src_v)[0][0]
        tgt_word=tgt_model.similar_by_vector(tgt_v)[0][0]
        hits=list(w for w,sim in tgt_model.similar_by_vector(pred_v,topn=10))
        total+=1
        if tgt_word==hits[0]:
            top1+=1
        if tgt_word in hits[:5]:
            top5+=1
        if tgt_word in hits[:10]:
            top10+=1
    print("Top1",top1/total*100,"percent correct")
    print("Top5",top5/total*100,"percent correct")
    print("Top10",top10/total*100,"percent correct")
eval(model_finnish,model_english,val_fi,val_en,predicted_en) #en->fi ja vice versa

Top1 32.6133909287257 percent correct
Top5 53.131749460043196 percent correct
Top10 59.611231101511876 percent correct


## Conclusion

* We have seen the vectors have interesting properties
* In particular, spaces can be mapped onto each other
* We have seen how this can be achieved with a simple linear transformation
* Optimal transformation has a closed-form solution, but we were lazy and trained it with Keras quite successfully
* This demonstrates how Keras can be used also for more generic tasks

In [18]:
# Extra stuff - a function to query the translations, so we can play around
def top_n(word,source_model,target_model,transformation_model,topn=5):
    try:
        source_idx=source_model.vocab[word].index
    except:
        print("Cannot retrieve vector for",word)
        return None
    mapped=transformation_model.predict(source_model.vectors[source_idx,:].reshape(1,-1))
    return target_model.similar_by_vector(mapped[0])
    
seen_words=set(en for fi,en in common) #These words were seen during training or validation
while True:
    wrd=input("word> ")
    if wrd=="end":
        break
    if wrd in seen_words:
        print("    WARNING: this word was seen during training")
    hits=top_n(wrd,model_english,model_finnish,model)
    for word,sim in hits:
        print("  ",word,"  ",sim)
    print()

word> koira
Cannot retrieve vector for koira


TypeError: 'NoneType' object is not iterable