In [1]:
import gensim
import numpy as np
from gensim.models import KeyedVectors

In [2]:
uk_emb = KeyedVectors.load_word2vec_format("C:/Users/User/Desktop/uk2rus/cc.uk.300.vec")

In [3]:
ru_emb = KeyedVectors.load_word2vec_format("C:/Users/User/Desktop/uk2rus/cc.ru.300.vec")

In [4]:
def load_word_pairs(filename):
    uk_ru_pairs = []
    uk_vectors = []
    ru_vectors = []
    with open(filename, "r",encoding="utf8") as inpf:
        for line in inpf:
            uk, ru = line.rstrip().split("\t")
            if uk not in uk_emb or ru not in ru_emb:
                continue
            uk_ru_pairs.append((uk, ru))
            uk_vectors.append(uk_emb[uk])
            ru_vectors.append(ru_emb[ru])
    return uk_ru_pairs, np.array(uk_vectors), np.array(ru_vectors)

In [5]:
uk_ru_test, X_test, Y_test = load_word_pairs("ukr_rus.test.txt")

In [6]:
uk_ru_train, X_train, Y_train = load_word_pairs("ukrrus.train.txt")

In [7]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
_ = model.fit(X_train, Y_train)

In [8]:
august = model.predict(uk_emb["серпень"].reshape(1, -1))
ru_emb.most_similar(august)

[('апрель', 0.854159414768219),
 ('июнь', 0.8411962985992432),
 ('март', 0.8397400379180908),
 ('сентябрь', 0.8359216451644897),
 ('февраль', 0.8328749537467957),
 ('октябрь', 0.8311805725097656),
 ('ноябрь', 0.8278146982192993),
 ('июль', 0.8236350417137146),
 ('август', 0.8120613098144531),
 ('декабрь', 0.8038000464439392)]

In [17]:
def precision(pairs, mapped_vectors, topn=1):
    assert len(pairs) == len(mapped_vectors)
    num_matches = 0
    for i, (_, ru) in enumerate(pairs):
        similar = ru_emb.most_similar(mapped_vectors[i], topn=topn)
        for n in range(len(similar)):
            if ru == similar[n][0]:
                num_matches = num_matches + 1
    precision_val = num_matches / len(pairs)
    return precision_val

In [18]:
assert precision([("серпень", "август")], august, topn=5) == 0.0
assert precision([("серпень", "август")], august, topn=9) == 1.0
assert precision([("серпень", "август")], august, topn=10) == 1.0

In [11]:
assert precision(uk_ru_test, X_test) == 0.0
assert precision(uk_ru_test, Y_test) == 1.0

In [22]:
precision_top1 = precision(uk_ru_test, model.predict(X_test), 1)
precision_top5 = precision(uk_ru_test, model.predict(X_test), 5)

precision_top1

0.6356589147286822

In [26]:
def learn_transform(X_train, Y_train):
    u, s, vt = np.linalg.svd(np.matmul(X_train.transpose(), Y_train))
    W = np.matmul(u, vt)
    return W

In [27]:
W = learn_transform(X_train, Y_train)

In [28]:
W

array([[-0.04726764, -0.00039346,  0.01660891, ...,  0.01075402,
         0.05630323, -0.02920388],
       [ 0.01875379,  0.07762805, -0.02526646, ..., -0.04825189,
         0.05523313,  0.00287794],
       [-0.00289267,  0.00492156,  0.03302269, ..., -0.09383073,
         0.0010823 , -0.02547004],
       ...,
       [ 0.06924202,  0.02402507,  0.00025672, ..., -0.10239008,
        -0.0104623 , -0.01620188],
       [ 0.07994906,  0.039838  ,  0.06650155, ...,  0.05040342,
         0.10387777, -0.06498513],
       [ 0.00486841,  0.04323363, -0.03774844, ..., -0.002401  ,
        -0.07146894, -0.00777142]], dtype=float32)

In [29]:
ru_emb.most_similar([np.matmul(uk_emb["серпень"], W)])

[('апрель', 0.8237906694412231),
 ('сентябрь', 0.8049710988998413),
 ('март', 0.802565336227417),
 ('июнь', 0.8021842837333679),
 ('октябрь', 0.8001735806465149),
 ('ноябрь', 0.793448269367218),
 ('февраль', 0.7914119958877563),
 ('июль', 0.790810763835907),
 ('август', 0.7891013622283936),
 ('декабрь', 0.7686371803283691)]

In [24]:
precision_top5

0.8113695090439277

In [30]:
assert precision(uk_ru_test, np.matmul(X_test, W)) >= 0.653
assert precision(uk_ru_test, np.matmul(X_test, W), 5) >= 0.824

In [91]:
with open("FairyTale.txt", "r", encoding="utf8") as inpf:
    uk_sentences = [line.rstrip().lower() for line in inpf]

In [69]:
def translate(sentence):
    ru_sentence = ""
    uk_embedings = []
    words = sentence.split(" ")
    print(words)
    for word in words:
        uk_embedings.append(uk_emb[word])
    ru_embedings = []
    for emb in uk_embedings:
        ru_embedings.append(ru_emb.most_similar([np.matmul(emb, W)]))
    for word_n_vec in ru_embedings:
        ru_sentence = ru_sentence + " " + word_n_vec[0][0]
    return ru_sentence

In [72]:
translate(".")

['.']


' .'

In [92]:
for sentence in uk_sentences:
    print("src: {}\ndst: {}\n".format(sentence, translate(sentence)))

['лисичка', '-', 'сестричка', 'і', 'вовк']
src: лисичка - сестричка і вовк
dst:  лисичка – сестричка и волк

['як', 'була', 'собі', 'лисичка', 'та', 'зробила', 'хатку,', 'та', 'й', 'живе.', 'а', 'це', 'приходять', 'холоди.', 'от', 'лисичка', 'замерзла', 'та', 'й', 'побігла', 'в', 'село', 'вогню', 'добувать,', 'щоб', 'витопити.', 'прибігає', 'до', 'одної', 'баби', 'та', 'й', 'каже:']


KeyError: "Key 'хатку,' not present"

KeyError: "Key 'панібрат' not present"