In [20]:
import numpy as np
from numpy.linalg import norm
import pandas as pd

In [2]:
from gensim.models import KeyedVectors

In [3]:
ref = ["Mon prenom est marius"]
can = ["Je suis marius"]

### 1 - Token Representation

In [4]:
def model_load(model):
    assert(type(model) == str)
    if model == "Word2Vec":
        wordvector_path = r'D:\COURS\A4\S8\Stage\Documents\bert_score\marius_scripts\custom_BERTScore\GoogleNews-vectors-negative300.bin.gz'
        emb = KeyedVectors.load_word2vec_format(wordvector_path, binary=True)
    else:
        print("Model not currently supported")
    return emb

In [55]:
def encode(corpus, model):
    encoded_corpus = []
    unknown = 0
    for sentence in corpus:
        encoded_sentence = []
        for word in sentence.split(" "):
            try:
                encoded_sentence.append(model[word])
            except:
                unknown += 1
        encoded_corpus.append(encoded_sentence)
    return np.array(encoded_corpus, dtype=object), unknown

In [5]:
w2v = model_load("Word2Vec")

In [88]:
#references, n_unknown_ref = encode(["I am Marius", "I like trains"], w2v)
#candidates, n_unknown_cand = encode(["My name is Marius", "I enjoy rail vehicules"], w2v)

references, n_unknown_ref = encode(["I am Marius"], w2v)
candidates, n_unknown_cand = encode(["My name is Marius"], w2v)

In [84]:
# Padding
new_references = []
new_candidates = []

for reference, candidate in zip(references, candidates):
    size_diff = len(reference) - len(candidate)
    if size_diff >= 0:
        candidate = np.pad(candidate, (0, size_diff))
        reference = np.array(reference)
    else:
        reference = np.pad(reference, [(0, np.abs(size_diff)), (0, 0)], mode="constant")
        candidate = np.array(candidate)
    new_references.append(reference)
    new_candidates.append(candidate)
   

references = np.array(new_references, dtype=object)
candidates = np.array(new_candidates, dtype=object)

I SHOULD ADD PADDING

In [89]:
print("Shape Reference : ", references.shape, "||", "Unknown Token Reference : ", n_unknown_ref)
print("Shape Candidate : ", candidates.shape, "||", "Unknown Token Candidate : ", n_unknown_cand)

Shape Reference :  (1, 3, 300) || Unknown Token Reference :  0
Shape Candidate :  (1, 4, 300) || Unknown Token Candidate :  0


### 2 - Similarity Measure

In [90]:
proximity = lambda x, y: (np.matmul(np.transpose(x), y))/norm(x)*norm(y)

all_proximities = []

for candidate, reference in zip(candidates, references):
    proximities = []
    for c_word in candidate:
        sub_proximities = []
        for r_word in reference:
            sub_proximities.append(proximity(c_word, r_word))
        proximities.append(sub_proximities)
    all_proximities.append(proximities)    

In [91]:
np.array(all_proximities, dtype=object).shape

(1, 4, 3)

In [92]:
all_proximities

[[[2.9310541990186416, 1.3797478327726513, 0.37488800612906137],
  [0.5219474879491862, 0.09263090012223345, 0.21187016296062705],
  [1.185017868316171, 2.5864861676789763, -0.009107585579498858],
  [0.20817003150982835, 0.15117115242823131, 8.293031040699134]]]