In [8]:
import numpy as np
from numpy.linalg import norm
import pandas as pd

In [7]:
from gensim.models import KeyedVectors

In [4]:
# Toy French reference / candidate sentence pair.
# NOTE(review): neither `ref` nor `can` is read again in the visible cells — confirm they are still needed.
ref = ["Mon prenom est marius"]
can = ["Je suis marius"]

### 1 - Token Representation

In [5]:
def model_load(model, path=None):
    """Load pre-trained word vectors for the requested embedding family.

    Parameters
    ----------
    model : str
        Name of the embedding to load: "Word2Vec" or "Glove".
    path : str, optional
        Location of the vector file. When omitted, the notebook's original
        hard-coded location for the requested model is used.

    Returns
    -------
    The object returned by ``KeyedVectors.load_word2vec_format``.

    Raises
    ------
    AssertionError
        If ``model`` is not a string.
    ValueError
        If ``model`` is not one of the supported names.
    """
    assert isinstance(model, str)
    if model == "Word2Vec":
        wordvector_path = path
        if wordvector_path is None:
            wordvector_path = r'D:\COURS\A4\S8\Stage\Documents\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\custom_BERTScore\GoogleNews-vectors-negative300.bin.gz'
        # The Word2Vec file is loaded with binary=True.
        return KeyedVectors.load_word2vec_format(wordvector_path, binary=True)
    if model == "Glove":
        glove_path = path
        if glove_path is None:
            glove_path = r'D:\COURS\A4\S8\Stage\Documents\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\custom_BERTScore\glove2word2vec.txt'
        return KeyedVectors.load_word2vec_format(glove_path)
    # Each supported branch returns early, so reaching this point means the
    # name is unknown. Fail loudly instead of printing and then crashing on an
    # unbound variable.
    raise ValueError("Model not currently supported")

In [6]:
def encode(corpus, model):
    """Replace every known word of every sentence by its vector.

    Parameters
    ----------
    corpus : iterable of str
        Sentences to encode; each one is split on whitespace.
    model : mapping-like
        Object indexed by word (``model[word]``) that raises ``KeyError`` for
        words it does not contain.

    Returns
    -------
    tuple
        ``(encoded, unknown)`` where ``encoded`` is
        ``np.array(list_of_sentence_vectors, dtype=object)`` and ``unknown``
        is the number of words skipped because the model did not contain them.
    """
    encoded_corpus = []
    unknown = 0
    for sentence in corpus:
        encoded_sentence = []
        # split() with no argument ignores repeated/leading/trailing
        # whitespace, so no empty-string token is produced and miscounted as
        # an unknown word.
        for word in sentence.split():
            try:
                encoded_sentence.append(model[word])
            except KeyError:
                # Only a missing vocabulary entry counts as "unknown"; any
                # other error (or a keyboard interrupt) must propagate.
                unknown += 1
        encoded_corpus.append(encoded_sentence)
    return np.array(encoded_corpus, dtype=object), unknown

##### Word2Vec

In [7]:
# Load the Word2Vec vectors through the `model_load` helper defined above.
w2v = model_load("Word2Vec")

In [8]:
# Alternative two-sentence corpus, kept disabled:
#references, n_unknown_ref = encode(["I am Marius", "I like trains"], w2v)
#candidates, n_unknown_cand = encode(["My name is Marius", "I enjoy rail vehicules"], w2v)

# Encode one reference and one candidate sentence; `encode` also returns how
# many words were missing from the model.
references, n_unknown_ref = encode(["I am Marius"], w2v)
candidates, n_unknown_cand = encode(["My name is Marius"], w2v)

In [9]:
# Report the shape of each encoded corpus and the number of out-of-vocabulary words.
print("Shape Reference : ", references.shape, "||", "Unknown Token Reference : ", n_unknown_ref)
print("Shape Candidate : ", candidates.shape, "||", "Unknown Token Candidate : ", n_unknown_cand)

Shape Reference :  (1, 3, 300) || Unknown Token Reference :  0
Shape Candidate :  (1, 4, 300) || Unknown Token Candidate :  0


##### Fasttext

In [1]:
from gensim.models.fasttext import load_facebook_vectors
from gensim.test.utils import datapath, get_tmpfile

In [None]:
# The `.vec` file is fastText's plain-text vector export (word2vec text layout),
# whereas `load_facebook_vectors` reads fastText's native binary model.
# Load the text file with `KeyedVectors.load_word2vec_format` instead.
ft = KeyedVectors.load_word2vec_format(r'D:\COURS\A4\S8\Stage\Documents\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\custom_BERTScore\wiki-news-300d-1M.vec')

In [None]:
# Resolve a file name through gensim's `datapath` helper and load it as
# fastText vectors.
# NOTE(review): presumably this is a small model shipped with gensim's test data — confirm.
cap_path = datapath("crime-and-punishment.bin")
wv = load_facebook_vectors(cap_path)

##### Glove

Conversion to word2vec format (do not run this cell unless necessary)

In [19]:
# One-off conversion: read the raw GloVe text file (it has no header line,
# hence no_header=True) and re-save it in word2vec format.
# The output path is the file read by `model_load("Glove")` and by the next cell.
glove_path = r"D:\COURS\A4\S8\Stage\Documents\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\custom_BERTScore\glove.6B\glove.6B.300d.txt"
glove_temp = KeyedVectors.load_word2vec_format(glove_path, no_header=True)
glove_temp.save_word2vec_format(r"D:\COURS\A4\S8\Stage\Documents\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\custom_BERTScore\glove2word2vec.txt")

Loading the formatted GloVe file:

In [24]:
# Load the GloVe vectors from the word2vec-format file written by the conversion cell.
finalGlove = KeyedVectors.load_word2vec_format(r"D:\COURS\A4\S8\Stage\Documents\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\custom_BERTScore\glove2word2vec.txt")

### 2 - Similarity Measure

In [40]:
def SimilarityCandToRef(references, candidates):
    """Build, per sentence pair, a similarity matrix indexed [candidate word][reference word].

    Sentences are paired positionally with ``zip``, so surplus sentences on
    either side are ignored. Each entry is the product of the two word
    vectors divided by the product of their norms (cosine similarity for 1-D
    vectors).

    Returns a list (one item per sentence pair) of lists of rows.
    """
    def cosine(x, y):
        return (np.matmul(np.transpose(x), y))/(norm(x)*norm(y))

    return [
        [
            [cosine(ref_vec, cand_vec) for ref_vec in ref_sentence]
            for cand_vec in cand_sentence
        ]
        for cand_sentence, ref_sentence in zip(candidates, references)
    ]

def SimilarityRefToCand(references, candidates):
    """Build, per sentence pair, a similarity matrix indexed [reference word][candidate word].

    Sentences are paired positionally with ``zip``, so surplus sentences on
    either side are ignored. Each entry is the product of the two word
    vectors divided by the product of their norms (cosine similarity for 1-D
    vectors).

    Returns a list (one item per sentence pair) of lists of rows.
    """
    def cosine(x, y):
        return (np.matmul(np.transpose(x), y))/(norm(x)*norm(y))

    return [
        [
            [cosine(ref_vec, cand_vec) for cand_vec in cand_sentence]
            for ref_vec in ref_sentence
        ]
        for cand_sentence, ref_sentence in zip(candidates, references)
    ]

In [41]:
# Compute both orientations of the word-to-word similarity matrices for the
# encoded sentence pairs: rows = candidate words, then rows = reference words.
candToRef = SimilarityCandToRef(references, candidates)
refToCand = SimilarityRefToCand(references, candidates)

In [42]:
# Display: one row per candidate word, one column per reference word.
candToRef

[[[0.519005613856723, 0.18616442214576517, 0.04520518544899319],
  [0.09242192671658575, 0.012498354833030686, 0.025547976598766665],
  [0.2098326692090991, 0.3489852938028225, -0.00109822157119661],
  [0.0368609406903856, 0.020396980932637056, 1.0]]]

In [43]:
# Display: one row per reference word, one column per candidate word.
refToCand

[[[0.519005613856723,
   0.09242192671658575,
   0.2098326692090991,
   0.0368609406903856],
  [0.18616442214576517,
   0.012498354833030686,
   0.3489852938028225,
   0.020396980932637056],
  [0.04520518544899319, 0.025547976598766665, -0.00109822157119661, 1.0]]]

### 3 - Calculation of P, R and F

##### R calculation

In [44]:
# For every sentence pair, add up the best (maximum) similarity found in each
# row of its candidate-to-reference matrix.
fullSum = []
for individualSimilarity in candToRef:
    currentSum = 0
    for row in individualSimilarity:
        currentSum += row[np.argmax(row)]
    fullSum.append(currentSum)

# Scale each sum by 1 / norm(reference).
# NOTE(review): `norm(reference)` is the norm of the sentence's embeddings, not
# its number of tokens, and the rows summed above are candidate words — confirm
# both against the intended recall definition.
R = []
# The loop variable is deliberately not called `sum`: at notebook level that
# would rebind the builtin `sum` for every later cell.
for sentenceSum, reference in zip(fullSum, references):
    R.append((1/norm(reference))*sentenceSum)


In [45]:
# Display the R value computed for each sentence pair.
R

[0.42425704559238325]

##### P Calculation

In [46]:
# For every sentence pair, add up the best (maximum) similarity found in each
# row of its reference-to-candidate matrix.
fullSum = []
for individualSimilarity in refToCand:
    currentSum = 0
    for row in individualSimilarity:
        currentSum += row[np.argmax(row)]
    fullSum.append(currentSum)

# Scale each sum by 1 / norm(candidate).
# NOTE(review): `norm(candidate)` is the norm of the sentence's embeddings, not
# its number of tokens, and the rows summed above are reference words — confirm
# both against the intended precision definition.
P = []
# The loop variable is deliberately not called `sum`: at notebook level that
# would rebind the builtin `sum` for every later cell.
for sentenceSum, candidate in zip(fullSum, candidates):
    P.append((1/norm(candidate))*sentenceSum)

In [47]:
# Display the P value computed for each sentence pair.
P

[0.3604158590807159]

##### F Calculation

In [48]:
# Harmonic mean of each (R, P) pair, kept in sentence order.
F = [2*((p*r)/(p+r)) for r, p in zip(R, P)]

In [49]:
# Display the F value computed for each sentence pair.
F

[0.38973938477441966]

##### Full Calculation

In [None]:
def _normalized_best_match_sums(similarities, sentences):
    """For each sentence pair, sum every row's maximum and scale by 1/norm(sentence)."""
    scores = []
    for sentence_similarity, sentence in zip(similarities, sentences):
        total = 0
        for row in sentence_similarity:
            total += row[np.argmax(row)]
        scores.append((1/norm(sentence))*total)
    return scores


def computeMetrics(candToRef, refToCand, references=None, candidates=None):
    """Compute the R, P and F lists from the two similarity orientations.

    Parameters
    ----------
    candToRef : list
        Per sentence pair, rows of similarities indexed
        [candidate word][reference word].
    refToCand : list
        Per sentence pair, rows of similarities indexed
        [reference word][candidate word].
    references, candidates : sequence, optional
        Encoded sentences used for the normalisation. When omitted, the
        notebook-level variables of the same name are used, which is what the
        original two-argument version did implicitly.

    Returns
    -------
    tuple
        ``(R, P, F)``, each a list with one value per sentence pair.

    NOTE(review): the normalisation divides by ``norm(sentence)`` (a norm of
    the embeddings) rather than by the number of tokens — confirm this is the
    intended definition.
    """
    # Backward compatibility with the two-argument call: fall back to the
    # notebook globals only when the caller did not pass the sentences.
    if references is None:
        references = globals()["references"]
    if candidates is None:
        candidates = globals()["candidates"]

    R = _normalized_best_match_sums(candToRef, references)
    P = _normalized_best_match_sums(refToCand, candidates)

    # Harmonic mean; defined as 0.0 when both components are zero, which
    # avoids dividing by zero.
    F = []
    for r, p in zip(R, P):
        if (p + r) == 0:
            F.append(0.0)
        else:
            F.append(2*((p*r)/(p+r)))

    # The original returned R twice; the second element must be P.
    return (R, P, F)

### 4 - Basic Test of the package

In [28]:
from custom_score.score import score
from custom_score.utils import model_load

In [2]:
# `model_load` is now the function imported from `custom_score.utils` in the
# previous cell, which replaces the notebook-level definition of the same name.
w2v = model_load("Word2Vec")

In [3]:
# Run the packaged `score` with the Word2Vec vectors.
# NOTE(review): the result is a tuple of three one-element lists; presumably R, P, F — confirm the order in `custom_score.score`.
score(w2v)

([0.40998854847735094], [0.22880340017752968], [0.2937005518713847])

In [29]:
# Same packaged `score` call, this time with the GloVe vectors loaded earlier as `finalGlove`.
score(finalGlove)

([0.03954665010154433], [0.05171590963278553], [0.04481993467824077])

# HOLDALL

In [None]:
# Padding: give the reference and the candidate of each pair the same number
# of rows (words) by appending zero rows to the shorter one.
new_references = []
new_candidates = []

for reference, candidate in zip(references, candidates):
    size_diff = len(reference) - len(candidate)
    if size_diff >= 0:
        # Pad the word axis only. A bare (0, size_diff) pad width applies to
        # every axis and would also widen the embedding dimension.
        candidate = np.pad(candidate, [(0, size_diff), (0, 0)], mode="constant")
        reference = np.array(reference)
    else:
        reference = np.pad(reference, [(0, np.abs(size_diff)), (0, 0)], mode="constant")
        candidate = np.array(candidate)
    new_references.append(reference)
    new_candidates.append(candidate)

# NOTE: this rebinds `references` / `candidates`, so re-running earlier cells
# afterwards operates on the padded versions.
references = np.array(new_references, dtype=object)
candidates = np.array(new_candidates, dtype=object)