In [1]:
import spacy as sp
from spacy import displacy
from custom_score.score import *
from custom_score.utils import serialized_to_model

  from .autonotebook import tqdm as notebook_tqdm


In [40]:
# Load the "en_core_web_sm" spaCy pipeline once; the exploration cells below reuse `nlp`.
nlp = sp.load("en_core_web_sm")

#### Basic test

In [38]:
# Toy corpus: reference sentences and candidate sentences, paired by position.
ref = ["Marius is very cool.", "Yesterday was the best day ever."]
can = ["Nice person.", "I enjoyed yesterday."]

In [44]:
# Parse the second reference sentence with the spaCy pipeline.
# NOTE(review): the token listings recorded further down show "Marius is very cool."
# (ref[0]) and carry lower execution counts - they look like leftovers of an earlier run.
new = nlp(ref[1])

In [45]:
# Inspect the type of object returned by the pipeline.
type(new)

spacy.tokens.doc.Doc

In [46]:
# Draw the dependency parse ('dep' style) of the parsed sentence inside the notebook.
displacy.render(new, style='dep', jupyter=True, options={'distance': 130})

In [25]:
# For every token: its text, its dependency label and the text of its syntactic head.
for token in new:
    print(token.text, token.dep_, token.head.text)

Marius nsubj is
is ROOT is
very advmod cool
cool acomp is
. punct is


In [28]:
# For every token: its text and its part-of-speech tag (the key used by gramSplitter).
for token in new:
    print(token.text, token.pos_)

Marius PROPN
is AUX
very ADV
cool ADJ
. PUNCT


#### Try on a longer sentence

In [47]:
# Same dependency visualisation on a paragraph made of several longer sentences.
sentence = nlp("Sometimes, while working with data, we can have a problem in which we need to gather information of average length of String data in list. This kind of information might be useful in Data Science domain. Let’s discuss certain ways in which this task can be performed.")
displacy.render(sentence, style='dep', jupyter=True, options={'distance': 130})

#### Split function

In [3]:
# spaCy pipelines already loaded by gramSplitter, keyed by model identifier.
_PIPELINE_CACHE = {}


def gramSplitter(corpus, lang = "en_core_web_sm"):
    """
    Groups the words of each text by grammatical family (spaCy part-of-speech tag).

    :param1 corpus (list): List of texts in string format.
    :param2 lang (string): spaCy language model identifier.

    :output ldic (list): One dictionary per text, mapping every part-of-speech tag
        found in the text to the list of token texts carrying it, in order of appearance.
    """
    # Loading a pipeline is expensive and this function is called several times per
    # score computation, so each model is loaded once and then reused.
    if lang not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[lang] = sp.load(lang)
    nlp = _PIPELINE_CACHE[lang]

    ldic = []
    for text in corpus:
        dic = {}
        for token in nlp(text):
            dic.setdefault(token.pos_, []).append(token.text)
        ldic.append(dic)
    return ldic

In [46]:
# gramSplitter expects a flat list of strings; ref and can are lists themselves,
# so the first sentence of each is passed.
gramSplitter([ref[0], can[0]])

[{'PROPN': ['Marius'],
  'AUX': ['is'],
  'ADV': ['very'],
  'ADJ': ['cool'],
  'PUNCT': ['.']},
 {'ADJ': ['Nice'], 'NOUN': ['person'], 'PUNCT': ['.']}]

#### Grammatical static BERTScore

In [4]:
def _greedyMatchingScore(similarities, raw_sentences):
    """
    Sums the maximum of every row of every matrix of a sentence, then divides
    that total by the number of whitespace-separated words of the sentence.

    :param1 similarities (list): One dictionary of similarity matrices per sentence.
    :param2 raw_sentences (list): The sentences, in string format, in the same order.

    :output scores (list): One score per sentence (0. for a sentence without words).
    """
    scores = []
    for individualSimilarity, sentence in zip(similarities, raw_sentences):
        total = 0
        for matrix in individualSimilarity.values():
            for row in matrix:
                total += np.max(row)
        # split() without argument ignores repeated, leading and trailing blanks
        # and yields no word at all for an empty sentence.
        nbWords = len(sentence.split())
        scores.append(total / nbWords if nbWords else 0.)
    return scores


def computeGramMetrics(refToCand, candToRef, raw_references, raw_candidates):
    """
    Calculates R, P and F measures for a given corpus

    :param1 refToCand (list): One dictionary per reference/candidate couple, mapping a
        grammatical family to its similarity matrix; the maximum of each row is summed for R.
    :param2 candToRef (list): Same structure; the maximum of each row is summed for P.
    :param3 raw_references (list): List of reference sentences (strings).
    :param4 raw_candidates (list): List of candidate sentences (strings).

    :output (tuple): Tuple containing the lists R, P and F for the current corpus.
    """
    # NOTE(review): the sums run over the rows of the matrices while the
    # normalisation uses the whitespace word count of the raw sentence; if both
    # counts differ the scores are not bounded by 1 - confirm this is intended.
    R = _greedyMatchingScore(refToCand, raw_references)
    P = _greedyMatchingScore(candToRef, raw_candidates)

    # F computation
    F = []
    for r, p in zip(R, P):
        # Explicit test instead of catching ZeroDivisionError: NumPy floats give
        # nan (with a warning) on 0/0 rather than raising.
        if (p + r) != 0:
            F.append(2 * ((p * r) / (p + r)))
        else:
            F.append(0.)

    return (R, P, F)


In [27]:
def _encodeFamilies(families, model):
    """Encodes the words of every grammatical family of one sentence with the embedding."""
    encoded = {}
    for familly, words in families.items():
        # encode returns a pair; only its first element is kept
        encoded[familly], _ = encode(words, model)
    return encoded


def _groupUnsharedFamilies(own, other):
    """Keeps the families also present in `other` and merges all remaining ones under "others"."""
    grouped = {"others": []}
    for familly, vectors in own.items():
        if familly in other:
            grouped[familly] = vectors
        else:
            grouped["others"].extend(vectors)
    if not grouped["others"]:
        grouped.pop("others")
    return grouped


def gramScore(model, candidates=["I am Marius"], references=["Marius is my name"], withIdf = False):
    """
    Computes BERTScore using a custom embedding amongst Word2Vec, Fasttext and Glove, with in addition the grammatical spaCy classification.

    :param1 model (dict): Dictionary of the embedding.
    :param2 candidates (list): List of candidate sentences.
    :param3 references (list): List of reference sentences.
    :param4 withIdf (bool): IDF weighting is not implemented yet; when set, a notice is
        printed and the default score is returned.

    :output formatedScores (List): List containing tuples of R, P and F for each couple of the current corpus.
    """
    # The raw sentences are kept aside: they are handed to computeGramMetrics
    raw_references = list(references)
    raw_candidates = list(candidates)

    # Classify with grammatical context, then encode every family to vectors
    encodedReferences = [_encodeFamilies(families, model) for families in gramSplitter(raw_references)]
    encodedCandidates = [_encodeFamilies(families, model) for families in gramSplitter(raw_candidates)]

    candToRef = []
    refToCand = []
    for candidate, reference in zip(encodedCandidates, encodedReferences):
        # Punctuation is ignored. Each side is cleaned independently so that a
        # sentence without punctuation does not prevent cleaning the other one.
        candidate.pop("PUNCT", None)
        reference.pop("PUNCT", None)

        # Families that are not common to both sentences are merged under "others"
        groupedCandidate = _groupUnsharedFamilies(candidate, reference)
        groupedReference = _groupUnsharedFamilies(reference, candidate)

        # Cosine similarity, family by family
        ctr = {}
        rtc = {}
        for k in groupedReference:
            # "others" may exist on one side only; such words have nothing to be
            # compared with and therefore add nothing to the sums.
            if k not in groupedCandidate:
                continue
            ctr[k] = similarityCandToRef([groupedReference[k]], [groupedCandidate[k]])
            # NOTE(review): np.transpose reverses every axis; if similarityCandToRef
            # returns a list of matrices instead of one 2-D matrix, rtc[k] is not
            # the plain matrix transpose - confirm against its implementation.
            rtc[k] = np.transpose(ctr[k])
        candToRef.append(ctr)
        refToCand.append(rtc)

    # Metrics calculation
    if withIdf:
        print("Not yet implementated: Default gramScore used instead")
    (R, P, F) = computeGramMetrics(refToCand, candToRef, raw_references, raw_candidates)

    formatedScores = list(zip(R, P, F))
    return formatedScores

In [6]:
# Load the serialized Word2Vec embedding used by gramScore.
# NOTE(review): absolute path on the author's machine - the notebook cannot run elsewhere
# as is. The file is a .pkl; if serialized_to_model unpickles it, only load trusted files.
w2v = serialized_to_model(r"C:\Pro\Stages\A4 - DVRC\Work\Models\serialized_w2v.pkl")

In [7]:
# Same toy corpus as in the basic test, redefined right before scoring.
ref = ["Marius is very cool.", "Yesterday was the best day ever."]
can = ["Nice person.", "I enjoyed yesterday."]

In [28]:
# One (R, P, F) tuple per candidate/reference couple, without IDF weighting.
gramScore(w2v, can, ref, False)

[(1.199140614044269, 0.03411487839814096, 0.06634235400707554),
 (1.0251813761211712, 0.02433025718765311, 0.04753244414524873)]

In [77]:
# A single, longer couple: a lower-cased news sentence as reference and a headline-like candidate.
ref = ["officials of the cabinet-level fair trade commission -lrb- ftc -rrb- said friday that they have formed an ad hoc group to investigate whether there is any manipulation of commodity prices by traders in local market."]
can = ["fair trade commission investigating consumer price hike"]

In [14]:
# Score the longer couple with the default settings.
# NOTE(review): the recorded output holds two tuples although ref/can contain a single
# sentence, and its execution count is older than the cell above - it looks stale; re-run.
gramScore(w2v, can, ref)

[(1.0, 2.0, 1.3333333333333333), (0.8333333333333335, 2.0, 1.1764705882352942)]