# Metrics

If a summary captures well the meaning of a body text, then if we use that summary in an information retrieval index to index the documents, we expect to see query results (rankings) similar to the ones in which we rank by whole body text.


We can use the Spearman Footrule Distance: we perform a query Q and obtain the ranking Kb of documents indexed by body text, and Ks of documents indexed by summary. We compute the distance of Ks from Kb. 

In [55]:
def spearman_distance_normalized(vec1, vec2):
    distance = 0
    max_distance = len(vec1) + 1
    max_total_distance = max_distance*len(vec1)
    for index1, element1 in enumerate(vec1):
        for index2, element2 in enumerate(vec2):
            if(element1 == element2):
                distance += abs(index1-index2)
                break
            if(index2 == (len(vec2)-1)):
                distance += max_distance
                
    return 1 - distance/max_total_distance

In [59]:
vec1 = ["a1", "a2", "a4", "a5"]
vec2 = ["a2", "a1", "a4", "a3"]

print(f"distance: {spearman_distance_normalized(vec1, vec2)}")

distance: 7


## Load Data ##


In [None]:
import requests
from tqdm.notebook import tqdm

HOST = "http://144.24.201.133:5000"
articles_json = []
rows = []

for i in tqdm(range(10)):
    articles_json = requests.get(f"{HOST}/allPapers?skip={i*1000}").json()

    for paper in articles_json:
        rows.append(
            (
                paper["title"],
                paper["body"],
                paper["summary"]
            )
        )

# Instanciate Model

In [9]:
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/msmarco-MiniLM-L-6-v3')
model = AutoModel.from_pretrained('sentence-transformers/msmarco-MiniLM-L-6-v3').cuda()

## Vectorize docs

In [29]:
import torch
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def embed_sentences(sentences):
    #Clean df titles, bodies and summaries:
    #chunk[payload] = chunk[payload].apply(clean_text)
    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to("cuda")

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
    
    # Perform pooling. In this case, max pooling.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return sentence_embeddings[0]
    # vectors = []

    # for doc in tqdm(df["body"].to_list()):
    #     vectors.append(model.encode(doc))

In [None]:
body_vectors = []
summary_vectors = []
for paper in tqdm(rows):
    body_vectors.append({"body": paper[0], "embedding": embed_sentences(paper[1]).cpu().numpy()})
    summary_vectors.append({"body": paper[0], "embedding": embed_sentences(paper[2]).cpu().numpy()})

In [47]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

data_bodies = [body_vector["embedding"] for body_vector in body_vectors]
data_summaries = [summary_vector["embedding"] for summary_vector in summary_vectors]

nbrs_bodies = NearestNeighbors(n_neighbors=100, algorithm='brute', metric="cosine").fit(data_bodies)
nbrs_summares = NearestNeighbors(n_neighbors=100, algorithm='brute', metric="cosine").fit(data_summaries)

In [52]:
query = embed_sentences("Radiology and machine learning medical application").cpu().numpy()
# print(query)
result_bodies = nbrs_bodies.kneighbors([query])
result_summaries = nbrs_summares.kneighbors([query])

print(spearman_distance(result_bodies[1][0], result_summaries[1][0]))

896
