In [1]:
import torch
import numpy as np
import scipy.stats as st

from sklearn.metrics.pairwise import cosine_distances
from sentence_transformers import SentenceTransformer, util

In [2]:
!ls '../data/saved_models/SBERT/'

1000  10000  2000  3000  4000  5000  6000  7000  8000  9000


In [3]:
## Carregando o SBERT:
model = SentenceTransformer('../data/saved_models/SBERT/9000')

In [4]:
# Now you can use the loaded model to encode sentences
sentence1 = '506 112 144 148 250 258 384'
emb1 = model.encode(sentence1)
sentence2 = '506 112 144 148 258 384'
emb2 = model.encode(sentence2)

cos_sim = util.cos_sim(emb1, emb2)
print("Cosine-Similarity:", cos_sim.item())

Cosine-Similarity: 0.9790315628051758


In [5]:
# Carregando as trajs de teste:
trajs_teste = []
with open('../data/exp1-trj.t') as f:
    for line in f:
        traj_list = line.strip().split()
        trajs_teste.append(traj_list)

In [6]:
print("Quantidade de trajetórias de teste:", len(trajs_teste))

Quantidade de trajetórias de teste: 101000


In [7]:
print(trajs_teste[2]) # query par

['51', '2263', '345', '53', '120', '405', '803', '585', '692', '1566', '533', '1728', '1880', '739', '544', '226', '8']


In [8]:
print(trajs_teste[1002]) # "alvo" da query par, ou seja, a query ímpar

['51', '430', '345', '120', '856', '131', '673', '585', '233', '2200', '533', '361', '1299', '1215', '66', '588', '7', '8']


In [9]:
# As trajs de testes estão uma lista de listas, onda cada lista insterna contém uma traj tokenizada:
# [['3176', '1346', '1301', '3303'], ..., ['508', '465', '1641']] 
# Como SBERT codifica cada sentença (e.x: '508', '465', '1641') para embedding, usamos a função abaixo 
# que recebe uma traj tokenizada e a retorna em formato de sentence string:

In [10]:
def traj2str(traj):
    """
    input: ['75476610', '75466888', '75476610', '754960']
      out: '75476610 75466888 75476610 754960'
    """
    string_traj = ' '.join(traj)
    return string_traj

In [11]:
lista = ['55', '3', '104', '244']
traj2str(lista)

'55 3 104 244'

In [12]:
def get_embeddings_for_all_sentences(trajs):
    """
    Input: list of list de trajs. Trajetória formada por ids cels.
    (e.x. trajs = [['30405995', '30413746', '30421497'], ['30429247', '30429248', '30436998']])
    Outpu: embedding de cada trajetória/sentença completa (traj) fornecido diretamente pelo SBERT
    """

    t_emb = model.encode(traj2str(trajs[0]))
    list_embs = np.empty([len(trajs), t_emb.shape[0]], dtype=np.float32)

    i = 0
    total = len(trajs)
    for traj in trajs:
        list_embs[i] = model.encode(traj2str(traj))
        i += 1
      
        # Calcula a porcentagem concluída
        percent_done = (i / total) * 100
        # Exibe a porcentagem concluída
        print(f"Progresso: {percent_done:.2f}% concluído", end="\r")  # A opção `end="\r"` permite que a impressão seja substituída na mesma linha

    return list_embs

In [13]:
# Segmentando: query (trajs pares) e dbsearch (querys ímpar + 99000 outras ímpares)
query = trajs_teste[:1000] # trajs query (pares)
dbsearch = trajs_teste[1000:101000] # dbsearch trajs (as 1000 primeiras são as query ímpar)

In [14]:
print(len(query))
print(len(dbsearch))

1000
100000


In [15]:
%%time
query = get_embeddings_for_all_sentences(query)

CPU times: user 4.14 s, sys: 23 ms, total: 4.16 s
Wall time: 4.16 s


In [16]:
%%time
dbsearch = get_embeddings_for_all_sentences(dbsearch)

CPU times: user 7min 31s, sys: 263 ms, total: 7min 32s
Wall time: 7min 31s


In [17]:
def rank(t_query, i, db_search): 
    dists = cosine_distances(t_query.reshape(1, -1), db_search) # pega todas as distâncias de Dq[i] as trajs do db_search (D_qUD_p)
    dists = dists.flatten()
    order = dists.argsort() # pega a ordem
    ranks = order.argsort() # pega o rank

    return ranks[i] + 1 # retorna o ranking de ta' no dbsearch | soma +1 pq o argsort rankea a partir de 0 (zero)

In [18]:
def acc(ranks):
    count = 0
    for i in range(len(ranks)):
        if ranks[i] == 1:
            count += 1

    return round(count/len(ranks), 2)

In [19]:
def mr(ranks):
    return (sum(ranks)/len(ranks))

In [20]:
def mrr(ranks):
    count = 0
    for i in range(len(ranks)):
        count += 1/ranks[i]

    return round(count/len(ranks), 2)

In [21]:
# Intervalo de Confiança do Ranks
def cip_r(ranks):
    data = []
    for i in range(len(ranks)):
        data.append(ranks[i]) # Add os Ranks
    
    #create 95% confidence interval for population mean weight
    ic = st.t.interval(alpha=0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data))
    
    return tuple(round(valor, 3) for valor in ic) # arredonda pra 3 casas decimais

In [22]:
# Intervalo de Confiança dos Reciprocal Ranks
def cip_rr(ranks):
    data = []
    for i in range(len(ranks)):
        data.append(1/ranks[i]) # Add os RRs...
    
    #create 95% confidence interval for population mean weight
    ic = st.t.interval(alpha=0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data))
    
    return tuple(round(valor, 3) for valor in ic) # arredonda pra 3 casas decimais

In [23]:
%%time
# checkpoint-9000 (best_model)
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    ranks = []
    search = dbsearch[:dbsize]
    for i in range(len(query)):
        ranks.append(rank(query[i], i, search)) # rank das ta' no dbsearch!
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 1.771(1.349, 2.193), Acc: 0.92, MRR: 0.95(0.935, 0.958) with dbsearch size: 20000
Mean rank: 2.414(1.68, 3.148), Acc: 0.9, MRR: 0.93(0.915, 0.942) with dbsearch size: 40000
Mean rank: 3.078(2.027, 4.129), Acc: 0.88, MRR: 0.91(0.9, 0.93) with dbsearch size: 60000
Mean rank: 3.92(2.451, 5.389), Acc: 0.87, MRR: 0.9(0.889, 0.921) with dbsearch size: 80000
Mean rank: 4.634(2.8, 6.468), Acc: 0.86, MRR: 0.9(0.881, 0.914) with dbsearch size: 100000
CPU times: user 24min 17s, sys: 38min 37s, total: 1h 2min 54s
Wall time: 18min 37s
