In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m49.5 MB/s[0m eta [36m0:00:0

In [2]:
import pandas as pd
import numpy as np
import torch
import json
import scipy.stats as st
import torch.nn.utils.rnn as rnn_utils

from sklearn.metrics.pairwise import cosine_distances
from transformers import BertConfig, BertForMaskedLM, BertTokenizer, BertModel

In [3]:
!ls '../data/'

D_p-pts.pickle	exp1-trj.h5	exp1-trj.t   README.md	   train.trg  vocab.txt
D_q-pts.pickle	exp1-trj.label	exp2-trj.h5  saved_models  val.src
Dq-pts.pickle	exp1-trj.pts	porto.csv    train.src	   val.trg


In [4]:
# Load the tokenizer of t2vec
vocab_file_dir = '../data/vocab.txt'
tokenizer =  BertTokenizer.from_pretrained(vocab_file_dir)



In [5]:
!ls '../data/saved_models/BERT/'

best_model	  checkpoint-15000  s1024  s64
checkpoint-10000  checkpoint-20000  s256   s768


In [6]:
# Carregando o modelo treinado:
config = BertConfig.from_json_file('../data/saved_models/BERT/best_model/config.json')
config.output_hidden_states=True
model = BertModel.from_pretrained('../data/saved_models/BERT/best_model/', local_files_only=True, config=config)

Some weights of the model checkpoint at ../data/saved_models/BERT/best_model/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../data/saved_models/BERT/best_model/ and are newly initialized: ['bert.pooler.dense.bi

In [7]:
def get_embedding_mean_for_all_trajs(list_trajs):
    tokenized_trajs = list_trajs
    indexed_trajs_tokens = [tokenizer.convert_tokens_to_ids(traj) for traj in tokenized_trajs]

    # Preenchendo as sequências para ter o mesmo comprimento (valor de preenchimento padrão = 0)
    padded_inputs = rnn_utils.pad_sequence([torch.tensor(seq) for seq in indexed_trajs_tokens], batch_first=True)
    #padded_inputs = padded_inputs.to(device)

    with torch.no_grad():
        outputs = model(padded_inputs)

    # Calcula a média dos embeddings de cada sentença (traj)
    sentence_embeddings = torch.mean(outputs.last_hidden_state, dim=1)
    return sentence_embeddings

In [8]:
trajectories = [['506', '112', '144', '148', '250', '258', '384'], ['148', '250', '258', '384']]

In [9]:
vecs = get_embedding_mean_for_all_trajs(trajectories)
vecs

tensor([[ 0.5671,  1.0142,  0.4156,  ...,  0.9171, -0.3727,  1.4593],
        [ 0.5671,  1.0142,  0.4156,  ...,  0.9171, -0.3727,  1.4593]])

In [10]:
vecs.shape

torch.Size([2, 2048])

In [11]:
# dimensão dos embeddings
dim = vecs.shape[1]
dim

2048

In [12]:
!head -n 4 '../data/exp1-trj.t'

508 465 1641 857 3176 1346 1301 3303 3277 3977 4430 8513 9755 11383 9496 12228 11150 13279 9215 17279 14428 9279 14792 14310 18351 7997 15024 15267 15665 16329 15125 14591 14797 3
19 191 68 41 46 4 964 543 154 171 382 732 632 923 4815 460 273 439 607 908 726 6277 5338 5821 6933 5438 8005 10438 9812 17030 12351 12132 13988 9320 5581 13142 15720 14944 5048 11012
51 2263 345 53 120 405 803 585 692 1566 533 1728 1880 739 544 226 8
15 4 58 9 38 54 196 648 193 1022 575 1560 753 229 260 532 317 764 159 633 1304 207 798 2811 301 211 880 80 23 89 40 537 781 141 5900 240 38 54 655 1616 196 648 437 749 575


In [13]:
# Carregando as trajs de teste:
trajs_teste = []
with open('../data/exp1-trj.t') as f:
    for line in f:
        traj_list = line.strip().split()
        trajs_teste.append(traj_list)

In [14]:
print("Quantidade de trajetórias de teste:", len(trajs_teste))

Quantidade de trajetórias de teste: 101000


In [15]:
print(trajs_teste[0]) # --> primeira query (traj par)

['508', '465', '1641', '857', '3176', '1346', '1301', '3303', '3277', '3977', '4430', '8513', '9755', '11383', '9496', '12228', '11150', '13279', '9215', '17279', '14428', '9279', '14792', '14310', '18351', '7997', '15024', '15267', '15665', '16329', '15125', '14591', '14797', '3']


In [16]:
print(trajs_teste[1000]) # --> mais semelhante a primeira query, ou seja, traj ímpar!

['1089', '465', '1123', '2173', '185', '307', '553', '4012', '3442', '4296', '4498', '7490', '7819', '7630', '9013', '11297', '11988', '10689', '18272', '12037', '11976', '15001', '15262', '15156', '14901', '7997', '13957', '16913', '16146', '15069', '15704', '16382', '16121', '16271', '3']


In [17]:
# Segmentando: query (trajs pares) e dbsearch (querys ímpar + 99000 outras ímpares)
query = trajs_teste[:1000] # trajs query (pares)
dbsearch = trajs_teste[1000:101000] # dbsearch trajs (as 1000 primeiras são as query ímpar)

In [18]:
print(len(query))
print(len(dbsearch))

1000
100000


In [19]:
# Abaixo, pegamos os embeddings das trajs por lotes (10 em 10 trajs). Assim, evita-se estouro de memória...

In [20]:
%%time
query_aux = torch.zeros(1000, dim)

for i in range(0, len(query), 10):
    query_aux[i:i+10] = get_embedding_mean_for_all_trajs(query[i:i+10]) # Pegando de 10 em 10
    
    # Calcula a porcentagem concluída
    percent_done = ((i / len(query)) * 100)+1
    print(f"Progresso: {percent_done:.2f}% concluído", end="\r")  # A opção `end="\r"` permite que a impressão seja substituída na mesma linha
    
query = query_aux

CPU times: user 19min 57s, sys: 3.53 s, total: 20min 1s
Wall time: 5min 4s


In [None]:
%%time
dbsearch_aux = torch.zeros(100000, dim)

for i in range(0, len(dbsearch), 10):
    dbsearch_aux[i:i+10] = get_embedding_mean_for_all_trajs(dbsearch[i:i+10]) # Pegando de 10 em 10
    
    # Calcula a porcentagem concluída
    percent_done = ((i / len(dbsearch)) * 100)+1
    print(f"Progresso: {percent_done:.2f}% concluído", end="\r")  # A opção `end="\r"` permite que a impressão seja substituída na mesma linha
    
dbsearch = dbsearch_aux

Progresso: 50.86% concluído

In [None]:
def rank(t_query, i, db_search): 
    dists = cosine_distances(t_query.reshape(1, -1), db_search) # pega todas as distâncias de Dq[i] as trajs do db_search (D_qUD_p)
    dists = dists.flatten()
    order = dists.argsort() # pega a ordem
    ranks = order.argsort() # pega o rank

    return ranks[i] + 1 # retorna o ranking de ta' no dbsearch | soma +1 pq o argsort rankea a partir de 0 (zero)

In [None]:
def acc(ranks):
    count = 0
    for i in range(len(ranks)):
        if ranks[i] == 1:
            count += 1

    return round(count/len(ranks), 2)

In [None]:
def mr(ranks):
    return (sum(ranks)/len(ranks))

In [None]:
def mrr(ranks):
    count = 0
    for i in range(len(ranks)):
        count += 1/ranks[i]

    return round(count/len(ranks), 2)

In [None]:
# Intervalo de Confiança do Ranks
def cip_r(ranks):
    data = []
    for i in range(len(ranks)):
        data.append(ranks[i]) # Add os Ranks
    
    #create 95% confidence interval for population mean weight
    ic = st.t.interval(alpha=0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data))
    
    return tuple(round(valor, 3) for valor in ic) # arredonda pra 3 casas decimais

In [None]:
# Intervalo de Confiança dos Reciprocal Ranks
def cip_rr(ranks):
    data = []
    for i in range(len(ranks)):
        data.append(1/ranks[i]) # Add os RRs...
    
    #create 95% confidence interval for population mean weight
    ic = st.t.interval(alpha=0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data))
    
    return tuple(round(valor, 3) for valor in ic) # arredonda pra 3 casas decimais

## Usando Discretização de Cels do t2vec:

### BertConfig():
    hidden_size=2048,
    num_hidden_layers=6,
    num_attention_heads=16,
    max_position_embeddings=512

In [None]:
%%time
# bert model ep1, s2048, best_model
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    ranks = []
    search = dbsearch[:dbsize]
    for i in range(len(query)):
        ranks.append(rank(query[i], i, search)) # rank das ta em fatias do dbsearch!
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

### BertConfig():
    hidden_size=1024,
    num_hidden_layers=6,
    num_attention_heads=16,
    max_position_embeddings=512

In [28]:
%%time
# bert model ep1, s1024, best_model
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    ranks = []
    search = dbsearch[:dbsize]
    for i in range(len(query)):
        ranks.append(rank(query[i], i, search)) # rank das ta em fatias do dbsearch!
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 1.408(1.217, 1.599), Acc: 0.94, MRR: 0.96(0.947, 0.969) with dbsearch size: 20000
Mean rank: 1.784(1.409, 2.159), Acc: 0.91, MRR: 0.94(0.927, 0.952) with dbsearch size: 40000
Mean rank: 2.174(1.598, 2.75), Acc: 0.9, MRR: 0.93(0.915, 0.942) with dbsearch size: 60000
Mean rank: 2.629(1.83, 3.428), Acc: 0.88, MRR: 0.92(0.901, 0.931) with dbsearch size: 80000
Mean rank: 2.963(2.011, 3.915), Acc: 0.86, MRR: 0.91(0.891, 0.922) with dbsearch size: 100000
CPU times: user 30min 21s, sys: 38min 11s, total: 1h 8min 33s
Wall time: 21min 47s


### BertConfig():
    hidden_size=64,
    num_hidden_layers=6,
    num_attention_heads=8,
    max_position_embeddings=512

In [28]:
%%time
# bert model ep1, s64, best_model
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    ranks = []
    search = dbsearch[:dbsize]
    for i in range(len(query)):
        ranks.append(rank(query[i], i, search)) # rank das ta em fatias do dbsearch!
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 1.851(1.538, 2.164), Acc: 0.86, MRR: 0.91(0.894, 0.924) with dbsearch size: 20000
Mean rank: 2.618(2.043, 3.193), Acc: 0.82, MRR: 0.87(0.856, 0.891) with dbsearch size: 40000
Mean rank: 3.361(2.529, 4.193), Acc: 0.8, MRR: 0.85(0.834, 0.872) with dbsearch size: 60000
Mean rank: 4.241(3.105, 5.377), Acc: 0.77, MRR: 0.83(0.812, 0.852) with dbsearch size: 80000
Mean rank: 5.046(3.626, 6.466), Acc: 0.76, MRR: 0.82(0.801, 0.842) with dbsearch size: 100000
CPU times: user 11min 4s, sys: 29min 23s, total: 40min 28s
Wall time: 10min 11s


### BertConfig():
    hidden_size=256,
    num_hidden_layers=6,
    num_attention_heads=16,
    max_position_embeddings=512

In [30]:
%%time
# bert model ep1, s256, best_model
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    ranks = []
    search = dbsearch[:dbsize]
    for i in range(len(query)):
        ranks.append(rank(query[i], i, search)) # rank das ta em fatias do dbsearch!
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 1.957(1.285, 2.629), Acc: 0.9, MRR: 0.93(0.917, 0.944) with dbsearch size: 20000
Mean rank: 2.827(1.526, 4.128), Acc: 0.88, MRR: 0.91(0.896, 0.927) with dbsearch size: 40000
Mean rank: 3.643(1.767, 5.519), Acc: 0.86, MRR: 0.9(0.882, 0.915) with dbsearch size: 60000
Mean rank: 4.664(2.08, 7.248), Acc: 0.83, MRR: 0.88(0.861, 0.897) with dbsearch size: 80000
Mean rank: 5.615(2.312, 8.918), Acc: 0.82, MRR: 0.87(0.848, 0.884) with dbsearch size: 100000
CPU times: user 16min 6s, sys: 37min 23s, total: 53min 29s
Wall time: 13min 30s


### BertConfig():
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=512

In [33]:
%%time
# bert model ep1, s768, best_model
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    ranks = []
    search = dbsearch[:dbsize]
    for i in range(len(query)):
        ranks.append(rank(query[i], i, search)) # rank das ta em fatias do dbsearch!
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 1.47(1.283, 1.657), Acc: 0.92, MRR: 0.94(0.933, 0.957) with dbsearch size: 20000
Mean rank: 1.872(1.531, 2.213), Acc: 0.88, MRR: 0.92(0.907, 0.935) with dbsearch size: 40000
Mean rank: 2.276(1.763, 2.789), Acc: 0.87, MRR: 0.91(0.895, 0.925) with dbsearch size: 60000
Mean rank: 2.779(2.073, 3.485), Acc: 0.85, MRR: 0.9(0.881, 0.913) with dbsearch size: 80000
Mean rank: 3.162(2.31, 4.014), Acc: 0.84, MRR: 0.89(0.871, 0.904) with dbsearch size: 100000
CPU times: user 24min 42s, sys: 37min 41s, total: 1h 2min 24s
Wall time: 18min 34s
