In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m49.5 MB/s[0m eta [36m0:00:0

In [2]:
import pandas as pd
import numpy as np
import torch
import json
import scipy.stats as st
import torch.nn.utils.rnn as rnn_utils

from sklearn.metrics.pairwise import cosine_distances
from transformers import BertConfig, BertForMaskedLM, BertTokenizer, BertModel

In [3]:
!ls '../data/'

df_tdrive_ALL-taxis.csv  exp1-trj.h5	 tdrive.csv		   val2.src
D_p-pts.pickle		 exp1-trj.label  tdrive_formato_t2vec.csv  val.mta
D_q-pts.pickle		 exp1-trj.t	 train.mta		   val.src
Dq-pts.pickle		 mv.csv		 train.src		   val.trg
exp1-querydb.h5		 saved_models	 train.trg		   vocab.txt


In [4]:
# Load the tokenizer of t2vec
vocab_file_dir = '../data/vocab.txt'
tokenizer =  BertTokenizer.from_pretrained(vocab_file_dir)



In [5]:
!ls '../data/saved_models/BERT/'

best_model	   checkpoint-140000  checkpoint-150000  tmp
checkpoint-135000  checkpoint-145000  checkpoint-155000


In [28]:
# Carregando o modelo treinado:
config = BertConfig.from_json_file('../data/saved_models/BERT/best_model/config.json')
config.output_hidden_states=True
model = BertModel.from_pretrained('../data/saved_models/BERT/best_model/', local_files_only=True, config=config)

Some weights of the model checkpoint at ../data/saved_models/BERT/best_model/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../data/saved_models/BERT/best_model/ and are newly initialized: ['bert.pooler.dense.we

In [7]:
def get_embedding_mean_for_all_trajs(list_trajs):
    tokenized_trajs = list_trajs
    indexed_trajs_tokens = [tokenizer.convert_tokens_to_ids(traj) for traj in tokenized_trajs]

    # Preenchendo as sequências para ter o mesmo comprimento (valor de preenchimento padrão = 0)
    padded_inputs = rnn_utils.pad_sequence([torch.tensor(seq) for seq in indexed_trajs_tokens], batch_first=True)
    #padded_inputs = padded_inputs.to(device)

    with torch.no_grad():
        outputs = model(padded_inputs)

    # Calcula a média dos embeddings de cada sentença (traj)
    sentence_embeddings = torch.mean(outputs.last_hidden_state, dim=1)
    return sentence_embeddings

In [8]:
trajectories = [['17321', '10721', '17321', '10721', '11693'], ['17321', '10721', '11693']]

In [9]:
embs = get_embedding_mean_for_all_trajs(trajectories)
embs

tensor([[ 0.5071,  0.0927,  0.8889,  ..., -0.1824,  0.5921, -0.5153],
        [ 0.9438,  0.2829,  0.3659,  ..., -0.5627,  0.6390, -0.6088]])

In [10]:
emb_dim = embs.shape[1]
emb_dim

768

In [11]:
!head -n 21 '../data/exp1-trj.t'

3
3
3
3
3
3
3
3
3
3
9926 3189 88 7163 4076 13490 19059 13124 7310 5479
9042 3107 4616 5742 4717 6339 15188 18899 14919 7274 11265 1081 7609 1351 393
427 5334 8388 8967 4445 2056
542 4013 17293 3322 11357 483 11138
481 8071 657 2231 1769 173 1579 6198
854 3107 5989 8778 6218 7338
5773 8574 13056 11926 15551 6461
4134 3764 9341 15412 13024 9086
962 9873 469 373 6096 567 8620
14405 4076 5305 3107 6807 1155
4913 3694 3340 5104 95


In [12]:
# Carregando as trajs de teste:
trajs_teste = []
with open('../data/exp1-trj.t') as f:
    for line in f:
        traj_list = line.strip().split()
        trajs_teste.append(traj_list)

In [13]:
print("Quantidade de trajetórias de teste:", len(trajs_teste))

Quantidade de trajetórias de teste: 50500


In [14]:
print(trajs_teste[10]) # --> primeira query (traj par)

['9926', '3189', '88', '7163', '4076', '13490', '19059', '13124', '7310', '5479']


In [15]:
print(trajs_teste[510]) # --> mais semelhante a primeira query, ou seja, traj ímpar!

['9926', '8231', '92', '12594', '14968', '11048', '7821', '577']


In [16]:
# Segmentando: query (trajs pares) e dbsearch (querys ímpar + 49500 outras ímpares)
query = trajs_teste[:500] # trajs query (pares)
dbsearch = trajs_teste[500:50500] # dbsearch trajs (as 500 primeiras são as query ímpar)

In [17]:
print(len(query))
print(len(dbsearch))

500
50000


In [18]:
# Abaixo, pegamos os embeddings das trajs por lotes (10 em 10 trajs). Assim, evita-se estouro de memória...

In [19]:
%%time
query_aux = torch.zeros(len(query), emb_dim)

for i in range(0, len(query), 10):
    query_aux[i:i+10] = get_embedding_mean_for_all_trajs(query[i:i+10]) # Pegando de 10 em 10
    
    # Calcula a porcentagem concluída
    percent_done = ((i / len(query)) * 100)+1
    print(f"Progresso: {percent_done:.2f}% concluído", end="\r")  # A opção `end="\r"` permite que a impressão seja substituída na mesma linha
    
query = query_aux

CPU times: user 2min 22s, sys: 48.2 ms, total: 2min 22s
Wall time: 36.1 s


In [20]:
%%time
dbsearch_aux = torch.zeros(len(dbsearch), emb_dim)

for i in range(0, len(dbsearch), 10):
    dbsearch_aux[i:i+10] = get_embedding_mean_for_all_trajs(dbsearch[i:i+10]) # Pegando de 10 em 10
    
    # Calcula a porcentagem concluída
    percent_done = ((i / len(dbsearch)) * 100)+1
    print(f"Progresso: {percent_done:.2f}% concluído", end="\r")  # A opção `end="\r"` permite que a impressão seja substituída na mesma linha
    
dbsearch = dbsearch_aux

CPU times: user 3h 37min 28s, sys: 7.5 s, total: 3h 37min 36s
Wall time: 55min 54s


In [21]:
def rank(t_query, i, db_search): 
    dists = cosine_distances(t_query.reshape(1, -1), db_search) # pega todas as distâncias de Dq[i] as trajs do db_search (D_qUD_p)
    dists = dists.flatten()
    order = dists.argsort() # pega a ordem
    ranks = order.argsort() # pega o rank

    return ranks[i] + 1 # retorna o ranking de ta' no dbsearch | soma +1 pq o argsort rankea a partir de 0 (zero)

In [22]:
def acc(ranks):
    count = 0
    for i in range(len(ranks)):
        if ranks[i] == 1:
            count += 1

    return round(count/len(ranks), 2)

In [23]:
def mr(ranks):
    return (sum(ranks)/len(ranks))

In [24]:
def mrr(ranks):
    count = 0
    for i in range(len(ranks)):
        count += 1/ranks[i]

    return round(count/len(ranks), 2)

In [25]:
# Intervalo de Confiança do Ranks
def cip_r(ranks):
    data = []
    for i in range(len(ranks)):
        data.append(ranks[i]) # Add os Ranks
    
    #create 95% confidence interval for population mean weight
    ic = st.t.interval(alpha=0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data))
    
    return tuple(round(valor, 3) for valor in ic) # arredonda pra 3 casas decimais

In [26]:
# Intervalo de Confiança dos Reciprocal Ranks
def cip_rr(ranks):
    data = []
    for i in range(len(ranks)):
        data.append(1/ranks[i]) # Add os RRs...
    
    #create 95% confidence interval for population mean weight
    ic = st.t.interval(alpha=0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data))
    
    return tuple(round(valor, 3) for valor in ic) # arredonda pra 3 casas decimais

## Usando Discretização de Cels do t2vec:

### BertConfig():
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=512

In [118]:
%%time
# ep1, s768, best_model
dbsizes = [10000, 20000, 30000, 40000, 50000]
for dbsize in dbsizes:
    ranks = []
    search = dbsearch[:dbsize]
    for i in range(len(query)):
        ranks.append(rank(query[i], i, search)) # rank das ta em fatias do dbsearch!
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 83.344(51.105, 115.583), Acc: 0.66, MRR: 0.72(0.688, 0.759) with dbsearch size: 10000
Mean rank: 151.736(92.822, 210.65), Acc: 0.65, MRR: 0.7(0.668, 0.741) with dbsearch size: 20000
Mean rank: 207.646(124.241, 291.051), Acc: 0.65, MRR: 0.7(0.66, 0.735) with dbsearch size: 30000
Mean rank: 262.304(157.7, 366.908), Acc: 0.64, MRR: 0.69(0.649, 0.724) with dbsearch size: 40000
Mean rank: 345.338(205.169, 485.507), Acc: 0.62, MRR: 0.68(0.64, 0.715) with dbsearch size: 50000
CPU times: user 11min 5s, sys: 16min 50s, total: 27min 56s
Wall time: 7min 22s


In [73]:
%%time
# ep1, s768, checkpoint-110000
dbsizes = [10000, 20000, 30000, 40000, 50000]
for dbsize in dbsizes:
    ranks = []
    search = dbsearch[:dbsize]
    for i in range(len(query)):
        ranks.append(rank(query[i], i, search)) # rank das ta em fatias do dbsearch!
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 96.698(51.093, 142.303), Acc: 0.64, MRR: 0.71(0.671, 0.743) with dbsearch size: 10000
Mean rank: 180.668(92.038, 269.298), Acc: 0.63, MRR: 0.69(0.653, 0.726) with dbsearch size: 20000
Mean rank: 256.87(126.934, 386.806), Acc: 0.61, MRR: 0.67(0.632, 0.707) with dbsearch size: 30000
Mean rank: 328.794(157.884, 499.704), Acc: 0.6, MRR: 0.66(0.621, 0.697) with dbsearch size: 40000
Mean rank: 417.794(201.379, 634.209), Acc: 0.59, MRR: 0.65(0.613, 0.69) with dbsearch size: 50000
CPU times: user 11min 31s, sys: 17min, total: 28min 31s
Wall time: 7min 30s
