In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m49.5 MB/s[0m eta [36m0:00:0

In [1]:
import time
import json
import torch
import numpy as np
from sklearn.neighbors import KDTree
import torch.nn.utils.rnn as rnn_utils

from transformers import BertConfig, BertForMaskedLM, BertTokenizer, BertModel

In [2]:
!ls '../data/'

D_p-pts.pickle	exp1-trj.h5	exp1-trj.t   README.md	   train.trg  vocab.txt
D_q-pts.pickle	exp1-trj.label	exp2-trj.h5  saved_models  val.src
Dq-pts.pickle	exp1-trj.pts	porto.csv    train.src	   val.trg


In [3]:
# Load the tokenizer of t2vec
vocab_file_dir = '../data/vocab.txt'
tokenizer =  BertTokenizer.from_pretrained(vocab_file_dir)



In [4]:
!ls '../data/saved_models/BERT/'

best_model  checkpoint-15000


In [5]:
# Carregando o modelo treinado:
config = BertConfig.from_json_file('../data/saved_models/BERT/best_model/config.json')
config.output_hidden_states=True
model = BertModel.from_pretrained('../data/saved_models/BERT/best_model/', local_files_only=True, config=config)

Some weights of the model checkpoint at ../data/saved_models/BERT/best_model/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../data/saved_models/BERT/best_model/ and are newly initialized: ['bert.pooler.dense.we

In [6]:
def get_embedding_mean_for_all_trajs(list_trajs):
    tokenized_trajs = list_trajs
    indexed_trajs_tokens = [tokenizer.convert_tokens_to_ids(traj) for traj in tokenized_trajs]

    # Preenchendo as sequências para ter o mesmo comprimento (valor de preenchimento padrão = 0)
    padded_inputs = rnn_utils.pad_sequence([torch.tensor(seq) for seq in indexed_trajs_tokens], batch_first=True)
    #padded_inputs = padded_inputs.to(device)

    with torch.no_grad():
        outputs = model(padded_inputs)

    # Calcula a média dos embeddings de cada sentença (traj)
    sentence_embeddings = torch.mean(outputs.last_hidden_state, dim=1)
    return sentence_embeddings

In [7]:
trajectories = [['506', '112', '144', '148', '250', '258', '384'], ['148', '250', '258', '384']]

In [8]:
get_embedding_mean_for_all_trajs(trajectories)

tensor([[ 0.5477, -0.2066,  0.2435,  ..., -0.4758, -0.9001,  0.2682],
        [ 0.4124, -0.0651,  0.5063,  ..., -0.4328, -1.0516,  0.2307]])

In [9]:
!head -n 4 '../data/exp1-trj.t'

508 465 1641 857 3176 1346 1301 3303 3277 3977 4430 8513 9755 11383 9496 12228 11150 13279 9215 17279 14428 9279 14792 14310 18351 7997 15024 15267 15665 16329 15125 14591 14797 3
19 191 68 41 46 4 964 543 154 171 382 732 632 923 4815 460 273 439 607 908 726 6277 5338 5821 6933 5438 8005 10438 9812 17030 12351 12132 13988 9320 5581 13142 15720 14944 5048 11012
51 2263 345 53 120 405 803 585 692 1566 533 1728 1880 739 544 226 8
15 4 58 9 38 54 196 648 193 1022 575 1560 753 229 260 532 317 764 159 633 1304 207 798 2811 301 211 880 80 23 89 40 537 781 141 5900 240 38 54 655 1616 196 648 437 749 575


In [10]:
# Carregando as trajs de teste:
trajs_teste = []
with open('../data/exp1-trj.t') as f:
    for line in f:
        traj_list = line.strip().split()
        trajs_teste.append(traj_list)

In [11]:
print("Quantidade de trajetórias de teste:", len(trajs_teste))

Quantidade de trajetórias de teste: 101000


In [12]:
print(trajs_teste[0]) # --> primeira query (traj par)

['508', '465', '1641', '857', '3176', '1346', '1301', '3303', '3277', '3977', '4430', '8513', '9755', '11383', '9496', '12228', '11150', '13279', '9215', '17279', '14428', '9279', '14792', '14310', '18351', '7997', '15024', '15267', '15665', '16329', '15125', '14591', '14797', '3']


In [13]:
print(trajs_teste[1000]) # --> mais semelhante a primeira query, ou seja, traj ímpar!

['1089', '465', '1123', '2173', '185', '307', '553', '4012', '3442', '4296', '4498', '7490', '7819', '7630', '9013', '11297', '11988', '10689', '18272', '12037', '11976', '15001', '15262', '15156', '14901', '7997', '13957', '16913', '16146', '15069', '15704', '16382', '16121', '16271', '3']


In [14]:
# Segmentando: query (trajs pares) e dbsearch (querys ímpar + 99000 outras ímpares)
query = trajs_teste[:1000] # trajs query (pares)
dbsearch = trajs_teste[1000:101000] # dbsearch trajs (as 1000 primeiras são as query ímpar)

In [15]:
print(len(query))
print(len(dbsearch))

1000
100000


In [16]:
# Abaixo, pegamos os embeddings das trajs por lotes (10 em 10 trajs). Assim, evita-se estouro de memória...

In [17]:
%%time
query_aux = torch.zeros(1000, 768)

for i in range(0, len(query), 10):
    query_aux[i:i+10] = get_embedding_mean_for_all_trajs(query[i:i+10]) # Pegando de 10 em 10
    
    # Calcula a porcentagem concluída
    percent_done = ((i / len(query)) * 100)+1
    print(f"Progresso: {percent_done:.2f}% concluído", end="\r")  # A opção `end="\r"` permite que a impressão seja substituída na mesma linha
    
query = query_aux

CPU times: user 6min 6s, sys: 483 ms, total: 6min 6s
Wall time: 1min 38s


In [18]:
%%time
dbsearch_aux = torch.zeros(100000, 768)

for i in range(0, len(dbsearch), 10):
    dbsearch_aux[i:i+10] = get_embedding_mean_for_all_trajs(dbsearch[i:i+10]) # Pegando de 10 em 10
    
    # Calcula a porcentagem concluída
    percent_done = ((i / len(dbsearch)) * 100)+1
    print(f"Progresso: {percent_done:.2f}% concluído", end="\r")  # A opção `end="\r"` permite que a impressão seja substituída na mesma linha
    
dbsearch = dbsearch_aux

CPU times: user 8h 52min 48s, sys: 13.6 s, total: 8h 53min 1s
Wall time: 2h 15min 27s


In [19]:
print(type(query))
print(type(dbsearch))

<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [20]:
# Convertendo de Tensor pra Numpy
query = query.numpy()
dbsearch = dbsearch.numpy()

In [21]:
print(type(query))
print(type(dbsearch))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


## Time efficiency of BERT using KDTree 

### BertConfig():
    hidden_size=768, (embedding size)
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=512

In [22]:
def knn(q, db, k):
    tree = KDTree(db)
    
    start_time = time.time()
    for i in range(len(q)):
        _, ind = tree.query([q[i]], k=k)
    end_time = time.time()
    elapsed_time = round(end_time - start_time, 2)
    print(f"Knn time: {elapsed_time} segundos, with dbsize: {len(db)}")

In [23]:
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    knn(query, dbsearch[:dbsize], 50)

Knn time: 18.77 segundos, with dbsize: 20000
Knn time: 37.57 segundos, with dbsize: 40000
Knn time: 59.62 segundos, with dbsize: 60000
Knn time: 75.26 segundos, with dbsize: 80000
Knn time: 104.32 segundos, with dbsize: 100000
