In [1]:
import time
import gensim
import numpy as np
#import pandas as pd
#import scipy.stats as st

from sklearn.neighbors import KDTree
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
# Peek at the first four lines of the trajectory file (one trajectory per line).
!head -n 4 '../data/exp1-trj.t'

508 465 1641 857 3176 1346 1301 3303 3277 3977 4430 8513 9755 11383 9496 12228 11150 13279 9215 17279 14428 9279 14792 14310 18351 7997 15024 15267 15665 16329 15125 14591 14797 3
19 191 68 41 46 4 964 543 154 171 382 732 632 923 4815 460 273 439 607 908 726 6277 5338 5821 6933 5438 8005 10438 9812 17030 12351 12132 13988 9320 5581 13142 15720 14944 5048 11012
51 2263 345 53 120 405 803 585 692 1566 533 1728 1880 739 544 226 8
15 4 58 9 38 54 196 648 193 1022 575 1560 753 229 260 532 317 764 159 633 1304 207 798 2811 301 211 880 80 23 89 40 537 781 141 5900 240 38 54 655 1616 196 648 437 749 575


In [3]:
# Read the test file: every line is one trajectory, split on whitespace into
# cell tokens that are kept as strings.
with open('../data/exp1-trj.t', 'r') as fh:
    trajs_teste = [row.strip().split() for row in fh]

In [4]:
# Sanity check: number of trajectories read from the test file.
print("Quantidade de trajetórias de teste:", len(trajs_teste))

Quantidade de trajetórias de teste: 101000


In [5]:
print(trajs_teste[2]) # an "even" query

['51', '2263', '345', '53', '120', '405', '803', '585', '692', '1566', '533', '1728', '1880', '739', '544', '226', '8']


In [6]:
print(trajs_teste[1002]) # "target" of the even query above, i.e. the matching "odd" query (index 1000 + 2)

['51', '430', '345', '120', '856', '131', '673', '585', '233', '2200', '533', '361', '1299', '1215', '66', '588', '7', '8']


In [7]:
# List the saved Doc2Vec model files available on disk.
!ls '../data/saved_models/D2V/'

d2v_w5_s64_ep30_dm0.model
d2v_w5_s64_ep30_dm0.model.dv.vectors.npy
d2v_w5_s64_ep40_dm0.model
d2v_w5_s64_ep40_dm0.model.dv.vectors.npy


In [8]:
# Load the saved Doc2Vec model from disk.
model = Doc2Vec.load('../data/saved_models/D2V/d2v_w5_s64_ep30_dm0.model')

In [9]:
# Vocabulary size: number of keys in the model's word-vector index.
len(model.wv.key_to_index)

18827

In [10]:
# Dimensionality of the model's vectors; used below as the embedding width.
model.vector_size

64

In [11]:
# Retrieve the trajectories most similar to the trajectory with id = 0.
# The cell output shows the result as a list of (id, score) pairs.
mosts = model.dv.most_similar(0)
mosts

[(1, 0.9527559280395508),
 (6, 0.9382349252700806),
 (4, 0.9378328323364258),
 (2, 0.9327696561813354),
 (7, 0.9076344966888428),
 (8, 0.9008456468582153),
 (3, 0.8869067430496216),
 (12, 0.8747737407684326),
 (5, 0.8740744590759277),
 (13, 0.870564877986908)]

In [12]:
def get_embedding_for_all_trajs(trajs, d2v_model=None):
    """Infer one Doc2Vec embedding per trajectory and stack them into a matrix.

    Parameters
    ----------
    trajs : sequence of token sequences
        Trajectories to embed. Each item is passed unchanged to
        ``infer_vector``. The container must support ``len()``.
    d2v_model : optional
        Object exposing ``vector_size``, ``epochs`` and ``infer_vector``.
        When omitted, the notebook-level ``model`` is used, so existing
        one-argument calls behave as before.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(trajs), vector_size)``, e.g.
        ``(1000, 64)`` for 1000 queries and a 64-dimensional model. Rows follow
        the iteration order of ``trajs``.
    """
    if d2v_model is None:
        d2v_model = model  # fall back to the model loaded earlier in the notebook

    total = len(trajs)
    emb_trajs = np.zeros((total, d2v_model.vector_size), dtype=np.float32)

    last_whole_percent = -1
    for idx, traj in enumerate(trajs, start=1):
        emb_trajs[idx - 1] = d2v_model.infer_vector(traj, epochs=d2v_model.epochs)

        # Refresh the progress line only when the whole-number percentage
        # changes, instead of writing to stdout once per trajectory.
        whole_percent = idx * 100 // total
        if whole_percent != last_whole_percent:
            last_whole_percent = whole_percent
            percent_done = (idx / total) * 100
            # end="\r" makes each update overwrite the previous one on the same line
            print(f"Progresso: {percent_done:.2f}% concluído", end="\r")

    return emb_trajs

In [13]:
# Split the test set into the queries ("even" trajectories) and the search
# database ("odd" queries + 99000 other "odd" trajectories):
#   - query:    the first 1000 trajectories
#   - dbsearch: the following 100000 trajectories, whose first 1000 entries
#               are the "odd" queries
query, dbsearch = trajs_teste[:1000], trajs_teste[1000:101000]

In [14]:
# Sanity check: sizes of the query set and of the search database.
print(len(query))
print(len(dbsearch))

1000
100000


In [15]:
# Compute the embeddings of the query trajectories.
# NOTE: this rebinds `query` from the token lists to the embedding matrix, so
# re-running this cell without first re-running the split cell would feed
# embeddings (not tokens) back into `infer_vector`.
query = get_embedding_for_all_trajs(query)

Progresso: 100.00% concluído

In [16]:
# Compute the embeddings of the search-database trajectories.
# NOTE: this rebinds `dbsearch` from the token lists to the embedding matrix,
# so re-running this cell without first re-running the split cell would feed
# embeddings (not tokens) back into `infer_vector`.
dbsearch = get_embedding_for_all_trajs(dbsearch)

Progresso: 100.00% concluído

## Time efficiency of D2V using KDTree 

In [17]:
def knn(q, db, k):
    """Time ``k``-nearest-neighbour lookups of every query in ``q`` against ``db``.

    A ``KDTree`` is built over ``db`` first. Only the query loop is timed, so
    index construction is excluded from the reported figure. Queries are issued
    one at a time rather than as a single batch, and the neighbours found are
    discarded: this function is a timing benchmark only.

    Parameters
    ----------
    q : sequence of vectors
        Query points, looked up one by one.
    db : array-like
        Points indexed by the tree. Must support ``len()``.
    k : int
        Number of neighbours requested per query.

    Returns
    -------
    float
        Unrounded seconds spent in the query loop. The printed value is the
        same measurement rounded to two decimals.
    """
    tree = KDTree(db)

    # perf_counter is the clock intended for measuring short intervals: it is
    # monotonic and is not affected by system clock adjustments.
    start_time = time.perf_counter()
    for point in q:
        tree.query([point], k=k)
    elapsed = time.perf_counter() - start_time

    elapsed_time = round(elapsed, 2)
    print(f"Knn time: {elapsed_time} segundos, with dbsize: {len(db)}")
    return elapsed

In [18]:
# Benchmark 50-NN search over growing prefixes of the search database.
dbsizes = list(range(20000, 100001, 20000))  # 20000, 40000, ..., 100000
for dbsize in dbsizes:
    knn(query, dbsearch[:dbsize], k=50)

Knn time: 1.19 segundos, with dbsize: 20000
Knn time: 4.41 segundos, with dbsize: 40000
Knn time: 7.94 segundos, with dbsize: 60000
Knn time: 10.52 segundos, with dbsize: 80000
Knn time: 14.33 segundos, with dbsize: 100000
