In [1]:
import pickle
import json as js
import numpy as np
import pandas as pd
import scipy.stats as st
from math import sin, cos, sqrt, atan2, radians

In [2]:
df = pd.read_csv('../data/porto.csv')
df

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."
...,...,...,...,...,...,...,...,...,...
1710665,1404171463620000698,C,,,20000698,1404171463,A,False,"[[-8.612469,41.14602],[-8.612487,41.145993],[-..."
1710666,1404171367620000670,C,,,20000670,1404171367,A,False,"[[-8.610138,41.140845],[-8.610174,41.140935],[..."
1710667,1388745716620000264,C,,,20000264,1388745716,A,False,[]
1710668,1404141826620000248,B,,12.0,20000248,1404141826,A,False,"[[-8.630712,41.154885],[-8.63073,41.154813],[-..."


In [3]:
# Removendo trajs com dados faltantes. Ou seja, que MISSING_DATA é igual True!
df = df[df['MISSING_DATA'] == False]
df

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."
...,...,...,...,...,...,...,...,...,...
1710665,1404171463620000698,C,,,20000698,1404171463,A,False,"[[-8.612469,41.14602],[-8.612487,41.145993],[-..."
1710666,1404171367620000670,C,,,20000670,1404171367,A,False,"[[-8.610138,41.140845],[-8.610174,41.140935],[..."
1710667,1388745716620000264,C,,,20000264,1388745716,A,False,[]
1710668,1404141826620000248,B,,12.0,20000248,1404141826,A,False,"[[-8.630712,41.154885],[-8.63073,41.154813],[-..."


In [4]:
# Removendo trajs de comprimento zero --> []
df = df[df['POLYLINE'] != '[]']
df

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."
...,...,...,...,...,...,...,...,...,...
1710663,1388660427620000585,C,,,20000585,1388660427,A,False,"[[-8.60697,41.162283],[-8.60697,41.16231],[-8...."
1710665,1404171463620000698,C,,,20000698,1404171463,A,False,"[[-8.612469,41.14602],[-8.612487,41.145993],[-..."
1710666,1404171367620000670,C,,,20000670,1404171367,A,False,"[[-8.610138,41.140845],[-8.610174,41.140935],[..."
1710668,1404141826620000248,B,,12.0,20000248,1404141826,A,False,"[[-8.630712,41.154885],[-8.63073,41.154813],[-..."


In [5]:
# Ordenando em ordem crescente:
df_tmp = df.copy()
df_tmp.sort_values(by='TIMESTAMP', inplace=True)
df = df_tmp
del df_tmp
df

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
577,1372636853620000380,C,,,20000380,1372636853,A,False,"[[-8.610291,41.140746],[-8.6103,41.140755],[-8..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
10,1372636875620000233,C,,,20000233,1372636875,A,False,"[[-8.619894,41.148009],[-8.620164,41.14773],[-..."
64,1372636896620000360,C,,,20000360,1372636896,A,False,"[[-8.617599,41.146137],[-8.617581,41.14593],[-..."
...,...,...,...,...,...,...,...,...,...
1710544,1404172626620000387,B,,37.0,20000387,1404172626,A,False,"[[-8.656803,41.161446],[-8.656803,41.161446],[..."
1710412,1404172716620000359,B,,61.0,20000359,1404172716,A,False,"[[-8.599248,41.149197],[-8.598645,41.148504],[..."
1710511,1404172718620000066,A,2002.0,,20000066,1404172718,A,False,"[[-8.601192,41.181741],[-8.601201,41.181741],[..."
1710603,1404172736620000483,C,,,20000483,1404172736,A,False,"[[-8.58609,41.175504],[-8.586351,41.174955],[-..."


In [6]:
df.reset_index(inplace=True)
del df['index']
df

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636853620000380,C,,,20000380,1372636853,A,False,"[[-8.610291,41.140746],[-8.6103,41.140755],[-8..."
1,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
2,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
3,1372636875620000233,C,,,20000233,1372636875,A,False,"[[-8.619894,41.148009],[-8.620164,41.14773],[-..."
4,1372636896620000360,C,,,20000360,1372636896,A,False,"[[-8.617599,41.146137],[-8.617581,41.14593],[-..."
...,...,...,...,...,...,...,...,...,...
1704754,1404172626620000387,B,,37.0,20000387,1404172626,A,False,"[[-8.656803,41.161446],[-8.656803,41.161446],[..."
1704755,1404172716620000359,B,,61.0,20000359,1404172716,A,False,"[[-8.599248,41.149197],[-8.598645,41.148504],[..."
1704756,1404172718620000066,A,2002.0,,20000066,1404172718,A,False,"[[-8.601192,41.181741],[-8.601201,41.181741],[..."
1704757,1404172736620000483,C,,,20000483,1404172736,A,False,"[[-8.58609,41.175504],[-8.586351,41.174955],[-..."


In [7]:
# Pegando as trajs (linhas no df) correspondentes as trajs query e dbsearch (querydb)
with open('../data/exp1-trj.label', 'r') as file:
    # Leia as linhas do arquivo e converta para inteiros
    querydb_lines = [int(line.strip()) for line in file]

In [8]:
print(querydb_lines)

[1020000, 1020007, 1020016, 1020018, 1020019, 1020020, 1020021, 1020024, 1020025, 1020046, 1020054, 1020072, 1020076, 1020088, 1020105, 1020107, 1020109, 1020115, 1020119, 1020121, 1020124, 1020125, 1020126, 1020128, 1020133, 1020134, 1020138, 1020139, 1020140, 1020149, 1020151, 1020152, 1020158, 1020159, 1020160, 1020175, 1020188, 1020190, 1020194, 1020195, 1020200, 1020203, 1020209, 1020211, 1020213, 1020218, 1020222, 1020227, 1020229, 1020231, 1020236, 1020237, 1020238, 1020241, 1020257, 1020259, 1020263, 1020266, 1020271, 1020280, 1020282, 1020290, 1020291, 1020297, 1020305, 1020316, 1020324, 1020333, 1020335, 1020344, 1020359, 1020361, 1020362, 1020363, 1020373, 1020377, 1020380, 1020389, 1020391, 1020396, 1020400, 1020404, 1020410, 1020415, 1020424, 1020425, 1020430, 1020436, 1020439, 1020446, 1020450, 1020452, 1020455, 1020464, 1020466, 1020471, 1020472, 1020478, 1020484, 1020494, 1020501, 1020502, 1020512, 1020513, 1020518, 1020519, 1020528, 1020531, 1020534, 1020537, 1020540, 

In [9]:
len(querydb_lines)

101000

In [10]:
# Filtrando apenas a linhas querydb
df = df[df.index.isin(querydb_lines)]
df.reset_index(inplace=True)
del df['index']
df

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1391858028620000671,B,,15.0,20000671,1391858028,A,False,"[[-8.585676,41.148621],[-8.585694,41.148639],[..."
1,1391858298620000476,B,,57.0,20000476,1391858298,A,False,"[[-8.610768,41.145201],[-8.610786,41.145201],[..."
2,1391858562620000017,B,,28.0,20000017,1391858562,A,False,"[[-8.584317,41.163174],[-8.584776,41.163066],[..."
3,1391858622620000480,A,2591.0,,20000480,1391858622,A,False,"[[-8.675199,41.151978],[-8.675316,41.15187],[-..."
4,1391858696620000633,A,2002.0,,20000633,1391858696,A,False,"[[-8.612145,41.148693],[-8.612163,41.148747],[..."
...,...,...,...,...,...,...,...,...,...
99995,1399693710620000496,C,,,20000496,1399693710,A,False,"[[-8.680581,41.171931],[-8.679564,41.171508],[..."
99996,1399693724620000254,C,,,20000254,1399693724,A,False,"[[-8.681688,41.172777],[-8.682183,41.172966],[..."
99997,1399693724620000467,C,,,20000467,1399693724,A,False,"[[-8.680644,41.171976],[-8.680581,41.171949],[..."
99998,1399693725620000320,C,,,20000320,1399693725,A,False,"[[-8.68869,41.172876],[-8.688402,41.172804],[-..."


In [11]:
# query --> 1000 trajs par (Ta)
# dbsearch --> (as 1000 trajs ímpar da query (Ta') + 99000 trajs ímpar qualquer dos dados de teste)

In [12]:
querydb_lines[:1000]

[1020000,
 1020007,
 1020016,
 1020018,
 1020019,
 1020020,
 1020021,
 1020024,
 1020025,
 1020046,
 1020054,
 1020072,
 1020076,
 1020088,
 1020105,
 1020107,
 1020109,
 1020115,
 1020119,
 1020121,
 1020124,
 1020125,
 1020126,
 1020128,
 1020133,
 1020134,
 1020138,
 1020139,
 1020140,
 1020149,
 1020151,
 1020152,
 1020158,
 1020159,
 1020160,
 1020175,
 1020188,
 1020190,
 1020194,
 1020195,
 1020200,
 1020203,
 1020209,
 1020211,
 1020213,
 1020218,
 1020222,
 1020227,
 1020229,
 1020231,
 1020236,
 1020237,
 1020238,
 1020241,
 1020257,
 1020259,
 1020263,
 1020266,
 1020271,
 1020280,
 1020282,
 1020290,
 1020291,
 1020297,
 1020305,
 1020316,
 1020324,
 1020333,
 1020335,
 1020344,
 1020359,
 1020361,
 1020362,
 1020363,
 1020373,
 1020377,
 1020380,
 1020389,
 1020391,
 1020396,
 1020400,
 1020404,
 1020410,
 1020415,
 1020424,
 1020425,
 1020430,
 1020436,
 1020439,
 1020446,
 1020450,
 1020452,
 1020455,
 1020464,
 1020466,
 1020471,
 1020472,
 1020478,
 1020484,
 1020494,


In [13]:
querydb_lines[1000:2000] # ínicio do dbsearch começa com as Ta'!

[1020000,
 1020007,
 1020016,
 1020018,
 1020019,
 1020020,
 1020021,
 1020024,
 1020025,
 1020046,
 1020054,
 1020072,
 1020076,
 1020088,
 1020105,
 1020107,
 1020109,
 1020115,
 1020119,
 1020121,
 1020124,
 1020125,
 1020126,
 1020128,
 1020133,
 1020134,
 1020138,
 1020139,
 1020140,
 1020149,
 1020151,
 1020152,
 1020158,
 1020159,
 1020160,
 1020175,
 1020188,
 1020190,
 1020194,
 1020195,
 1020200,
 1020203,
 1020209,
 1020211,
 1020213,
 1020218,
 1020222,
 1020227,
 1020229,
 1020231,
 1020236,
 1020237,
 1020238,
 1020241,
 1020257,
 1020259,
 1020263,
 1020266,
 1020271,
 1020280,
 1020282,
 1020290,
 1020291,
 1020297,
 1020305,
 1020316,
 1020324,
 1020333,
 1020335,
 1020344,
 1020359,
 1020361,
 1020362,
 1020363,
 1020373,
 1020377,
 1020380,
 1020389,
 1020391,
 1020396,
 1020400,
 1020404,
 1020410,
 1020415,
 1020424,
 1020425,
 1020430,
 1020436,
 1020439,
 1020446,
 1020450,
 1020452,
 1020455,
 1020464,
 1020466,
 1020471,
 1020472,
 1020478,
 1020484,
 1020494,


In [14]:
def split_Tas(trajetorias):
    """
    Divide as trajetórias passadas em Ta e Ta'; Ta --> pontos(lon, lat) ímpares, Ta' --> pontos(lon, lat) pares)
    Input: Lista de trajetorias, trajetórias representadas pela sequencia de pontos (GPS)
    Output: Lista de trajetórias Ta e lista de trajetórias Ta'
    """
    Dq = []
    D_q = []
    for j in range(len(trajetorias)):
        Ta = []
        Ta_ = []
        if len(trajetorias[j]) == 1: # Se a traj tiver comprimento 1, a traj par será ela mesma!
            Ta.append(trajetorias[j][0])
            Ta_.append(trajetorias[j][0])
        else:
            for i in range(0,len(trajetorias[j]), 2): # pega os impares
                Ta.append(trajetorias[j][i])
            for i in range(1,len(trajetorias[j]), 2): # pega os pares
                Ta_.append(trajetorias[j][i])
        Dq.append(np.array(Ta))
        D_q.append(np.array(Ta_))

    return Dq, D_q

In [15]:
trajs_teste = df.POLYLINE.to_list()

In [16]:
trajs_teste

['[[-8.585676,41.148621],[-8.585694,41.148639],[-8.585721,41.148864],[-8.586135,41.14899],[-8.587161,41.149044],[-8.588574,41.149557],[-8.590383,41.150052],[-8.592345,41.150547],[-8.593281,41.150772],[-8.593272,41.150808],[-8.593254,41.150808],[-8.59356,41.150871],[-8.59401,41.150493],[-8.593767,41.149683],[-8.593335,41.148324],[-8.593083,41.14746],[-8.593056,41.147415],[-8.59302,41.147406],[-8.592993,41.147397],[-8.592966,41.147379],[-8.592939,41.147352],[-8.59293,41.147343],[-8.59293,41.147334],[-8.59293,41.147325]]',
 '[[-8.610768,41.145201],[-8.610786,41.145201],[-8.610975,41.145201],[-8.610822,41.145867],[-8.61003,41.146461],[-8.608905,41.146911],[-8.60832,41.147946],[-8.607609,41.148693],[-8.60751,41.148711],[-8.607474,41.148693],[-8.607285,41.148657],[-8.606196,41.148342],[-8.606007,41.14836],[-8.605503,41.148126],[-8.604234,41.14764],[-8.603865,41.147118],[-8.603919,41.146956],[-8.603946,41.146569],[-8.60454,41.146137],[-8.604999,41.145525],[-8.604513,41.144859],[-8.604774,41.1

In [17]:
type(trajs_teste[1])

str

In [18]:
# Convertendo de lista de strings p/ lista de listas
tmp = []
for string in trajs_teste:
    json_list = js.loads(string)
    tmp.append(json_list)

trajs_teste = tmp

In [19]:
trajs_teste

[[[-8.585676, 41.148621],
  [-8.585694, 41.148639],
  [-8.585721, 41.148864],
  [-8.586135, 41.14899],
  [-8.587161, 41.149044],
  [-8.588574, 41.149557],
  [-8.590383, 41.150052],
  [-8.592345, 41.150547],
  [-8.593281, 41.150772],
  [-8.593272, 41.150808],
  [-8.593254, 41.150808],
  [-8.59356, 41.150871],
  [-8.59401, 41.150493],
  [-8.593767, 41.149683],
  [-8.593335, 41.148324],
  [-8.593083, 41.14746],
  [-8.593056, 41.147415],
  [-8.59302, 41.147406],
  [-8.592993, 41.147397],
  [-8.592966, 41.147379],
  [-8.592939, 41.147352],
  [-8.59293, 41.147343],
  [-8.59293, 41.147334],
  [-8.59293, 41.147325]],
 [[-8.610768, 41.145201],
  [-8.610786, 41.145201],
  [-8.610975, 41.145201],
  [-8.610822, 41.145867],
  [-8.61003, 41.146461],
  [-8.608905, 41.146911],
  [-8.60832, 41.147946],
  [-8.607609, 41.148693],
  [-8.60751, 41.148711],
  [-8.607474, 41.148693],
  [-8.607285, 41.148657],
  [-8.606196, 41.148342],
  [-8.606007, 41.14836],
  [-8.605503, 41.148126],
  [-8.604234, 41.14764]

In [20]:
type(trajs_teste[1])

list

In [21]:
# Fazendo os splits (trajs par e ímpar)
query = trajs_teste[:1000] # 1000 trajetórias query
Dq, D_q = split_Tas(query)
db = trajs_teste[1000:100000] # 99_000 trajetórias do dbsearch
Dp, D_p = split_Tas(db)

In [29]:
path = '../data/'

In [30]:
def dist_haversine(p1, p2):
  """
  p1 --> numpy array [lon, lat]
  p2 --> numpy array [lon, lat]
  """
  # approximate radius of earth in km
  R = 6371.0

  lon1 = radians(p1[0]) # p1[x, ]
  lat1 = radians(p1[1]) # p1[ ,y]
  lon2 = radians(p2[0]) # p2[x, ]
  lat2 = radians(p2[1]) # p2[ ,y]

  dlon = lon2 - lon1
  dlat = lat2 - lat1

  a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
  c = 2 * atan2(sqrt(a), sqrt(1 - a))

  distance = R * c

  return distance * 1000

In [31]:
p1 = np.array([-37.989553, -4.949479], dtype=np.float32)
p2 = np.array([-37.992316, -4.9481 ], dtype=np.float32)
print(dist_haversine(p1, p2))
print(dist_haversine(p1, p2))

342.6104433639527
342.6104433639527


In [32]:
# Quanto mais próximo de zero, mais parecidas são as trajetórias!
def dist_edr_trajs(t1, t2):
    """ Edit Distance on Real Sequence"""
    """ The edit distance between two strings refers to the minimum number of character insertions, deletions, and substitutions required to change one string to the other."""

    thr = 100 # threshold of 100m

    if len(t1) > len(t2):
        difference = len(t1) - len(t2)
        for i in range(len(t2)):
          if dist_haversine(t1[i], t2[i]) > thr:
            difference += 1

    elif len(t2) > len(t1):
        difference = len(t2) - len(t1)
        for i in range(len(t1)):
          if dist_haversine(t1[i], t2[i]) > thr:
            difference += 1

    else:
        difference = 0
        for i in range(len(t1)):
          if dist_haversine(t1[i], t2[i]) > thr:
            difference += 1


    return difference

In [33]:
t1 =  np.array([[-37.97151, -4.93422],
                [-37.971341, -4.934051],
                [-37.971111, -4.933850],
                [-37.970728, -4.933544],
                [-37.970429, -4.933256],
                [-37.970207, -4.933063]], dtype=np.float32)

t2 =  np.array([[-37.97135, -4.93438],
                [-37.971143, -4.934198],
                [-37.970769, -4.933945],
                [-37.969606, -4.934348]], dtype=np.float32)

In [34]:
print(dist_haversine(t1[0], t2[0]))
print(dist_haversine(t1[1], t2[1]))
print(dist_haversine(t1[2], t2[2]))
print(dist_haversine(t1[3], t2[3]))

25.14806588309025
27.37899657042259
39.484935001738116
153.40645686093765


In [35]:
dist_edr_trajs(t1, t2)

3

In [36]:
Dq[0]

array([[-8.585676, 41.148621],
       [-8.585721, 41.148864],
       [-8.587161, 41.149044],
       [-8.590383, 41.150052],
       [-8.593281, 41.150772],
       [-8.593254, 41.150808],
       [-8.59401 , 41.150493],
       [-8.593335, 41.148324],
       [-8.593056, 41.147415],
       [-8.592993, 41.147397],
       [-8.592939, 41.147352],
       [-8.59293 , 41.147334]])

In [37]:
D_q[0]

array([[-8.585694, 41.148639],
       [-8.586135, 41.14899 ],
       [-8.588574, 41.149557],
       [-8.592345, 41.150547],
       [-8.593272, 41.150808],
       [-8.59356 , 41.150871],
       [-8.593767, 41.149683],
       [-8.593083, 41.14746 ],
       [-8.59302 , 41.147406],
       [-8.592966, 41.147379],
       [-8.59293 , 41.147343],
       [-8.59293 , 41.147325]])

In [38]:
D_p[0]

array([[-8.620002, 41.147901],
       [-8.620308, 41.147343],
       [-8.620146, 41.146308],
       [-8.620155, 41.14629 ],
       [-8.618013, 41.145975],
       [-8.616114, 41.145246],
       [-8.613639, 41.145894],
       [-8.611974, 41.146002],
       [-8.61093 , 41.145723],
       [-8.61102 , 41.145021],
       [-8.612577, 41.143869],
       [-8.613864, 41.143023],
       [-8.615214, 41.142087],
       [-8.615097, 41.140836],
       [-8.613495, 41.141286]])

In [39]:
dist_haversine(Dq[0][0], D_q[0][0])

2.505499396767574

In [41]:
# Salvando as trajs points em arquivos para semrem usadas nas demais técnicas...

# Salvando Dq:
with open(path+'Dq-pts.pickle', 'wb') as file:
    pickle.dump(Dq, file)

# Salvando D_q:
with open(path+'D_q-pts.pickle', 'wb') as file:
    pickle.dump(D_q, file)

# Salvando D_p:
with open(path+'D_p-pts.pickle', 'wb') as file:
    pickle.dump(D_p, file)

In [42]:
def rank(t_query, i, db_search):
    dists = list(map(lambda t: dist_edr_trajs(t_query, t), db_search)) # pega todas as distâncias de Dq[i] as trajs de D_qUD_p
    dists = np.array(dists)
    order = dists.argsort() # pega a ordem
    ranks = order.argsort() # pega o rank

    return ranks[i] + 1 # retorna o ranking de ta' no dbsearch | soma +1 pq o argsort rankea a partir de 0 (zero)

In [43]:
def acc(ranks):
    count = 0
    for i in range(len(ranks)):
        if ranks[i] == 1:
            count += 1

    return round(count/len(ranks), 3)

In [44]:
def mr(ranks):
    return (sum(ranks)/len(ranks))

In [45]:
def mrr(ranks):
    count = 0
    for i in range(len(ranks)):
        count += 1/ranks[i]

    return round(count/len(ranks), 3)

In [46]:
# Intervalo de Confiança do Ranks
def cip_r(ranks):
    data = []
    for i in range(len(ranks)):
        data.append(ranks[i]) # Add os Ranks
    
    #create 95% confidence interval for population mean weight
    ic = st.t.interval(alpha=0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data))
    
    return tuple(round(valor, 3) for valor in ic) # arredonda pra 3 casas decimais

In [47]:
# Intervalo de Confiança dos Reciprocal Ranks
def cip_rr(ranks):
    data = []
    for i in range(len(ranks)):
        data.append(1/ranks[i]) # Add os RRs...
    
    #create 95% confidence interval for population mean weight
    ic = st.t.interval(alpha=0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data))
    
    return tuple(round(valor, 3) for valor in ic) # arredonda pra 3 casas decimais

In [48]:
%%time
dbsizes = [20000, 40000, 60000, 80000, 100000]
entire_db = D_q + D_p
for dbsize in dbsizes:
    ranks = []
    dbsearch = entire_db[:dbsize]
    for i in range(len(query)):
        ranks.append(rank(Dq[i], i, dbsearch)) # rank de ta' no dbsearch!
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 23.526(1.495, 45.557), Acc: 0.704, MRR: 0.794(0.774, 0.815) with dbsearch size: 20000
Mean rank: 43.047(4.362, 81.732), Acc: 0.634, MRR: 0.734(0.711, 0.757) with dbsearch size: 40000
Mean rank: 71.014(2.627, 139.401), Acc: 0.57, MRR: 0.682(0.658, 0.706) with dbsearch size: 60000
Mean rank: 91.094(5.041, 177.147), Acc: 0.557, MRR: 0.659(0.635, 0.684) with dbsearch size: 80000
Mean rank: 120.04(6.607, 233.473), Acc: 0.531, MRR: 0.636(0.611, 0.662) with dbsearch size: 100000
CPU times: user 1h 52min 26s, sys: 132 ms, total: 1h 52min 26s
Wall time: 2h 4min 31s
