In [1]:
# prediction Code

import Levenshtein as ln


def predict_query(query, G, roots_list, ids_question_map, track_leafs):
    tokenized_query = query.split()
    len_tokenized_query = len(tokenized_query)
    node_list = []

    def recurssive_search(comparison_term, G, idx=1, root=False, node_list=[], forked=False):
        new_term = []
        for node in (roots_list if root else G[" ".join(comparison_term.split()[:-1])]):

            # don't auto-correct on last-word
            if ((ln.distance(comparison_term, node) <= 2) if idx != len_tokenized_query - 1 else tokenized_query[
                                                                                                     -1] in node):
                if idx != len_tokenized_query - 1:
                    node_list.extend(recurssive_search(node + " " + tokenized_query[idx + 1], G, idx + 1, root=False,
                                                       node_list=node_list, forked=True))
                else:
                    if node in G and node in track_leafs:
                        new_term.append(node)

        if forked:
            if idx == len_tokenized_query - 1:
                return [ids_question_map[child] for term in new_term for child in track_leafs[term]]

            return ""
        else:
            if len(tokenized_query) == 1:
                if idx == len_tokenized_query - 1:
                    return [ids_question_map[child] for term in new_term for child in track_leafs[term]]
            return node_list

    res = recurssive_search(tokenized_query[0], G, idx=0, root=True)
    return res

# Version 1 Evaluation: 

In [3]:
# Load files

import pandas as pd
import Levenshtein as ln
import networkx as nx
from ast import literal_eval

from tqdm import tqdm
import pickle
import numpy as np
import spacy
nlp = spacy.load('en_core_web_sm')

In [14]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import tensorflow_text

import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')

tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])


embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

def get_embedding(text):
    if isinstance(text, str):
        return embed([text]).numpy()
    return embed(text).numpy()

get_embedding("Hello World").shape

2021-07-18 18:41:10.412006: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-07-18 18:41:10.412185: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-07-18 18:41:10.412852: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 1080 Ti computeCapability: 6.1
coreClock: 1.62GHz coreCount: 28 deviceMemorySize: 10.91GiB deviceMemoryBandwidth: 451.17GiB/s
2021-07-18 18:41:10.412914: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-07-18 18:41:10.413535: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 1 with properties: 
pciBusID: 0000:0

(1, 512)

In [4]:
G = nx.read_gpickle("../index/query_graph.gpickle")
track_leafs = pickle.load(open("../index/track_leafs.pkl", "rb"))
roots_list = [i for i,j in G.nodes(data="starts", default=1) if j==True]

ids_question_map = pickle.load(open("../index/ids_question_map.pkl", "rb"))
question_ids_map = pickle.load(open("../index/question_ids_map.pkl", "rb"))

In [5]:
# these functions are heavily influenced by the HF squad_metrics.py script
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)


In [6]:
import csv

limit = 1000
metrics = {
    "EM": {
        "@3": [],
        "@5": [],
        "@10": []
    },
    "F1": {
        "@3": [],
        "@5": [],
        "@10": []
    }
}
with open("../data/queries.dev.tsv", "r") as f:
    tsv_reader = csv.reader(f, delimiter="\t")
    nlines = 0 
    for row in tqdm(tsv_reader):
        nlines += 1
        line = row[-1].lower()
        if line!='':
            question = line
            tokenized_question = line.split()
            doc = nlp(question)
            np_list = [str(token) for token in doc if token.tag_[:2]=="NN"]
            observed_np = False
            for token_idx in range(1, len(tokenized_question)):
                if observed_np or tokenized_question[token_idx-1] in np_list:
                    observed_np = True
                    suggested_queries = predict_query(" ".join(tokenized_question[:token_idx]), G, roots_list, ids_question_map, track_leafs)[:500]

                    if suggested_queries!=[]:
                        for k in [3, 5, 10]:
                            temp_EM, temp_F1 = [0], [0]
                            for q in suggested_queries[:k]:
                                temp_EM.append(compute_exact_match(q, question))
                                temp_F1.append(compute_f1(q, question))
                            metrics['EM']['@'+str(k)] = metrics['EM']['@'+str(k)] + [max(temp_EM)]
                            metrics['F1']['@'+str(k)] = metrics['F1']['@'+str(k)] + [max(temp_F1)]
        if not limit:
            break
        else:
            limit-=1



1000it [00:34, 28.72it/s]


In [13]:
# query = "what is"
# suggested_queries = predict_query("what is", G, roots_list, ids_question_map, track_leafs)[:500]
# sorted(suggested_queries + [query])[:10]

In [7]:
#metrics
print("Average Exact Match Metric: ", "\n", "EM@3: "+str(np.average(metrics['EM']['@3'])), "\n", "EM@5: "+str(np.average(metrics['EM']['@5'])), "\n", "EM@10: "+str(np.average(metrics['EM']['@10'])))
print("--------")
print("Average F1 Metric: ", "\n", "F1@3: "+str(np.average(metrics['F1']['@3'])), "\n", "F1@5: "+str(np.average(metrics['F1']['@5'])), "\n", "F1@10: "+str(np.average(metrics['F1']['@10'])))

Average Exact Match Metric:  
 EM@3: 0.009933774834437087 
 EM@5: 0.011037527593818985 
 EM@10: 0.012141280353200883
--------
Average F1 Metric:  
 F1@3: 0.4550674975638583 
 F1@5: 0.47319383050217734 
 F1@10: 0.4945249375231127


In [7]:
## Without checking for NP
# Average Exact Match Metric:  
#  EM@3: 0.0040885860306643955 
#  EM@5: 0.004429301533219762 
#  EM@10: 0.005792163543441226
# --------
# Average F1 Metric:  
#  F1@3: 0.3896572010693722 
#  F1@5: 0.4050902330327353 
#  F1@10: 0.4225743728587715

Average Exact Match Metric:  
 EM@3: 0.0040885860306643955 
 EM@5: 0.004429301533219762 
 EM@10: 0.005792163543441226
--------
Average F1 Metric:  
 F1@3: 0.3896572010693722 
 F1@5: 0.4050902330327353 
 F1@10: 0.4225743728587715


# v2

In [8]:
import pandas as pd
import Levenshtein as ln
import networkx as nx
from ast import literal_eval

from tqdm import tqdm
import pickle
import numpy as np



In [9]:
G = nx.read_gpickle("../index/v2/query_graph.gpickle")
track_leafs = pickle.load(open("../index/v2/track_leafs.pkl", "rb"))
ids_question_map = pickle.load(open("../index/v2/ids_question_map.pkl", "rb"))
question_ids_map = pickle.load(open("../index/v2/question_ids_map.pkl", "rb"))

roots_list = [i for i,j in G.nodes(data="starts", default=1) if j==True]

In [10]:
import csv

#limit = 15140
limit = 1000
metrics = {
    "EM": {
        "@3": [],
        "@5": [],
        "@10": []
    },
    "F1": {
        "@3": [],
        "@5": [],
        "@10": []
    }
}
with open("../data/queries.dev.tsv", "r") as f:
    tsv_reader = csv.reader(f, delimiter="\t")
    nlines = 0 
    for row in tqdm(tsv_reader):
        nlines += 1
        line = row[-1].lower()
        if line!='':
            question = line
            tokenized_question = line.split()
            doc = nlp(question)
            np_list = [str(token) for token in doc if token.tag_[:2]=="NN"]
            observed_np = False
            for token_idx in range(1, len(tokenized_question)):
                if observed_np or tokenized_question[token_idx-1] in np_list:
                    observed_np = True
                    suggested_queries = predict_query(" ".join(tokenized_question[:token_idx]), G, roots_list, ids_question_map, track_leafs)[:500]

                    if suggested_queries!=[]:
                        for k in [3, 5, 10]:
                            temp_EM, temp_F1 = [0], [0]
                            for q in suggested_queries[:k]:
                                temp_EM.append(compute_exact_match(q, question))
                                temp_F1.append(compute_f1(q, question))
                            metrics['EM']['@'+str(k)] = metrics['EM']['@'+str(k)] + [max(temp_EM)]
                            metrics['F1']['@'+str(k)] = metrics['F1']['@'+str(k)] + [max(temp_F1)]
        if not limit:
            break
        else:
            limit-=1

1000it [01:05, 15.32it/s]


In [11]:
#metrics
print("Average Exact Match Metric: ", "\n", "EM@3: "+str(np.average(metrics['EM']['@3'])), "\n", "EM@5: "+str(np.average(metrics['EM']['@5'])), "\n", "EM@10: "+str(np.average(metrics['EM']['@10'])))
print("--------")
print("Average F1 Metric: ", "\n", "F1@3: "+str(np.average(metrics['F1']['@3'])), "\n", "F1@5: "+str(np.average(metrics['F1']['@5'])), "\n", "F1@10: "+str(np.average(metrics['F1']['@10'])))

Average Exact Match Metric:  
 EM@3: 0.009149130832570906 
 EM@5: 0.010064043915827997 
 EM@10: 0.011893870082342177
--------
Average F1 Metric:  
 F1@3: 0.4879684855782274 
 F1@5: 0.5054822473192558 
 F1@10: 0.5287107502350595


In [17]:
## Without checking for NP
# Average Exact Match Metric:  
#  EM@3: 0.004072681704260651 
#  EM@5: 0.0043859649122807015 
#  EM@10: 0.005639097744360902
# --------
# Average F1 Metric:  
#  F1@3: 0.40197607565187854 
#  F1@5: 0.4187706908725181 
#  F1@10: 0.4385090217213102

Average Exact Match Metric:  
 EM@3: 0.004072681704260651 
 EM@5: 0.0043859649122807015 
 EM@10: 0.005639097744360902
--------
Average F1 Metric:  
 F1@3: 0.40197607565187854 
 F1@5: 0.4187706908725181 
 F1@10: 0.4385090217213102


# v2 + Ranked

In [12]:
import faiss

X = np.load(open("../index/v2/user_queries.npy", "rb"))

D = 512
K = 10
kmeans = faiss.Kmeans(d=D, k=round(16*(X.shape[0]**(1/2))), niter=20, verbose=True, gpu=True)

kmeans.train(X.astype(np.float32))



Clustering 50001 points in 512D to 3578 clusters, redo 1 times, 20 iterations
  Preprocessing in 0.01 s
  Iteration 19 (0.78 s, search 0.56 s): objective=24707.2 imbalance=1.461 nsplit=0       

24707.16015625

In [15]:
import csv
from sklearn.metrics.pairwise import cosine_similarity

#limit = 15140
limit = 1000
metrics = {
    "EM": {
        "@3": [],
        "@5": [],
        "@10": []
    },
    "F1": {
        "@3": [],
        "@5": [],
        "@10": []
    }
}
with open("../data/queries.dev.tsv", "r") as f:
    tsv_reader = csv.reader(f, delimiter="\t")
    nlines = 0 
    for row in tqdm(tsv_reader):
        nlines += 1
        line = row[-1].lower()
        if line!='':
            question = line
            tokenized_question = line.split()
            if tokenized_question!=[]:
                query_embed = get_embedding(question)
                doc = nlp(question)
                np_list = [str(token) for token in doc if token.tag_[:2]=="NN"]
                observed_np = False
                for token_idx in range(1, len(tokenized_question)):
                    if observed_np or tokenized_question[token_idx-1] in np_list:
                        observed_np = True
                        suggested_queries = predict_query(" ".join(tokenized_question[:token_idx]), G, roots_list, ids_question_map, track_leafs)[:500]

                        if suggested_queries!=[]:
                            test_x = get_embedding(suggested_queries)
                            sorted_res = np.argsort(np.max(cosine_similarity(test_x, kmeans.centroids + query_embed), axis=-1))[::-1]
                            for k in [3, 5, 10]:
                                temp_EM, temp_F1 = [0], [0]
                                filtered_queries = [suggested_queries[tag] for tag in sorted_res[:k]]
                                for q in filtered_queries[:k]:
                                    temp_EM.append(compute_exact_match(q, question))
                                    temp_F1.append(compute_f1(q, question))
                                metrics['EM']['@'+str(k)] = metrics['EM']['@'+str(k)] + [max(temp_EM)]
                                metrics['F1']['@'+str(k)] = metrics['F1']['@'+str(k)] + [max(temp_F1)]
        if not limit:
            break
        else:
            limit-=1

1000it [02:52,  5.81it/s]


In [17]:
kmeans.index

<faiss.swigfaiss_avx2.IndexReplicas; proxy of <Swig Object of type 'faiss::IndexReplicasTemplate< faiss::Index > *' at 0x7fbda6f050f0> >

In [16]:
#metrics
print("Average Exact Match Metric: ", "\n", "EM@3: "+str(np.average(metrics['EM']['@3'])), "\n", "EM@5: "+str(np.average(metrics['EM']['@5'])), "\n", "EM@10: "+str(np.average(metrics['EM']['@10'])))
print("--------")
print("Average F1 Metric: ", "\n", "F1@3: "+str(np.average(metrics['F1']['@3'])), "\n", "F1@5: "+str(np.average(metrics['F1']['@5'])), "\n", "F1@10: "+str(np.average(metrics['F1']['@10'])))

Average Exact Match Metric:  
 EM@3: 0.013723696248856358 
 EM@5: 0.013723696248856358 
 EM@10: 0.013723696248856358
--------
Average F1 Metric:  
 F1@3: 0.5759839235621964 
 F1@5: 0.58476066024534 
 F1@10: 0.5911137329097126


In [25]:
## Without checking for NP
# Average Exact Match Metric:  
#  EM@3: 0.007832080200501253 
#  EM@5: 0.007832080200501253 
#  EM@10: 0.007832080200501253
# --------
# Average F1 Metric:  
#  F1@3: 0.4952691105325539 
#  F1@5: 0.5116865949954855 
#  F1@10: 0.5286050616505952

Average Exact Match Metric:  
 EM@3: 0.007832080200501253 
 EM@5: 0.007832080200501253 
 EM@10: 0.007832080200501253
--------
Average F1 Metric:  
 F1@3: 0.4952691105325539 
 F1@5: 0.5116865949954855 
 F1@10: 0.5286050616505952
