<a href="https://colab.research.google.com/github/LilInformat/similar-questions/blob/master/similar_questions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Authenticate the current Colab user with Google.
# NOTE(review): presumably needed so the later `gsutil` download can access
# Google Cloud Storage -- confirm whether the public bucket requires it.
from google.colab import auth
auth.authenticate_user()

In [0]:
import os
import gzip
import time
import json
import re
import numpy as np
import pandas as pd
import random
from pandas.io.json import json_normalize
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split

In [0]:
# Show full cell contents when displaying frames. `None` means "no limit";
# the old sentinel -1 is deprecated since pandas 1.0 and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
# Never summarise numpy arrays with "..." when printing them.
np.set_printoptions(threshold=np.inf)

In [0]:
# Download the file from a given Google Cloud Storage bucket.
DOWNLOAD_PATH = '/content/NLTK/dataset'
if not os.path.exists(DOWNLOAD_PATH):
    os.makedirs(DOWNLOAD_PATH)
    
!gsutil -m cp -R gs://natural_questions/v1.0 /content/NLTK/dataset

In [0]:
ENCODER_PATH = '/content/google/universal-sentence-encoder/module'
if not os.path.exists(ENCODER_PATH):
    os.makedirs(ENCODER_PATH)
    
# Download the module, and uncompress it to the destination folder. 
!curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/3?tf-hub-format=compressed" | tar -zxvC /content/google/universal-sentence-encoder/module

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
./
./tfhub_module.pb
./variables/
./variables/variables.data-00000-of-00001
 97  745M   97  726M    0     0  74.7M      0  0:00:09  0:00:09 --:--:-- 77.5M./variables/variables.index
./assets/
./saved_model.pb
100  745M  100  745M    0     0  74.6M      0  0:00:09  0:00:09 --:--:-- 77.2M


In [0]:
# Directory holding the Natural Questions dev shards, and the full path of
# each of the five gzipped JSON-lines shard files inside it.
DEV_PATH = '/content/NLTK/dataset/v1.0/dev'
DEV_FILES = [
    os.path.join(DEV_PATH, shard_name)
    for shard_name in (
        'nq-dev-00.jsonl.gz',
        'nq-dev-01.jsonl.gz',
        'nq-dev-02.jsonl.gz',
        'nq-dev-03.jsonl.gz',
        'nq-dev-04.jsonl.gz',
    )
]

def get_answers(data_to_process):
  """Collects the answers annotated for a single example.

  Args:
    data_to_process: a mapping whose 'annotations' and 'document_tokens'
      entries each hold the real value at position 0 (e.g. a one-row frame).
  Returns:
    A tuple (answers, tokens). `answers` lists, in annotation order, either the
    yes/no label (when it is not "NONE") or the space-joined text of each
    distinct long-answer candidate. `tokens` holds the token list of each long
    answer only, so it can be shorter than `answers`.
    Returns (None, None) when no annotation yields an answer.
  """
  seen_candidates = set()
  answer_texts = []
  answer_token_lists = []

  for annotation in data_to_process['annotations'][0]:
    verdict = annotation['yes_no_answer']
    if verdict != "NONE":
      # A yes/no label takes precedence over the long answer.
      answer_texts.append(verdict)
      continue

    span = annotation['long_answer']
    candidate = int(span['candidate_index'])
    # -1 marks "no long answer"; repeated candidates are recorded only once.
    if candidate == -1 or candidate in seen_candidates:
      continue

    first = int(span["start_token"])
    last = int(span["end_token"])
    seen_candidates.add(candidate)
    words = [entry['token'] for entry in data_to_process['document_tokens'][0][first:last]]
    answer_token_lists.append(words)
    answer_texts.append(' '.join(words))

  if not answer_texts:
    return None, None
  return answer_texts, answer_token_lists

def load_data(files = DEV_FILES):
  """Loads gzipped JSON-lines files and outputs a single dataframe.

  Args:
    files: an iterable of paths to gzip-compressed files holding one JSON
      document per line
  Returns:
    A dataframe with one row per input line and the columns "question_text",
    "question_tokens", "answers" and "answer_tokens". The last two are None
    for examples where get_answers finds no answer.
  """
  columns = ["question_text", "question_tokens", "answers", "answer_tokens"]
  # Collect plain records and build the frame once: DataFrame.append copied the
  # whole frame on every row and no longer exists in pandas 2.x.
  records = []
  for f in files:
    with gzip.open(f) as fin:
      for l in fin:
        json_line = l.decode("utf8", "strict")
        if not json_line.strip():
          # Tolerate blank lines (e.g. a trailing newline) instead of crashing
          # inside json.loads.
          continue
        normalized_data = json_normalize(json.loads(json_line))
        answers_text, answer_tokens = get_answers(normalized_data)
        records.append({
          "question_text": normalized_data['question_text'][0],
          "question_tokens": normalized_data['question_tokens'][0],
          "answers": answers_text,
          "answer_tokens": answer_tokens
            })
  return pd.DataFrame(records, columns=columns)

def getKNearestResults(index, sim_matrix, k):
  """Returns the k highest-scoring columns of one row of a similarity matrix.

  Args:
    index: row of sim_matrix to inspect
    sim_matrix: 2-D array-like of similarity scores
    k: number of results wanted; values larger than the row length are
      clamped, and k <= 0 yields an empty list
  Returns:
    A list of (column_index, score) tuples ordered from the highest score to
    the lowest; ties are broken by the smaller column index.
  """
  row = np.asarray(sim_matrix[index])
  k = min(k, row.shape[0])
  if k <= 0:
    # Without this guard `[-0:]` would select every column.
    return []
  # argpartition finds the top-k in linear time but in arbitrary order ...
  top_indices = np.argpartition(row, -k)[-k:]
  # ... so rank just those k entries best-first (previously the result was
  # sorted by column index, which hid the ranking).
  ranked = sorted(top_indices, key=lambda i: (row[i], -i), reverse=True)
  return [(int(i), row[i]) for i in ranked]

In [0]:
# Parse every dev shard into one dataframe (one row per question).
data = load_data(DEV_FILES)

In [0]:
# Per-column count of missing values. It must be printed explicitly: a bare
# expression that is not the last line of the cell is silently discarded.
print(data.isnull().sum())
# Keep only rows with no missing values in any column.
clean_data = data.dropna()
print(data.shape)
print(clean_data.shape)

(7830, 4)
(5499, 4)


In [0]:
# Hold out 33% of the cleaned rows as the test set; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test = train_test_split(clean_data, test_size=0.33, random_state=42)

In [0]:
questions_train = X_train['question_text'].tolist()
questions_test = X_test['question_text'].tolist()

# Load the sentence encoder from the local copy in ENCODER_PATH. `embed` was
# previously never defined in the notebook, so this cell failed with a
# NameError on a fresh kernel.
embed = hub.Module(ENCODER_PATH)
train_embed_op = embed(questions_train)
test_embed_op = embed(questions_test)

with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  questions_train_embed = session.run(train_embed_op)
  questions_test_embed = session.run(test_embed_op)

# Inner product of every test embedding with every train embedding:
# rows index test questions, columns index train questions.
sim_matrix = np.inner(questions_test_embed, questions_train_embed)

In [0]:
# questions_train / questions_test are plain Python lists, which have no
# `.shape` attribute; np.shape works on both lists and arrays.
print(sim_matrix.shape)
print(np.shape(questions_train))
print(np.shape(questions_test))
# One row per test question, one column per train question.
assert sim_matrix.shape == (len(questions_test), len(questions_train))

(1815, 3684)
(3684,)
(1815,)


In [280]:
# Show the first test question next to its 4 highest-scoring train questions.
print("Test Question: %s"%questions_test[0])
results = getKNearestResults(0, sim_matrix, 4)
for train_index, score in results:
  print("Question: %s" %questions_train[train_index])
  print("Score: %s" %score)

Test Question: a player that can shoot pass or dribble is considered to be
Question: what nba player has scored the most 3 pointers
Score: 0.7008544
Question: most assists in an nba all star game
Score: 0.69767153
Question: who scored the most points in a single game in the nba
Score: 0.67270887
Question: who is the all time leading scorer in ncaa tournament history
Score: 0.6549603
