In [None]:
# Allowed to make changes.
import collections
import json
import pandas as pd
import re
import string
import timeit
from ast import literal_eval

# Setup Elastic Search Server
The server must be set up before we connect to Google Drive to access the trained weights.
Note that if the server is interrupted while the retriever is using it, it must be restarted.

In [None]:
# NOTE(review): this installs Haystack from the tip of the repository's default
# branch with no tag/commit pin, so a later re-run may pull a different API than
# the one this notebook was written against — consider pinning a revision.
!pip3 install -q git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss]

In [None]:
import pandas as pd
import numpy as np
import csv
import time
import pickle

In [None]:
%cd /content/

/content


In [None]:
%%bash

# Download (quietly) and unpack a standalone Elasticsearch 7.9.2 into the
# current working directory.
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
# Give the unpacked tree to the `daemon` user, which is the account the server
# is launched under when it is started.
chown -R daemon:daemon elasticsearch-7.9.2

Start the Elastic Search Server

In [None]:
%%bash --bg

# `--bg` runs this cell in the background so the notebook is not blocked while
# the server is alive. The server process runs as the `daemon` user.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
# Wait until Elasticsearch actually answers instead of sleeping a fixed 30 s:
# a fixed sleep is too short on a slow machine (the next cell then fails to
# connect) and wasted time on a fast one.
import time
import urllib.request

ES_HEALTH_URL = "http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=5s"
ES_STARTUP_TIMEOUT = 120  # seconds to wait before giving up

_es_deadline = time.monotonic() + ES_STARTUP_TIMEOUT
while True:
  try:
    with urllib.request.urlopen(ES_HEALTH_URL, timeout=10) as _es_response:
      if _es_response.status == 200:
        break
  except OSError:
    # Connection refused (server still booting) or HTTP 408 (cluster not yet
    # yellow); URLError/HTTPError are OSError subclasses. Retry below.
    pass
  if time.monotonic() > _es_deadline:
    raise RuntimeError(
      "Elasticsearch did not become ready within %d seconds" % ES_STARTUP_TIMEOUT)
  time.sleep(1)

Haystack provides a convenient way to store the documents and run inference, while still letting us plug in our own models.


In [None]:
import os
from haystack.document_stores import ElasticsearchDocumentStore

# The Elasticsearch host can be overridden through ELASTICSEARCH_HOST;
# otherwise the locally started server is used.
host = os.getenv("ELASTICSEARCH_HOST", "localhost")
document_store = ElasticsearchDocumentStore(
    host=host,
    username="",
    password="",
    index="document",
)

# Load Classifier
Model that uses the reader score together with the ranker scores of the top passages to determine the answerability of the question

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Change to the appropriate directory in GDrive containing all the model weights 

In [None]:
%cd /content/drive/MyDrive/DevRev

/content/drive/MyDrive/DevRev


In [None]:
# Load the pickled answerability classifier. The `with` block closes the file
# handle, which the bare `pickle.load(open(...))` form never did.
# NOTE(security): unpickling executes code stored in the file — only load
# weight files that you produced yourself or otherwise trust.
with open('classifier_model.sav', 'rb') as model_file:
  mlp = pickle.load(model_file)

# Load the Num Pass Predictor
Model that predicts the probability that the answer passage is within the top K_i ranks, based on the ranker and retriever scores

In [None]:
# Load the pickled "number of passes" predictor. The `with` block closes the
# file handle, which the bare `pickle.load(open(...))` form never did.
# NOTE(security): unpickling executes code stored in the file — only load
# weight files that you produced yourself or otherwise trust.
with open('num_pass_model.sav', 'rb') as model_file:
  lr = pickle.load(model_file)

# Read and store the Documents

In [None]:
# Paragraph corpus, indexed by the CSV's `id` column (the index value later
# becomes the document id in the document store).
df_para = pd.read_csv("paragraphs.csv", index_col="id")

In [None]:
# One Haystack-style record per paragraph: the text, its theme stored under
# meta.name (used later as the retrieval filter), and the CSV id as document id.
docs = [
  {
    'content': para_row["paragraph"],
    'meta': {'name': para_row["theme"]},
    'id': para_id,
  }
  for para_id, para_row in df_para.iterrows()
]

In [None]:
# Empty the index before writing; presumably so that re-running this cell does
# not leave documents from a previous run in the store.
document_store.delete_documents()
document_store.write_documents(docs)

# Load the Domain Adapted Ranker
Load the rankers fine-tuned using knowledge distillation of specific themes (theme_rankers) as well as all the training data (universal ranker)

In [None]:
from haystack.nodes import SentenceTransformersRanker

In [None]:
# Ranker used for every theme that has no dedicated fine-tuned model.
universal_ranker = SentenceTransformersRanker(model_name_or_path='minilml2_mse_13')

# Theme name -> ranker loaded from that theme's fine-tuned weights directory.
theme_rankers = {
    theme_name: SentenceTransformersRanker(model_name_or_path=weights_dir)
    for theme_name, weights_dir in (
        ("IPod", 'minilml2_mse_12_ipod'),
        ("2008_Sichuan_earthquake", 'minilml2_mse_12_se'),
    )
}

In [None]:
def get_theme_model(theme):
  """Return the ranker dedicated to `theme`, or the universal ranker if none exists."""
  try:
    return theme_rankers[theme]
  except KeyError:
    # No theme-specific model was loaded for this theme.
    return universal_ranker

# Load the Reader Model

In [None]:
from haystack.nodes import FARMReader
# Extractive reader built from the public "deepset/minilm-uncased-squad2"
# checkpoint (downloaded on first use, see the progress output below). It
# requests the GPU and disables the reader's own progress bar.
reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2", use_gpu=True, progress_bar=False)

Downloading (…)lve/main/config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/133M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/107 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Average cost of one reader pass over one passage.
# NOTE(review): this was annotated "ms", but pred_theme_ans divides a
# per-question time budget measured with time.time() (i.e. seconds) by this
# value, so it is presumably expressed in seconds — confirm the unit.
AVG_READER_TIME = 0.54

# Load the BM25 Retriever with Elastic Search

In [None]:
from haystack.nodes import BM25Retriever
# BM25 retriever that queries the Elasticsearch-backed document store created above.
retriever = BM25Retriever(document_store=document_store)

# Heuristic for Determining the Number of Passages to Pass to the Reader Based on Probability

In [None]:
import heapq
import math
import random

def get_num_pass(top_k_probs, avg_feed, max_pass=5):
  """Decide how many ranked passages each question hands to the reader.

  Args:
    top_k_probs: one row per question; row[k] is treated as the value of
      reading the first k+1 passages (presumably the probability that the
      answer is among them). Only differences between consecutive entries are
      used, as the marginal gain of one more passage. Each row needs at least
      `max_pass` entries.
    avg_feed: average number of passages per question that can be afforded.
      The total budget is ceil(avg_feed * number_of_questions).
    max_pass: upper bound on passages per question (default 5, the value that
      used to be hard-coded).

  Returns:
    A list `z` with one integer per question, 0 <= z[i] <= max_pass.

  Raises:
    ValueError: if max_pass is smaller than 1.

  The result is not deterministic unless the caller seeds `random`, because
  the last phase is a randomised local search.
  """
  if max_pass < 1:
    raise ValueError("max_pass must be at least 1")

  n_questions = len(top_k_probs)
  slots_total = math.ceil(avg_feed*n_questions)

  def marginal_gain(q, k):
    # Value added by question q's k-th passage, given that k-1 are already read.
    if k > 1:
      return top_k_probs[q][k-1] - top_k_probs[q][k-2]
    return top_k_probs[q][k-1]

  # Phase 1: every question starts with exactly one passage, even when the
  # budget is smaller than the number of questions.
  z = [1]*n_questions
  slots_taken = n_questions

  # Phase 2: greedy fill. Heap keys are the NEGATED gain of the next passage,
  # so the min-heap pops the most valuable extra passage first.
  min_heap = []
  if max_pass > 1:
    for i in range(n_questions):
      heapq.heappush(min_heap, (top_k_probs[i][0] - top_k_probs[i][1], i))

  # The `and min_heap` guard stops cleanly once every question is at max_pass;
  # without it a budget above max_pass per question popped from an empty heap
  # and raised IndexError.
  while slots_taken < slots_total and min_heap:
    _, idx = heapq.heappop(min_heap)
    z[idx] += 1
    slots_taken += 1
    if z[idx] < max_pass:
      heapq.heappush(min_heap, (top_k_probs[idx][z[idx]-1] - top_k_probs[idx][z[idx]], idx))

  # Phase 3: randomised local search. The heap now holds
  # (gain of the LAST assigned passage, question, count-at-push-time), so the
  # minimum is the least valuable passage currently assigned.
  min_heap = []
  for i in range(n_questions):
    heapq.heappush(min_heap, (marginal_gain(i, z[i]), i, z[i]))

  for _ in range(25*n_questions):
    idx = random.randint(0, n_questions-1)
    if z[idx] >= max_pass:
      continue

    # Give the randomly chosen question one more passage ...
    z[idx] += 1
    heapq.heappush(min_heap, (marginal_gain(idx, z[idx]), idx, z[idx]))

    # ... and take back the least valuable passage overall, so the total stays
    # constant. Entries whose recorded count no longer matches z are stale
    # (lazy deletion) and skipped. The entry pushed just above is always
    # valid, so this loop cannot run the heap dry.
    _, victim, count = heapq.heappop(min_heap)
    while z[victim] != count:
      _, victim, count = heapq.heappop(min_heap)

    z[victim] -= 1
    # A question may legitimately end up with zero passages; it then has no
    # heap entry until a later random step picks it again.
    if z[victim] != 0:
      heapq.heappush(min_heap, (marginal_gain(victim, z[victim]), victim, z[victim]))

  return z

# Evaluation Loop

In [None]:
from haystack import Document as document
import time
from tqdm import tqdm

In [None]:
# Allowed to make changes.
def pred_theme_ans(questions, theme_model, pred_out):
  """Answer all questions of one theme and append one record per question to `pred_out`.

  Pipeline (relies on the notebook-level `retriever`, `lr`, `reader`, `mlp`,
  `document_store` and `AVG_READER_TIME`):
    1. BM25 retrieval of up to 5 paragraphs restricted to the theme, re-scored
       by `theme_model`.
    2. `lr` turns the 10 scores per question into the per-rank values consumed
       by get_num_pass, which decides how many passages the reader sees for
       each question within the remaining time budget.
    3. The reader extracts its best answer and `mlp` decides whether the
       question is answerable at all.

  Args:
    questions: list of dicts with at least "id", "question" and "theme";
      all are assumed to belong to the theme of the first entry.
    theme_model: ranker used for this theme.
    pred_out: list that receives one dict per question with the keys
      "question_id", "paragraph_id" (-1 when unanswered) and "answers"
      ("" when unanswered).
  """
  if not questions:
    # Nothing to predict; also avoids indexing questions[0] and dividing by zero.
    return

  NO_PASSAGE = -1  # padding marker for missing ranks

  theme = questions[0]["theme"]
  ranker = theme_model
  X = []
  y = []

  st1 = time.time()

  for question in questions:
    sample_res = retriever.retrieve(
      query = question["question"],
      top_k = 5,
      filters = {"name": [theme]}
    )

    filtered_docs = [document.from_dict({'content': res.content, 'meta': {'name': theme}, 'id': res.id}) for res in sample_res]

    if (len(filtered_docs) == 0):
      X.append(np.zeros((5, 2)))
      y.append([NO_PASSAGE]*5)
      continue

    sample_res_2 = ranker.predict(
        query = question["question"],
        top_k = 5,
        documents = filtered_docs[:5]
    )

    # id -> [[bm25_score, ranker_score]], kept in ranker order.
    scores = {}
    for res in sample_res_2:
      scores[res.id] = [[0, res.score]]
    for res in sample_res:
      scores[res.id][0][0] = res.score

    _X = np.concatenate(list(scores.values()))
    _X = np.pad(_X, [(0, 5 - _X.shape[0]),(0, 0)], "constant")

    # Paragraph ids stay in a plain Python list. Padding them with np.pad
    # would coerce the -1 marker to the ids' dtype (e.g. the string "-1" when
    # the store returns string ids), and the `== -1` test below would then
    # never match.
    _y = [res.id for res in sample_res_2]
    _y = _y + [NO_PASSAGE]*(5 - len(_y))

    X.append(_X)
    y.append(_y)

  # Retriever (BM25) and ranker scores of the top 5 passages, flattened to 10 features per question
  X_flatten = np.array(X).reshape(-1, 10)

  # Predict the probability that the answer is in the top k_i
  probs = lr.predict(X_flatten)
  et1 = time.time()

  # Time remaining per question: the budget is one second per question
  # (len(questions) seconds in total) minus what retrieval and ranking used.
  time_rem = (len(questions) - (et1 - st1))/len(questions)

  # Apply heuristic to find the number of passes needed based on the probability distribution
  num_pass = get_num_pass(probs, time_rem/AVG_READER_TIME)

  for idx in tqdm(range(len(questions))):
    question = questions[idx]

    # Start from the "unanswerable" record. It always carries question_id:
    # the early exits used to omit it, leaving rows the scoring cell cannot
    # match back to a question.
    ans = {"question_id": question["id"], "paragraph_id": -1, "answers": ""}

    # Fetch the top passages selected by num_pass, skipping padding and any
    # id the store cannot resolve.
    doc_list = []
    for para_id in y[idx][:num_pass[idx]]:
      if para_id == NO_PASSAGE:
        continue
      doc = document_store.get_document_by_id(id=para_id)
      if doc is not None:
        doc_list.append(doc)

    if doc_list:
      results = reader.predict(question["question"], doc_list, top_k=1)

      if results["answers"]:
        best = results["answers"][0]
        # Classifier input: reader score followed by the 5 ranker scores.
        X_input = np.array([np.concatenate([[best.score], X[idx][:,1]])])

        # Predict the answerability based on reader and ranker scores
        answerability = mlp.predict(X_input)

        if answerability[0]:
          ans["paragraph_id"] = best.document_id
          ans["answers"] = best.answer

    pred_out.append(ans)

In [None]:
# NOT allowed to make changes. 

# All theme prediction.
# Load the questions and the per-theme [start, end] intervals as lists of dicts.
questions = json.loads(pd.read_csv("sample_input_question_1.csv").to_json(orient="records"))
theme_intervals = json.loads(pd.read_csv("sample_theme_interval_1.csv").to_json(orient="records"))
pred_out = []
theme_inf_time = {}
for theme_interval in theme_intervals:
  # "start" is treated as 1-based and "end" as inclusive.
  theme_ques = questions[int(theme_interval["start"]) - 1: int(theme_interval["end"])]
  theme = theme_ques[0]["theme"]
  # Load model fine-tuned for this theme.
  theme_model = get_theme_model(theme)
  # Time a single run of the whole theme; pred_theme_ans appends to pred_out as a side effect.
  execution_time = timeit.timeit(lambda: pred_theme_ans(theme_ques, theme_model, pred_out), number=1)
  theme_inf_time[theme_interval["theme"]] = execution_time * 1000 # in milliseconds.
pred_df = pd.DataFrame.from_records(pred_out)
# Missing fields become empty strings before the CSV is written.
pred_df.fillna(value='', inplace=True)
# Write prediction to a CSV file. Teams are required to submit this csv file.
pred_df.to_csv('sample_output_prediction.csv', index=False)

In [None]:
# NOT allowed to make changes. 

def normalize_answer(s):
  """Lower text and remove punctuation, articles and extra whitespace."""
  def remove_articles(text):
    # Whole-word "a", "an", "the" are replaced by a space.
    regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
    return re.sub(regex, ' ', text)
  def white_space_fix(text):
    # Collapse any run of whitespace to one space and trim both ends.
    return ' '.join(text.split())
  def remove_punc(text):
    # Drops only the ASCII punctuation listed in string.punctuation.
    exclude = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in exclude)
  def lower(text):
    return text.lower()
  # Applied innermost first: lower -> strip punctuation -> drop articles -> fix whitespace.
  return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
  """Return the whitespace-separated tokens of the normalized answer; [] for a falsy `s`."""
  if not s: return []
  return normalize_answer(s).split()

def calc_f1(a_gold, a_pred):
  """Token-level F1 between a gold and a predicted answer string.

  Returns 1 or 0 (int) when either side has no tokens, 0 when no token is
  shared, otherwise the harmonic mean of token precision and recall.
  """
  gold_toks = get_tokens(a_gold)
  pred_toks = get_tokens(a_pred)
  # Multiset intersection: a repeated token counts at most as often as it occurs on both sides.
  common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
  num_same = sum(common.values())
  if len(gold_toks) == 0 or len(pred_toks) == 0:
    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
    return int(gold_toks == pred_toks)
  if num_same == 0:
    return 0
  precision = 1.0 * num_same / len(pred_toks)
  recall = 1.0 * num_same / len(gold_toks)
  f1 = (2 * precision * recall) / (precision + recall)
  return f1

def calc_max_f1(predicted, ground_truths):
  """Best F1 of `predicted` against any of the `ground_truths`.

  With no ground truths the result is a bool: True exactly when `predicted`
  is empty.
  """
  max_f1 = 0
  if len(ground_truths) == 0:
    return len(predicted) == 0
  for ground_truth in ground_truths:
    # The prediction is passed in calc_f1's `a_gold` position; the score is
    # symmetric in its two arguments, so the result is unaffected.
    f1 = calc_f1(predicted, ground_truth)
    max_f1 = max(max_f1, f1)
  return max_f1

In [None]:
# NOT allowed to make changes. 

# Evaluation methodology.
# Per-theme counters, filled from the prediction CSV and the ground truth.
metrics = {}
pred = pd.read_csv("sample_output_prediction.csv")
pred.fillna(value='', inplace=True)
truth = pd.read_csv("sample_ground_truth.csv")
truth.fillna(value='', inplace=True)
# Both columns hold Python list literals stored as text.
truth.paragraph_id = truth.paragraph_id.apply(literal_eval)
truth.answers = truth.answers.apply(literal_eval)
# NOTE(review): the prediction cell reads "sample_input_question_1.csv" while
# this cell reads "sample_input_question.csv" — confirm both hold the same ids.
questions = pd.read_csv("sample_input_question.csv")
for idx in pred.index:
  q_id = pred["question_id"][idx]
  # Takes the last matching row; raises IndexError when q_id matches no
  # question (e.g. a prediction row without a question_id).
  q_rows = questions.loc[questions['id'] == q_id].iloc[-1]
  theme = q_rows["theme"]
  predicted_paragraph = pred["paragraph_id"][idx]
  predicted_ans = pred["answers"][idx]
  
  if theme not in metrics.keys():
    metrics[theme] = {"true_positive": 0, "true_negative": 0, "total_predictions": 0, "f1_sum": 0}

  truth_row = truth.loc[truth['question_id'] == q_id].iloc[-1]
  truth_paragraph_id = [ int(i) for i in truth_row["paragraph_id"] ]
  if predicted_paragraph in truth_paragraph_id:
    # Increase TP for that theme.
    metrics[theme]["true_positive"] = metrics[theme]["true_positive"] + 1
  # -1 prediction in case there is no paragraph which can answer the query.
  if predicted_paragraph == -1 and truth_row["paragraph_id"] == []:
    # Increase TN.
    metrics[theme]["true_negative"] = metrics[theme]["true_negative"] + 1
  # Increase total predictions for that theme.
  metrics[theme]["total_predictions"] = metrics[theme]["total_predictions"] + 1
  f1 = calc_max_f1(predicted_ans, truth_row["answers"])
  metrics[theme]["f1_sum"] = metrics[theme]["f1_sum"] + f1

In [None]:
# NOT allowed to make changes.

# Final score.
inf_time_threshold = 1000.0 # milliseconds.
final_para_score = 0.0
final_qa_score = 0.0
# Weight would stay hidden from teams.
# theme_weights = {"Kubernetes": 0.5, "ChatGPT": 0.4, "Football world cup": 0.1}
# theme_weights = {"The_Legend_of_Zelda:_Twilight_Princess":1}
# NOTE(review): theme_weights is never assigned in the visible notebook (both
# candidate definitions above are commented out), so the accumulations at the
# end of the loop raise NameError unless it is defined elsewhere — presumably
# the organisers supply it.
for theme in metrics:
  inf_time_score = 1.0
  metric = metrics[theme]
  para_score = (metric["true_positive"] + metric["true_negative"]) / metric["total_predictions"] 
  qa_score = metric["f1_sum"] / metric["total_predictions"]
  # Average inference time per prediction of this theme, in milliseconds.
  avg_inf_time = theme_inf_time[theme] / metric["total_predictions"]
  # Above the threshold, both scores are scaled down proportionally to the overrun.
  if avg_inf_time > inf_time_threshold:
    inf_time_score = inf_time_threshold / avg_inf_time
  final_qa_score += theme_weights[theme] * inf_time_score * qa_score
  final_para_score += theme_weights[theme] * inf_time_score * para_score

print (final_para_score)
print (final_qa_score)