In [11]:
import json
from tqdm import tqdm
import numpy as np
# Load the question records and textbook paragraphs.
# JSON text is read as UTF-8 explicitly so decoding does not depend on the
# platform's default locale encoding (the data contains non-ASCII characters).
with open("./question.json", encoding="utf-8") as fin:
    questions = json.load(fin)

with open("./textbook.json", encoding="utf-8") as fin:
    paragraphs = json.load(fin)

# Load the precomputed embedding matrices from .npy files
# (presumably one row per paragraph / per question -- TODO confirm against the
# script that produced them).
paragraphs_embeddings = np.load("./example_embedding/paragraphs_embeddings_ada.npy")
questions_embeddings = np.load("./example_embedding/questions_embeddings_ada.npy")
print(paragraphs_embeddings.shape)
print(questions_embeddings.shape)

# Later cells read questions_embeddings[i] for every i in range(len(questions)),
# so the two must be aligned row-for-row; fail early if they are not.
assert len(questions) == questions_embeddings.shape[0], (
    "questions and questions_embeddings have different lengths"
)


(2205, 1536)
(3005, 1536)


In [3]:
#similarity retrieval
def similarity_retrieval(query_embedding, embedding_matrix, topk=5):
    """Return the indices of the rows most cosine-similar to the query.

    Args:
        query_embedding: 1-D query vector of length ``d``.
        embedding_matrix: 2-D array of shape ``(n, d)``; one candidate per row.
        topk: Maximum number of row indices to return.

    Returns:
        1-D integer array holding at most ``topk`` row indices of
        ``embedding_matrix``, ordered from most to least similar.

    Notes:
        A zero-norm row (or a zero-norm query) has an undefined cosine
        similarity. It is assigned a similarity of 0 here instead of NaN,
        because NaN values are sorted last by ``np.argsort`` and would
        therefore show up as the *top* results after the reversal.
    """
    dot_products = np.dot(embedding_matrix, query_embedding)
    denominators = np.linalg.norm(embedding_matrix, axis=1) * np.linalg.norm(query_embedding)
    # Where the denominator is 0 the dot product is 0 as well, so dividing by 1
    # yields a similarity of exactly 0 and avoids a 0/0 NaN (and its warning).
    safe_denominators = np.where(denominators == 0, 1.0, denominators)
    similarity = dot_products / safe_denominators
    # argsort is ascending; reverse it to get most-similar first.
    topk_index = np.argsort(similarity)[::-1][:topk]
    return topk_index

#calculate the rank of cosine similarity between the query embedding and the real source paragraph
def similarity_rank_calculation(query_embedding, embedding_matrix, ground_truth_index = 10):
    """Return the 0-based rank of the ground-truth row among all rows.

    Rows of ``embedding_matrix`` are ordered by cosine similarity to
    ``query_embedding`` (most similar first); the result is the position of
    ``ground_truth_index`` in that ordering, so 0 means it was retrieved first.

    Args:
        query_embedding: 1-D query vector of length ``d``.
        embedding_matrix: 2-D array of shape ``(n, d)``; one candidate per row.
        ground_truth_index: Row index of the true source paragraph.

    Returns:
        Integer position (0-based) of ``ground_truth_index`` in the
        descending-similarity ordering.

    Raises:
        IndexError: If ``ground_truth_index`` is not a row index of
            ``embedding_matrix``.

    Notes:
        A zero-norm row (or query) gets a similarity of 0 instead of NaN;
        NaN values would otherwise be sorted to the front after the reversal
        and push every real candidate down the ranking.
    """
    dot_products = np.dot(embedding_matrix, query_embedding)
    denominators = np.linalg.norm(embedding_matrix, axis=1) * np.linalg.norm(query_embedding)
    # Where the denominator is 0 the dot product is 0 as well, so dividing by 1
    # yields a similarity of exactly 0 and avoids a 0/0 NaN (and its warning).
    safe_denominators = np.where(denominators == 0, 1.0, denominators)
    similarity = dot_products / safe_denominators
    # Row indices ordered from most to least similar.
    rank = np.argsort(similarity)[::-1]
    positions = np.where(rank == ground_truth_index)[0]
    if positions.size == 0:
        # Same exception type as before, but with an actionable message.
        raise IndexError(
            f"ground_truth_index {ground_truth_index!r} is not a valid row index "
            f"for an embedding matrix with {rank.shape[0]} rows"
        )
    return positions[0]

In [4]:
# For every question, record where its source paragraph lands when all
# paragraphs are ordered by similarity to the question embedding.
ranks = []
for question_idx, question in enumerate(tqdm(questions)):
    ranks.append(
        similarity_rank_calculation(
            questions_embeddings[question_idx],
            paragraphs_embeddings,
            question["paragraph_info"]['id'],
        )
    )

100%|██████████| 3005/3005 [00:08<00:00, 375.34it/s]


In [5]:
# Convert the collected ranks to an array so they can be used in vectorised
# comparisons, then report their average.
ranks = np.array(ranks)
mean_rank = np.mean(ranks)
print("mean rank: ", mean_rank)

mean rank:  1.2728785357737105


In [6]:
#hit@1-10
# hit@k = fraction of questions whose source paragraph has a rank below k.
hit_masks = {
    "hit@1: ": ranks == 0,
    "hit@2: ": ranks < 2,
    "hit@3: ": ranks < 3,
    "hit@5: ": ranks < 5,
    "hit@10: ": ranks < 10,
}
hit1, hit2, hit3, hit5, hit10 = (
    np.sum(mask) / len(ranks) for mask in hit_masks.values()
)
for label, hit_rate in zip(hit_masks, (hit1, hit2, hit3, hit5, hit10)):
    print(label, hit_rate)

hit@1:  0.7923460898502496
hit@2:  0.8905158069883528
hit@3:  0.9224625623960067
hit@5:  0.9544093178036606
hit@10:  0.9787021630615641


In [7]:
# Display the first question record to inspect its fields.
# NOTE(review): paragraph_info['id'] is passed to similarity_rank_calculation as a
# row index into paragraphs_embeddings -- confirm ids are 0-based and row-aligned.
questions[0]

{'paragraph_info': {'id': 1, 'sub': ['Biology', 1]},
 'type': 'simple_direct',
 'updated_question': {'question': 'Who initially recorded the scientific method?',
  'updated': 'Charles Darwin',
  'random1': 'Sir Francis Bacon',
  'random2': 'Isaac Newton',
  'random3': 'Galileo Galilei'},
 'updated_paragraph': 'The Scientific Method\n\nBiologists study the living world by posing questions about it and seeking science-based responses. Known as scientific method, this approach is common to other sciences as well. The scientific method was used even in ancient times, but England\'s Charles Darwin (1809–1882) first documented it. He set up inductive methods for scientific inquiry while developing his theory of evolution. The scientific method is not used only by biologists; researchers from almost all fields of study can apply it as a logical, rational problem-solving method. Darwin\'s contribution to formalizing the scientific method was as significant as his work on natural selection, set

In [8]:
# Display the first textbook paragraph record to inspect its fields.
paragraphs[0]

{'id': 0,
 'sub': ['Biology', 0],
 'len': 1099,
 'text': 'The Process of Science\nBiology is a science, but what exactly is science? What does the study of biology share with other scientific disciplines? We can define science (from the Latin scientia, meaning "knowledge") as knowledge that covers general truths or the operation of general laws, especially when acquired and tested by the scientific method. It becomes clear from this definition that applying scientific method plays a major role in science. The scientific method is a method of research with defined steps that include experiments and careful observation. We will examine scientific method steps in detail later, but one of the most important aspects of this method is the testing of hypotheses by means of repeatable experiments. A hypothesis is a suggested explanation for an event, which one can test. Although using the scientific method is inherent to science, it is inadequate in determining what science is. This is because

In [9]:
#hit@1 by questions[i]['type']
# One bucket of ranks per question type; an unlisted type raises KeyError.
ranks = {
    question_type: []
    for question_type in (
        'simple_direct',
        'multihop_direct',
        'multihop_distant',
        'multihop_implicit',
        'distant_implicit',
    )
}
for question_idx, question in enumerate(tqdm(questions)):
    source_rank = similarity_rank_calculation(
        questions_embeddings[question_idx],
        paragraphs_embeddings,
        question["paragraph_info"]['id'],
    )
    ranks[question['type']].append(source_rank)

# Replace each bucket with an array and report its mean rank and hit@1.
for key, bucket in ranks.items():
    bucket = np.array(bucket)
    ranks[key] = bucket
    print(key)
    print("mean rank: ", np.mean(bucket))
    hit1 = np.sum(bucket == 0) / len(bucket)
    print("hit@1: ", hit1)
    print("\n")

100%|██████████| 3005/3005 [00:07<00:00, 379.65it/s]

simple_direct
mean rank:  0.5868852459016394
hit@1:  0.8229508196721311


multihop_direct
mean rank:  0.9847328244274809
hit@1:  0.8076335877862595


multihop_distant
mean rank:  1.2189265536723164
hit@1:  0.8008474576271186


multihop_implicit
mean rank:  1.26378896882494
hit@1:  0.8009592326139089


distant_implicit
mean rank:  2.3284552845528457
hit@1:  0.7300813008130081







In [10]:
#hit@1 by subject
# One bucket of ranks per subject; an unlisted subject raises KeyError.
ranks = {
    subject: []
    for subject in ('Biology', 'Chemistry', 'Geology', 'History', 'Physics')
}
for question_idx, question in enumerate(tqdm(questions)):
    paragraph_info = question["paragraph_info"]
    source_rank = similarity_rank_calculation(
        questions_embeddings[question_idx],
        paragraphs_embeddings,
        paragraph_info['id'],
    )
    # The subject name is the first element of paragraph_info['sub'].
    ranks[paragraph_info['sub'][0]].append(source_rank)

# Replace each bucket with an array and report its mean rank and hit@1.
for key, bucket in ranks.items():
    bucket = np.array(bucket)
    ranks[key] = bucket
    print(key)
    print("mean rank: ", np.mean(bucket))
    hit1 = np.sum(bucket == 0) / len(bucket)
    print("hit@1: ", hit1)
    print("\n")

100%|██████████| 3005/3005 [00:08<00:00, 369.62it/s]

Biology
mean rank:  1.2061855670103092
hit@1:  0.7949599083619702


Chemistry
mean rank:  0.4783861671469741
hit@1:  0.8443804034582133


Geology
mean rank:  3.0478260869565217
hit@1:  0.7275362318840579


History
mean rank:  0.4730195177956372
hit@1:  0.8128587830080367


Physics
mean rank:  0.40625
hit@1:  0.8214285714285714





