# Setup

## Install packages

In [None]:
# %pip (unlike !pip) always installs into the environment of the running kernel.
# Versions resolved on the last recorded run: sentence-transformers 2.2.2,
# rank_bm25 0.2.2, faiss-gpu 1.7.2, datasets 2.15.0 -- pin these if a newer
# release breaks the notebook.
%pip install -U sentence-transformers rank_bm25 faiss-gpu datasets

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting nltk (from sentence-transformers)
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m110.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0

In [None]:
# %pip (unlike !pip) always installs into the environment of the running kernel.
# Version resolved on the last recorded run: rouge_score 0.1.2.
%pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=7098126c2500b74168744c3eccb4b9770b03c394c86508e8263444b89fa0c41c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
[33mDEPRECATION: pytorch-lightning 1.7.7 has a non-standard dependency specifier torch>=1.9.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: rouge_score
Successfully installed r

## Mount your Google drive in order to save data

In [None]:
import os
from pathlib import Path

# google.colab only exists inside a Colab runtime. Importing it unconditionally
# makes this cell raise ModuleNotFoundError everywhere else, so the mount is
# attempted only when the module is available.
try:
    from google.colab import drive
except ImportError:
    # Not on Colab: nothing to mount, data is kept on the local filesystem.
    drive = None
else:
    drive.mount('/content/drive')





ModuleNotFoundError: No module named 'google.colab'

In [None]:
# Create the homework folder and its .cache subfolder (no error if they already exist).
!mkdir -p drive/MyDrive/new_ai_agents/hw3 drive/MyDrive/new_ai_agents/hw3/.cache

In [None]:
from pathlib import Path
import os

In [None]:
# Homework folder, relative to the directory the notebook starts in.
WORK_DIR = Path("drive/MyDrive/new_ai_agents/hw3")

# A relative chdir is not idempotent: running it a second time would look for
# the folder inside itself and fail. Only move when we are not already there.
if Path.cwd().parts[-len(WORK_DIR.parts):] != WORK_DIR.parts:
    os.chdir(WORK_DIR)

## Download a small corpus of Wikipedia articles and split it into snippets

We use a Simple English Wikipedia corpus published by the SentenceTransformers authors. This cell constructs a list, `passages`, containing one `{"title": ..., "passage": ...}` dict per paragraph.

In [None]:
import json
import gzip
import torch
from sentence_transformers import util


# Warn early when no CUDA device is visible to torch.
if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")

wikipedia_filepath = 'simplewiki-2020-11-01.jsonl.gz'

# Download the dump only when there is no local copy yet.
if not os.path.exists(wikipedia_filepath):
    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)

# Each line of the file is one JSON article; every paragraph of an article
# becomes its own {"title", "passage"} record.
with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as wiki_file:
    articles = (json.loads(raw_line.strip()) for raw_line in wiki_file)
    passages = [
        {'title': article['title'], 'passage': paragraph}
        for article in articles
        for paragraph in article['paragraphs']
    ]

  0%|          | 0.00/50.2M [00:00<?, ?B/s]

In [None]:
from datasets import Dataset

# `passages` is rebound from a list of dicts to a Dataset. Convert only once so
# that re-running this cell does not pass a Dataset back into `from_list`.
if not isinstance(passages, Dataset):
    passages = Dataset.from_list(passages)
type(passages)

datasets.arrow_dataset.Dataset

# Problem 3

## **3.1**: Build a BM25 Search Index

Construct a search index that, given a query span, returns a top-n list of support passages.

In [None]:
from typing import List,Tuple
import datasets
## Here is a base class that you should use for both sparse and dense retrieval

class RetrievalIndex:
  """
  Base class shared by the sparse and dense retrieval indexes.

  It stores the passage corpus and defines the `lookup` contract; subclasses
  provide the actual search.
  """

  def __init__(self, corpus: datasets.arrow_dataset.Dataset):
    """
    Keeps a reference to `corpus`; no indexing work happens here.
    """
    self.corpus = corpus

  def __getitem__(self, item):
    """
    Returns the corpus row stored at position `item`.
    """
    # Direct indexing yields the same row as selecting a one-row Dataset and
    # iterating it, without building the intermediate Dataset and list.
    return self.corpus[item]

  @classmethod
  def build_index(cls, corpus, **kwargs):
    """
    Class method that constructs a retrieval index from the corpus.

    Extra keyword arguments are forwarded unchanged to the class constructor.
    """
    return cls(corpus, **kwargs)

  def lookup(self, query_strs: List[str], topk = 5) -> List[List[Tuple[str, str, float]]]:
    """
    Accepts a list of query strings and returns a list of lists of (title, passage, score) tuples

    The outer list has one entry per query; `topk` bounds the length of each
    inner list. Must be overridden by subclasses.
    """
    raise NotImplementedError()

In [None]:
import re

from rank_bm25 import BM25Okapi
import numpy as np


class BM25RetrievalIndex(RetrievalIndex):
    """
    Sparse retrieval index that ranks passages with BM25 (rank_bm25.BM25Okapi).

    Passages and queries go through the same tokenizer, so that a query term
    matches a passage term regardless of letter case or attached punctuation.
    """

    # A token is a maximal run of word characters; everything else separates tokens.
    _TOKEN_PATTERN = re.compile(r"\w+")

    def __init__(self, corpus):
        """
        Tokenizes the 'passage' field of every corpus row and fits BM25 on the result.
        """
        super().__init__(corpus)

        self.tokenizedCorpus = [self._tokenize(doc['passage']) for doc in corpus]
        self.bm25 = BM25Okapi(self.tokenizedCorpus)

    @classmethod
    def _tokenize(cls, text: str) -> List[str]:
        """
        Lower-cases `text` and splits it into word tokens, dropping punctuation.

        Splitting on single spaces alone keeps tokens such as "formation?" or
        "How", which never match "formation" / "how" in the other string.
        """
        return cls._TOKEN_PATTERN.findall(text.lower())

    def lookup(self, query_strs: List[str], topk=5) -> List[List[Tuple[str, str, float]]]:
        """
        Scores every passage against each query and returns the best `topk`.

        Returns one list per query, each holding (title, passage, score) tuples
        ordered from highest to lowest BM25 score.
        """
        limit = max(topk, 0)  # a negative topk would otherwise slice from the end
        results = []
        for query in query_strs:
            doc_scores = self.bm25.get_scores(self._tokenize(query))
            top_indices = np.argsort(doc_scores)[::-1][:limit]

            hits = []
            for idx in top_indices:
                # Fetch the row once; each corpus access materialises a whole row.
                row = self.corpus[int(idx)]
                hits.append((row['title'], row['passage'], float(doc_scores[idx])))
            results.append(hits)
        return results





In [None]:
# Tokenizes every passage and fits the BM25 model over the whole corpus.
bm25_index = BM25RetrievalIndex.build_index(passages)

In [None]:
bm25_index.lookup(["why do birds fly in a v formation?"])

[[('Collective animal behaviour',
   '3. "Easier movement": Groups of animals moving together (such as fish or birds) save energy. Many of the larger birds fly in flocks. Flying in flocks helps in reducing the energy needed. Many large birds fly in a V-formation, which helps individuals save 12–20 % of the energy they would need to fly alone. Red Knots "Calidris canutus" and Dunlins "Calidris alpina" were found in radar studies to fly 5\xa0km per hour faster in flocks than when they were flying alone.',
   24.367091203145385),
  ('Migration',
   'Many birds fly to warmer places for the winter, as do some insects such as the migratory locust. Young Atlantic salmon leave the river of their birth when they have reached a few inches (cm) in size.',
   24.09441819612898),
  ('Imprinting',
   'The filial imprinting of birds was a primary technique used to create the movie "Winged Migration (Le Peuple Migrateur)", which contains footage of migratory birds in flight. The birds imprinted on han

In [None]:
bm25_index.lookup(["How come some iOS apps have a 'Small bug fix' update for 200MB, yet apps like Minecraft can add in loads of content for 120MB"])

[[('IOS 7',
   'iOS 7 provides full multitasking for all apps. That means that apps will update their content by themselves and instead of showing the app icon, it shows a screenshot of what the app looks like.',
   55.32580835520765),
  ('IOS 6',
   'Unlike past iOS versions, two apps that were removed as part of iOS 6 were Google Maps and YouTube. These apps can be downloaded for free in the App Store.',
   53.79431741742596),
  ('App Store (iOS)',
   'Apps in the App Store are made mainly for iOS devices, such as iPhones and iPads. Some apps can be downloaded for free while others have a cost. Some have "in-app purchases", which are purchases made within the app. These are sometimes done to remove ads or to add features to an app. 30% of all profit made through apps goes to Apple, while 70% goes to whoever made the app.',
   46.98909328471846),
  ('IOS 7',
   'The App Store introduces a new feature called Popular Near Me, which allows the user to find apps that are popular in their 

In [None]:
bm25_index.lookup(["How do computers learn to 'read' programming languages?"])

[[('Programming language',
   'A programming language is a type of written language that tells computers what to do in order to work. Programming languages are used to make all the computer programs and computer software. A programming language is like a set of instructions that the computer follows to do something.',
   21.871736461317653),
  ('Computer security',
   'Computer security involves telling computers what they are "not to do". This makes computer security unique because most programming makes computers "do" things. Security takes much of a computer\'s power.',
   18.926725816176187),
  ('Wheelspin', 'How to do a Wheelspin in a manual car', 18.682881978202946),
  ('High-level programming language',
   'What determines a programming language as "high-level" depends on how much the programming language relies on hardware knowledge. Programming languages that require less knowledge about computer hardware are called "high-level" programming languages. It doesn\'t matter whethe

In [None]:
bm25_index.lookup(["Why are things 'catchy', you know like a beat or something?"])

[[('Harry Potter (character)',
   'Rowling has also maintained that Harry is a suitable real-life role model for children. "The advantage of a fictional hero or heroine is that you can know them better than you can know a living hero, many of whom you would never meet […] if people like Harry and identify with him, I am pleased, because I think he is very likeable."',
   23.683492164935522),
  ('The Daughter of Sláva',
   '"But doves you know are come of such a stock"',
   23.37241742580187),
  ('Grammar',
   "Nouns are 'thing' words like 'table and 'chair'. They are objects, things you see in everyday life. Proper nouns are names of specific places, people, or other things like days of the week. The name 'James' is a proper noun, as is 'Wednesday' and 'London'. Nouns can also be abstract things, such as 'suffering' or 'happiness'.",
   23.22809380231561),
  ('Computer',
   'There are bigger computers that many people at a time can use. These are called "Mainframes," and these computer

In [None]:
bm25_index.lookup([   "Why does Adderall help you do better on tests? I have a friend that pops an Adderall pill before every one of his tests, he doesn't study, doesn't pay attention in class, and isn't a genius so how does he always do well? I know it makes you focus better but, it can't help teach you new information, can it?"])

[[('Harry Potter (character)',
   'Rowling has also maintained that Harry is a suitable real-life role model for children. "The advantage of a fictional hero or heroine is that you can know them better than you can know a living hero, many of whom you would never meet […] if people like Harry and identify with him, I am pleased, because I think he is very likeable."',
   75.22189265781878),
  ('Josh Peck',
   'In the fourth season of "Drake & Josh", Peck was noticeably thinner. He said: I made a conscious effort to lose weight because I knew I could be happier as well as being healthier. I started by going on a diet a year and a half ago and I got a personal trainer, but I definitely have a healthier lifestyle now. Also I feel that because I do so much television, I am a better role model. I don\'t really understand why I should be a role model, but I know that kids do look up to me, so it is my responsibility to motivate people and be inspiring. I hope that I can do that for kids. It 

In [None]:
bm25_index.lookup([     "Why are things that used to be funny, not funny anymore? Why is it that they don't make movies like The Mask anymore? It seems like things are funny for a while, and then they're not. The Mask was a huge hit in the 90s, but I think if a movie was made today in that similar style, it wouldn't work. Why is that? It was funny then- why isn't it funny now? Another example is The Internship (released 2013). The Onion made fun of it, saying it was 'poised to be biggest comedy of 2005.' Why is certain humor stuck in a particular time period? Thanks!"
])

[[('The Proud Boys',
   'Enrique Tarrio said thought the hacking was funny but did not like it for other reasons: "One of the messages they want to send with this is that they\'re trying to drown out our supporters, they\'re trying to silence us. ... When you\'re trying to drown out other people\'s thoughts, I don\'t think there\'s anything progressive about that. Why don\'t these people just engage?"',
   118.66141742681525),
  ('Star Trek',
   'The movie "Galaxy Quest" is a "Star Trek" parody, which means it was made to be like "Star Trek" in a funny way.',
   117.67964508534713),
  ('Lolcat',
   'A lolcat is a common Internet meme and society which is made up of picture of a cat with a funny caption. The caption is usually in bad English that makes fun of Internet slang. like chez and haz. "Lolcat" is a mix of the words lol and cat. Lol is internet slang for "laughing out loud." A different word for lolcat is "cat macro" because it is a type of image macro. The first time the media 

In [None]:
bm25_index.lookup(["How do planes fly"])

[[('George Bush Intercontinental Airport',
   'The airport has five runways. At least 700 planes takeoff from the airport daily. These planes carried 40,007,345 passengers during the year of 2009. Planes fly from this airport to around 170 cities in the world.',
   18.02811302616688),
  ('Wheelspin', 'How to do a Wheelspin in a manual car', 17.970453196075812),
  ('Statistical survey',
   'How many minutes, on average, do you spend eating breakfast?',
   17.6802180271551),
  ('Toxic metals',
   'How do you know if you have heavy metals in your body?',
   17.126992625057525),
  ('Battle of Iwo Jima',
   'Japan could not build new airplanes until March or April 1945. Even then, these planes could not fly to Iwo Jima from Japan. Japan did mot have enough pilots and other aircrew.',
   15.602774720927568)]]

## **3.2**: Building a Dense Retrieval Index

In [None]:
import faiss
from sentence_transformers import SentenceTransformer, CrossEncoder, util


class DenseRetrievalIndex(RetrievalIndex):
  """
  Dense retrieval index: passages and queries are embedded with a
  SentenceTransformer and matched through a FAISS index attached to the corpus
  under the name 'embeddings'.
  """

  def __init__(self, corpus: "datasets.arrow_dataset.Dataset", precomputed_index : str =None):
    """
    Builds or loads the FAISS index for `corpus`.

    corpus: dataset whose rows expose 'title' and 'passage'.
    precomputed_index: optional path of an index written by `save`. When given,
      no embeddings are computed and the index is loaded from that file.
      Note that it is attached to the `corpus` object that was passed in.
      Otherwise every "title passage" string is embedded and indexed, see
      https://huggingface.co/docs/datasets/v1.2.1/faiss_and_ea.html
    """
    super().__init__(corpus)
    self.encoder = SentenceTransformer('msmarco-MiniLM-L-6-v3')
    if precomputed_index is not None:
      self.corpus.load_faiss_index('embeddings', precomputed_index)
    else:
      # map() returns a new dataset carrying the extra 'embeddings' column,
      # so the caller's dataset is left untouched on this path.
      self.corpus = corpus.map(
          lambda examples: {
              'embeddings': self.encoder.encode(
                  [title + " " + passage
                   for title, passage
                   in zip(examples['title'], examples['passage'])])},
          batched=True,
          batch_size=512)
      self.corpus.add_faiss_index(column='embeddings')

  def save(self, file):
    """
    Writes the 'embeddings' FAISS index to `file` via `save_faiss_index`.
    """
    self.corpus.save_faiss_index('embeddings', file)

  def lookup(self, query_strs, topk=5):
    """
    Embeds the queries and returns, for each one, its `topk` nearest passages.

    Returns one list per query holding (title, passage, score) tuples in the
    order reported by the index.
    NOTE(review): with the default index built here the score looks like a
    distance (smaller is closer) rather than a similarity -- confirm before
    comparing it with scores from other indexes.
    """
    # Nothing to embed or search; avoids handing an empty batch to the index.
    if len(query_strs) == 0:
      return []

    query_embeddings = self.encoder.encode(query_strs, batch_size=128, show_progress_bar=True)
    batch_scores, batch_docs = self.corpus.get_nearest_examples_batch(
        'embeddings',
        query_embeddings,
        k=topk
    )

    results = []
    for scores, docs in zip(batch_scores, batch_docs):
      # `docs` is column-oriented: one list per field, aligned with `scores`.
      results.append([
          (title, passage, score)
          for score, title, passage in zip(scores, docs['title'], docs['passage'])
      ])
    return results

  @classmethod
  def from_file(cls, corpus, file):
    """
    Builds the index object from an index file previously written by `save`.

    Raises AssertionError when `file` does not exist.
    """
    assert os.path.exists(file), f"FAISS index file not found: {file}"
    return cls(corpus=corpus, precomputed_index=file)

In [None]:
DENSE_INDEX_PATH = "msmarco_sbert.faiss"

# Embedding the whole corpus is costly, so reuse the saved FAISS index when it
# is already on disk and only build (and save) it on the first run.
# Delete the file to force a rebuild, e.g. after changing the corpus or encoder.
if os.path.exists(DENSE_INDEX_PATH):
    dense_index = DenseRetrievalIndex.from_file(passages, DENSE_INDEX_PATH)
else:
    dense_index = DenseRetrievalIndex.build_index(passages)
    dense_index.save(DENSE_INDEX_PATH)


.gitattributes:   0%|          | 0.00/736 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.68k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Map:   0%|          | 0/509663 [00:00<?, ? examples/s]

  0%|          | 0/510 [00:00<?, ?it/s]

In [None]:
dense_index.lookup(["why do flocks of birds fly in a v formation?"], topk=5)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[('Bird',
   'Flocks of birds can be very highly organised in a way that takes care of all the flock members. Studies of small flocking birds like tree sparrows show that they clearly communicate with each other, as sometimes thousands of birds may fly in close formation and spiral patterns without colliding (or flying into each other).',
   40.331062),
  ('Flightless birds',
   'Despite this general picture, there have been birds who lost the power of flight soon after flight evolved. The first fossil flightless birds occurred in the Cretaceous period. It has long been recognised that there are circumstances where it is definitely not a good thing to have wings. The connection between oceanic islands and flightlessness was known to Darwin. The explanation is that, first, oceanic islands have few predators. Second, that storms occur which may blow winged birds right off the island so far that they cannot find they way back. Once they lose flight, the land birds can evolve to be larger

In [None]:
dense_index.lookup(["How come some iOS apps have a 'Small bug fix' update for 200MB, yet apps like Minecraft can add in loads of content for 120MB"])



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[('Jitsi',
   "On February 1, 2015, Hristo Terezov, Ingo Bauersachs and the rest of the team released version 2.6 from their stand at the Free and Open Source Software Developers' European Meeting 2015 event in Brussels. This release includes security fixes, removes support of the deprecated MSN protocol, along with SSLv3 in XMPP. Among other improvements, the OS X version has aa Java 8 runtime inside, enables echo cancelling by default, and uses the CoreAudio subsystem. The Linux build fixes font issues with the GTK+ native LookAndFeel, and fixes someissues about microphone volume level on call starting when using the PulseAudio sound system. A full list of changes is available on the project web site.",
   37.857033),
  ('Damn Small Linux',
   'DSL has built-in scripts to download and install Advanced Packaging Tool (APT). Once APT is enabled, the user can install packages from Debian\'s \'Woody\' repository. Additionally, DSL hosts software ranging from large applications like Open

In [None]:
dense_index.lookup(["How do computers learn to 'read' programming languages?"])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[('Computer programming',
   'Computers understand instructions directly if those instructions are written in machine code, special characters that can be processed by the computer but are difficult for humans to read. Writing these instructions directly in machine code takes a long time and is difficult, so instructions are written in a language easier for people to use, which the computer converts into "computer form" instructions (in other words, machine code) so the computer can follow them. The simplest of these is assembly language. Programs written in a language more like English are converted by a compiler. Some languages, called interpreted languages, use interpreters instead of assemblers or compilers.',
   40.441147),
  ('Computer',
   "Computer programs are designed or written by computer programmers. A few programmers write programs in the computer's own language called machine code. Most programs are written using a programming language like C, C++, Java. These programmi

In [None]:
dense_index.lookup(["Why are things 'catchy', you know like a beat or something?"])


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[("Schrödinger's cat",
   'Physics can be divided into two types; classic physics and quantum mechanics. Classic physics explains most physical interactions, like why a ball bounces when it drops. It can also be used to predict physical interactions, like what will happen when you drop a ball. However, there are some physical interactions which it does not explain; for instance, how light can be turned into electricity. Quantum mechanics provides a way for physicists to explain why these things happen.',
   46.73451),
  ('Mean Girls',
   "However, Cady gets too much into the Plastics, and throws a party at her parents' house without their permission. She tries to hook up with Regina's boyfriend, but he does not like her because she's just like a Plastic now. Janice tells Cady that she's just one of the Plastics now. Low grades in calculus class force Cady to rethink her plans. She joins the decathlon team.",
   47.28181),
  ('Poker',
   'The game and jargon of poker have become import

In [None]:
dense_index.lookup([   "Why does Adderall help you do better on tests? I have a friend that pops an Adderall pill before every one of his tests, he doesn't study, doesn't pay attention in class, and isn't a genius so how does he always do well? I know it makes you focus better but, it can't help teach you new information, can it?"])


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[('Test',
   'However, academic tests are not perfect measures. Tests could only partly measure a student’s memory and maybe their understanding. The test would only be about a small part of the subject, and only at that moment in time. Measurement can vary considerably and randomly based on questions being asked.',
   33.22331),
  ('Intelligence quotient',
   'However, as explained above, IQ tests were highly successful in assessing recruits during wartime. Therefore, "it must be true that they are measuring a relevant mental capability". Therefore, IQs are not simply a mathematical fiction: they relate to the ability of individuals to perform certain functions. Even if experts do not agree on a definition of intelligence, that does not disprove the usefulness (or otherwise) of the tests. In every day life people do notice the relative intelligence of others. The issue is central to human nature and evolutionary psychology, because humans evolved the characteristics which helped them

In [None]:
dense_index.lookup([     "Why are things that used to be funny, not funny anymore? Why is it that they don't make movies like The Mask anymore? It seems like things are funny for a while, and then they're not. The Mask was a huge hit in the 90s, but I think if a movie was made today in that similar style, it wouldn't work. Why is that? It was funny then- why isn't it funny now? Another example is The Internship (released 2013). The Onion made fun of it, saying it was 'poised to be biggest comedy of 2005.' Why is certain humor stuck in a particular time period? Thanks!"
])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[('Pinocchio (1940 movie)',
   'Nevertheless, there were positive reactions to the movie as well. Archer Winsten, who had criticized "Snow White", said that "The faults (mistakes) that were in "Snow White" no longer exist. In writing of "Pinocchio", you are limited only by your own power of expressing enthusiasm". Also, despite the poor timing of the release, the movie did do well both critically and at the box office in the United States. Jiminy Cricket\'s song, "When You Wish Upon a Star," became a major success and still is today, and is the fanfare for The Walt Disney Company. "Pinocchio" also won the Academy Award for Best Song and the Academy Award for Best Scoring of a Musical Picture. In 1994, "Pinocchio" was added to the United States National FilmRegistry as being very important in culture, history, or aesthetic. In 2001 Terry Gilliam picked it as one of the ten best animated movies of all time and in 2005 Time.com named it one of the 100 best movies of the last 80 years. Ma

In [None]:
dense_index.lookup(["How do planes fly"])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[('Flight',
   'Flight or flying is a process of movement of the object in a gaseous environment or a vacuum without contact with the surface of another environment, as is possible with the use of jet thrust, or other means of propulsion, and without it (by inertia). Best adapted for long controlled powered flight are flying birds and insects, and mammals of the order Chiroptera and extinct reptiles of the order pterosaurs. All of these animals use the aerodynamic principles of flight, using the load-bearing properties of the wing. There are also many animals that make gliding and parachute flight, such as spiders web, aeroplankton flying clams and flying fish, intending to use the ear or leather folds, mammals and reptiles. The fastest flying animal is the peregrine falcon. The speed of its dive is over .',
   38.110172),
  ('Aircraft',
   'Big aeroplanes for transporting people are called "airliners". Airliners are the quickest way to travel. Airliners can fly over mountains and bad

## **3.3** Using a Reranking Cross Encoder

In [None]:
from sentence_transformers import CrossEncoder
from tqdm.auto import tqdm


class RerankingDenseRetrievalIndex(DenseRetrievalIndex):
  """
  Dense retrieval followed by cross-encoder reranking of the retrieved candidates.
  """

  def __init__(self, *args, **kwargs):
    """
    Accepts the same arguments as `DenseRetrievalIndex` and additionally loads
    the cross-encoder used for reranking.
    """
    super().__init__(*args, **kwargs)
    self.cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

  def lookup(self, query_strs, topk=5, initial_topk=50):
    """
    retrieve `initial_topk` candidates as in `DenseRetrievalIndex` class, but
    then rerank them according to scores of `self.cross_encoder`

    Returns one list per query with at most `topk` (title, passage, score)
    tuples, ordered from highest to lowest cross-encoder score. The score in
    the result is the cross-encoder score, not the dense retrieval score.
    """
    if len(query_strs) == 0:
      return []

    # Never retrieve fewer candidates than the number of results requested.
    candidates_per_query = super().lookup(query_strs, max(initial_topk, topk))

    results = []
    for query, candidates in tqdm(zip(query_strs, candidates_per_query), total=len(query_strs)):
      if topk <= 0 or not candidates:
        results.append([])
        continue

      rerank_scores = np.asarray(self.cross_encoder.predict(
          [(query, title + " " + passage) for title, passage, _ in candidates]))

      # A full descending sort followed by a slice is safe for any topk.
      # np.argpartition(scores, -topk) raises once topk exceeds the number of
      # candidates, and the candidate list is small enough to sort outright.
      best_first = np.argsort(-rerank_scores, kind="stable")[:topk]

      results.append([
          (candidates[i][0], candidates[i][1], rerank_scores[i])
          for i in best_first
      ])
    return results

In [None]:

# Loads the FAISS index file written by the dense-index cell ('msmarco_sbert.faiss')
# instead of re-embedding the corpus; constructing the object also loads the cross-encoder.
ranking_index = RerankingDenseRetrievalIndex.from_file(passages, 'msmarco_sbert.faiss')

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
ranking_index.lookup(["why is the sky blue?", "why do flocks of birds fly in a v formation?"], topk=5)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

[[('Sky',
   'The sky, which is made up of gas molecules, is blue because of the random scattering of sunlight by the molecules. Rayleigh scattering defines the amount of scattering of light rays. Blue light scatters much more than red, which is why the sky appears blue on a clear day. Depending on the time of day, the sky may appear different colors. At dawn or dusk the sky may appear red, orange, or even green and purple depending on how low the sun is and how close it is to night.',
   9.079622),
  ('Moon',
   'In the Earth, the sky is blue because the blue rays of the sun bounce off the gases in the atmosphere, making it look like blue light is coming from the sky. But on the moon, because there is no atmosphere, the sky looks black, even in the daytime. There is no atmosphere to protect the moon from the rocks that fall from outer space, and these meteorites crash right into the moon and make wide, shallow holes called craters. The moon has thousands of them. Newer craters gradual

In [None]:
ranking_index.lookup(["How come some iOS apps have a 'Small bug fix' update for 200MB, yet apps like Minecraft can add in loads of content for 120MB"])



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[[('Software versioning',
   'Many times, after a program has been made, things called "patches" can be downloaded from the program\'s website. The patches usually make small updates or fixes to the main program, such as fixing bugs or updating information or program functions. When a patch is used, the program is advanced to the next version, advancing the version number by the appropriate level.',
   -5.338006),
  ('IOS 11',
   'Some of iOS 11\'s changes: the lock screen and Notification Center are combined, allowing all notifications to be displayed directly on the lock screen. The various pages of the Control Center are unified, gaining custom settings and the ability to 3D Touch icons for more options. The App Store receives a big visual change to focus on editorial content and daily highlights. A "Files" file manager app allows direct access to files stored locally and in cloud services. Siri can now translate between languages and use a privacy-minded "on-device learning" techni

In [None]:
ranking_index.lookup(["How do computers learn to 'read' programming languages?"])


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[[('Computer programming',
   'Computers understand instructions directly if those instructions are written in machine code, special characters that can be processed by the computer but are difficult for humans to read. Writing these instructions directly in machine code takes a long time and is difficult, so instructions are written in a language easier for people to use, which the computer converts into "computer form" instructions (in other words, machine code) so the computer can follow them. The simplest of these is assembly language. Programs written in a language more like English are converted by a compiler. Some languages, called interpreted languages, use interpreters instead of assemblers or compilers.',
   5.178899),
  ('Programming language',
   'A programmer writes source code text in the programming language to create programs. Usually, the programming language uses real words for some of the commands, so that the language is easier for a human to read. Many programming 

In [None]:
ranking_index.lookup(["Why are things 'catchy', you know like a beat or something?"])


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[[('Yummy (song)',
   'Rania Aniftos of "Billboard" magazine said "Yummy" brings back "the flirtatious Bieber we\'ve missed and been waiting for", describing the chorus as "catchy". Bryan Rolli, writing for "Forbes", said that Bieber "sings his heart out" on the song and noted although "the lyrics may not invite scholarly analysis ... Bieber sure does sound good singing them". Rolli concluded calling the song a "win, an inevitable chart smash" and opined that it is "sure to sound even better when 50,000 fans scream it every night on his upcoming tour". Mikael Wood of "Los Angeles Times" described the song as "a lithe little R&B number that faintly recalls Ginuwine\'s mid-\'90s classic \'Pony\' and basically three-and-a-half minutes of PG-13 sex talk seemingly directed at Hailey Baldwin". He stated that "though it\'s very cute, \'Yummy\' feels awfully lightweight for a single that has as much hanging on it as this one does" and added that the song "loses much of its flavor after only a 

In [None]:
# Spot-check the index with a long, multi-sentence question (title plus body).
ranking_index.lookup(
    ["Why does Adderall help you do better on tests? I have a friend that pops an Adderall pill before every one of his tests, he doesn't study, doesn't pay attention in class, and isn't a genius so how does he always do well? I know it makes you focus better but, it can't help teach you new information, can it?"]
)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[[('Anaphylaxis',
   'If a person has a bad reaction to an insect sting or a medication, blood tests for tryptase or histamine might be useful in diagnosing anaphylaxis. However, these tests are not very useful if the cause is food, or if the person has a normal blood pressure. Also, these tests cannot say for sure that a person does not have anaphylaxis.',
   -8.131294),
  ('Ibuprofen',
   'If you are taking this drug "as needed" (not on a regular schedule), remember that pain medications work best if they are used as the first signs of pain occur. If you wait until the pain has worsened, the medication may not work as well.',
   -8.234266),
  ('Depression (mental illness)',
   'If people with depression do not take their medicine the right way, the depression can get worse. A doctor must help when they want to change to another medication, or to take a different amount of a medication than before.',
   -8.324897),
  ('Dyslexia',
   'In order to tell if a child has dyslexia, he or she

In [None]:
# Spot-check the index with a long question that names specific films.
ranking_index.lookup(
    ["Why are things that used to be funny, not funny anymore? Why is it that they don't make movies like The Mask anymore? It seems like things are funny for a while, and then they're not. The Mask was a huge hit in the 90s, but I think if a movie was made today in that similar style, it wouldn't work. Why is that? It was funny then- why isn't it funny now? Another example is The Internship (released 2013). The Onion made fun of it, saying it was 'poised to be biggest comedy of 2005.' Why is certain humor stuck in a particular time period? Thanks!"]
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[[('The Mask: The Animated Series',
   'The cartoon, despite having a "realistic" setting in the fictional town of Edge City, would often rely more on Tex Avery-style humor and on occasion, broke any meaningful laws of reality - one episode featured the Goofalotatots, parodies of the Animaniacs, treating them as if they were naturally alive. Another featured the Mask becoming a personal assistant to the President of the US, with the job merely handed to him (the president was a caricature of no real president - it should also be noted that former Mask comic writer John Arcudi wrote both example episodes, a stark departure from his usual writing). Police officers were portrayed as idiots who could not see even obvious clues.',
   0.3692649),
  ('The Mask: The Animated Series',
   "For villains as for the Mask's transformation, there was often DC Comics parodies, as Supermask and Super Salad Man (a parody of Superman). Some Marvel references were made too, like The Mask becoming Biclops 

In [None]:
# Spot-check the index with a very short question.
ranking_index.lookup(
    ["How do planes fly"]
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[[('Aerodynamics',
   'For aircraft, lift (force) is the force pushing up. Gravity is the force pushing down. If lift is stronger than gravity the plane goes up. If gravity is stronger than lift, the plane goes down. In planes, lift is caused by airflow. Airplane wings are designed in such a way that, the air molecules prefer moving under the wing rather than above it, so as a result there is more air flowing underneath the wing compared to above it. This motion of the air gives upward lift to the aircraft. "See Bernoulli\'s principle."',
   5.645568),
  ('Aircraft',
   'Some aircraft keep in the sky by moving air over their wings. Examples are aeroplanes, helicopters, and gliders. Some aircraft keep in the sky by floating. Examples are balloons and airships.',
   5.2357535),
  ('Aircraft carrier',
   'The flight deck of a large, modern aircraft carrier has a landing area and a take-off area. The landing area is in the back and has a short runway. Airplanes stop by using a hook on the 

# 4. Retrieval-conditioned answer generation and ROUGE evaluation

In [None]:
from transformers import AutoTokenizer, BartForConditionalGeneration

# The tokenizer and the BART weights are loaded from the same checkpoint, so
# the identifier is written once and shared by both calls.
QAR_CHECKPOINT = 'yjernite/bart_eli5'

qar_tokenizer = AutoTokenizer.from_pretrained(QAR_CHECKPOINT)
qar_model = BartForConditionalGeneration.from_pretrained(QAR_CHECKPOINT)



config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset, load_metric

# Load ELI5 without a `split` argument, then pick the "validation_eli5" split
# out of the returned collection by key.
raw_datasets = load_dataset("eli5")
val = raw_datasets["validation_eli5"]


Downloading builder script:   0%|          | 0.00/18.2k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.50k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/576M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/21.1M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/286M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.65M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/330M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/18.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/36.2M [00:00<?, ?B/s]

In [None]:
# Query = post title and post body joined by a single space.
queries = [
    " ".join((title, body))
    for title, body in zip(val["title"], val["selftext"])
]

# Reference answer = the first text listed under each example's answers.
labels = [answer_set["text"][0] for answer_set in val["answers"]]

# Retrieve a single hit per query and keep element 1 of that hit, which is the
# passage text in the (title, text, score) tuples displayed by the lookup
# cells earlier in this notebook.
top_hits = ranking_index.lookup(queries, topk=1)
rel_results = [hits[0][1] for hits in top_hits]

# Generator input = retrieved passage, one space, then the query.
# NOTE(review): confirm this plain concatenation matches the input format the
# generation checkpoint was fine-tuned on.
inputs = [
    " ".join((passage, query))
    for query, passage in zip(queries, rel_results)
]

Batches:   0%|          | 0/77 [00:00<?, ?it/s]

  0%|          | 0/9812 [00:00<?, ?it/s]

In [None]:
# Attach the generator inputs and reference answers to the validation split.
#
# This cell rebinds `val` to a dataset that already contains the new columns,
# so executing it a second time would try to add "inputs"/"labels" again and
# fail on the duplicate column names. Dropping any earlier copies first makes
# the cell safe to re-run after `inputs` or `labels` have been recomputed.
stale_columns = [name for name in ("inputs", "labels") if name in val.column_names]
if stale_columns:
    val = val.remove_columns(stale_columns)

val = val.add_column("inputs", inputs)
val = val.add_column("labels", labels)

In [None]:
import torch
from transformers import pipeline

# Use the first GPU when one is visible; otherwise fall back to the CPU so the
# cell still runs (slowly) on a CPU-only runtime instead of raising.
generation_device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Batched text-to-text generation pipeline wrapping the model and tokenizer
# loaded above.
text2text_generator = pipeline(
    "text2text-generation",
    model=qar_model,
    tokenizer=qar_tokenizer,
    batch_size=32,
    device=generation_device,
)

In [None]:
from tqdm.notebook import tqdm


def data_iterator():
    """Yield the validation inputs one at a time, wrapped in a progress bar."""
    yield from tqdm(val["inputs"])


# Feed the inputs to the pipeline lazily; each item it yields back is a list
# of prediction dicts, which is flattened into a single list.
results = text2text_generator(data_iterator(), truncation=True, max_length=512)
res_array = [prediction for batch_output in results for prediction in batch_output]

  0%|          | 0/9812 [00:00<?, ?it/s]

In [None]:
# Score the generated answers against the reference answers with ROUGE.
rouge = load_metric("rouge")

refs = val["labels"]
preds = [generation["generated_text"] for generation in res_array]

rouge_score = rouge.compute(predictions=preds, references=refs)
print(rouge_score)


  rouge = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

{'rouge1': AggregateScore(low=Score(precision=0.2738433341163382, recall=0.12357527274616487, fmeasure=0.12338890220129814), mid=Score(precision=0.27832657777457037, recall=0.12517122259684543, fmeasure=0.12506914635056837), high=Score(precision=0.28250239772096813, recall=0.12678282053531273, fmeasure=0.12669779163588923)), 'rouge2': AggregateScore(low=Score(precision=0.045979171108486275, recall=0.017343132298929927, fmeasure=0.018341081585515673), mid=Score(precision=0.047432489868083856, recall=0.017852966228043533, fmeasure=0.018843032298870334), high=Score(precision=0.04899521084135674, recall=0.018421574353266194, fmeasure=0.019417837673506827)), 'rougeL': AggregateScore(low=Score(precision=0.2031456150801572, recall=0.0968848684440801, fmeasure=0.09255376610945816), mid=Score(precision=0.20608158675757185, recall=0.09813305284814115, fmeasure=0.09367966501847907), high=Score(precision=0.20925001102972005, recall=0.09944360211793808, fmeasure=0.09479024541516179)), 'rougeLsum': 

# 5. Comparing BM25, dense, and re-ranked retrieval for answer generation

In [None]:
# Five sample questions reused by the BM25, dense, and ranking lookups below.
questions = [
    "How come some iOS apps have a 'Small bug fix' update for 200MB, yet apps like Minecraft can add in loads of content for 120MB",
    "How do computers learn to 'read' programming languages?",
    "Why are things 'catchy', you know like a beat or something?",
    "Why does Adderall help you do better on tests? I have a friend that pops an Adderall pill before every one of his tests, he doesn't study, doesn't pay attention in class, and isn't a genius so how does he always do well? I know it makes you focus better but, it can't help teach you new information, can it?",
    "Why are things that used to be funny, not funny anymore? Why is it that they don't make movies like The Mask anymore? It seems like things are funny for a while, and then they're not. The Mask was a huge hit in the 90s, but I think if a movie was made today in that similar style, it wouldn't work. Why is that? It was funny then- why isn't it funny now? Another example is The Internship (released 2013). The Onion made fun of it, saying it was 'poised to be biggest comedy of 2005.' Why is certain humor stuck in a particular time period? Thanks!",
]

# Take the single top hit per question from the BM25 index and build the
# generator input as "<element 1 of the hit> <question>".
bm25_results = bm25_index.lookup(questions, topk=1)
bm25_inputs = [
    " ".join((hits[0][1], question))
    for question, hits in zip(questions, bm25_results)
]


In [None]:
# Show the generator inputs built from the BM25 lookup results.
print(bm25_inputs)

["iOS 7 provides full multitasking for all apps. That means that apps will update their content by themselves and instead of showing the app icon, it shows a screenshot of what the app looks like. How come some iOS apps have a 'Small bug fix' update for 200MB, yet apps like Minecraft can add in loads of content for 120MB", "A programming language is a type of written language that tells computers what to do in order to work. Programming languages are used to make all the computer programs and computer software. A programming language is like a set of instructions that the computer follows to do something. How do computers learn to 'read' programming languages?", 'Rowling has also maintained that Harry is a suitable real-life role model for children. "The advantage of a fictional hero or heroine is that you can know them better than you can know a living hero, many of whom you would never meet […] if people like Harry and identify with him, I am pleased, because I think he is very likea

In [None]:
# Take the single top hit per question from the dense index and build the
# generator input as "<element 1 of the hit> <question>".
dense_results = dense_index.lookup(questions, topk=1)
dense_inputs = [
    " ".join((hits[0][1], question))
    for question, hits in zip(questions, dense_results)
]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Show the generator inputs built from the dense lookup results.
print(dense_inputs)

["On February 1, 2015, Hristo Terezov, Ingo Bauersachs and the rest of the team released version 2.6 from their stand at the Free and Open Source Software Developers' European Meeting 2015 event in Brussels. This release includes security fixes, removes support of the deprecated MSN protocol, along with SSLv3 in XMPP. Among other improvements, the OS X version has aa Java 8 runtime inside, enables echo cancelling by default, and uses the CoreAudio subsystem. The Linux build fixes font issues with the GTK+ native LookAndFeel, and fixes someissues about microphone volume level on call starting when using the PulseAudio sound system. A full list of changes is available on the project web site. How come some iOS apps have a 'Small bug fix' update for 200MB, yet apps like Minecraft can add in loads of content for 120MB", 'Computers understand instructions directly if those instructions are written in machine code, special characters that can be processed by the computer but are difficult fo

In [None]:
# Take the single top hit per question from `ranking_index` and build the
# generator input as "<element 1 of the hit> <question>".
reranked_results = ranking_index.lookup(questions, topk=1)
reranked_inputs = [
    " ".join((hits[0][1], question))
    for question, hits in zip(questions, reranked_results)
]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# Show the generator inputs built from the `ranking_index` lookup results.
print(reranked_inputs)

['Many times, after a program has been made, things called "patches" can be downloaded from the program\'s website. The patches usually make small updates or fixes to the main program, such as fixing bugs or updating information or program functions. When a patch is used, the program is advanced to the next version, advancing the version number by the appropriate level. How come some iOS apps have a \'Small bug fix\' update for 200MB, yet apps like Minecraft can add in loads of content for 120MB', 'Computers understand instructions directly if those instructions are written in machine code, special characters that can be processed by the computer but are difficult for humans to read. Writing these instructions directly in machine code takes a long time and is difficult, so instructions are written in a language easier for people to use, which the computer converts into "computer form" instructions (in other words, machine code) so the computer can follow them. The simplest of these is 

In [None]:
# Generate one answer per BM25-conditioned input and print the raw pipeline
# output.
answers_bm25 = text2text_generator(
    bm25_inputs,
    truncation=True,
    max_length=512,
)
print(answers_bm25)



[{'generated_text': ' The app is not updating the content, it is updating the app itself.'}, {'generated_text': ' The computer is a program. It is a set of instructions. The computer follows the instructions. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The computer is a program. The comp

In [None]:
# Generate one answer per dense-conditioned input and print the raw pipeline
# output.
answers_dense = text2text_generator(
    dense_inputs,
    truncation=True,
    max_length=512,
)
print(answers_dense)

[{'generated_text': ' The difference is in the size of the app. The size of the app is not the same as the size of the app.'}, {'generated_text': " The computer doesn't understand the instructions directly. It understands the language the language is written in. The computer is programmed to interpret the language it is written in."}, {'generated_text': ' Classic physics explains most physical interactions, like why a ball bounces when it drops. Quantum mechanics explains some interactions, like why a light can turn into electricity.'}, {'generated_text': " It's not a good idea to take a drug that makes you focus better. It's a good idea to take a drug that makes you focus better."}, {'generated_text': " I think it's because the internet has changed the definition of what is funny."}]


In [None]:
# Generate one answer per input built from the `ranking_index` results and
# print the raw pipeline output.
answers_reranked = text2text_generator(
    reranked_inputs,
    truncation=True,
    max_length=512,
)
print(answers_reranked)

[{'generated_text': ' The app you download is a "small bug fix" update. The app you download is a "large bug fix" update.'}, {'generated_text': " The computer doesn't understand the instructions directly. It understands the language the language is written in. The computer is programmed to interpret the language it is written in."}, {'generated_text': " It's a catchy song, but it's not a great song. It's a catchy song with a good beat, but it's not a great song."}, {'generated_text': " Adderall is a stimulant. It makes you more alert and helps you focus. It also makes you more likely to pay attention. It's a stimulant."}, {'generated_text': " It's funny when you're not expecting it."}]
