In [None]:
!pip install sentence_transformers
!pip install faiss-gpu

Collecting sentence_transformers
  Obtaining dependency information for sentence_transformers from https://files.pythonhosted.org/packages/06/97/57afa3d05801b6b9305f96a7ce5995e12c1d2ba25ce66747de107816b0b5/sentence_transformers-2.3.1-py3-none-any.whl.metadata
  Downloading sentence_transformers-2.3.1-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-2.3.1
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
import gc
import logging
from tqdm.auto import tqdm

import numpy as np
from datasets import load_dataset
from transformers import pipeline
from sentence_transformers import SentenceTransformer, LoggingHandler
import faiss

In [None]:
if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Corpus: one Wikipedia sentence per row, exposed in the 'text' column.
    # ------------------------------------------------------------------
    wiki_corpus_paths = '/kaggle/input/wikipedia-sentences/wikisent2.txt'
    processed_wiki_dataset = load_dataset("text", data_files={"train": wiki_corpus_paths}, split='train')
    num_sentences = len(processed_wiki_dataset)
    print(f'The Wikipedia corpus contains {num_sentences} sentences.')

    # ------------------------------------------------------------------
    # Sentence encoder.
    # ------------------------------------------------------------------
    model_name = "all-MiniLM-L6-v2"
    model = SentenceTransformer(model_name)
    embedding_dim = model.get_sentence_embedding_dimension()
    max_seq_len = model.max_seq_length
    print(f'The embedding dimension of the all-MiniLM-L6-v2 model is {embedding_dim}.')
    print(f"Max sequence lenght of the {model_name} model is {max_seq_len}.")

    # ------------------------------------------------------------------
    # Index: IVF-PQ with an HNSW coarse quantizer, inner-product metric
    # (embeddings are L2-normalised below, so inner product == cosine).
    # ------------------------------------------------------------------
    M = 32
    quantizer = faiss.IndexHNSWFlat(
                        embedding_dim,
                        M,
                        faiss.METRIC_INNER_PRODUCT
                        )
    nlist = 10_000
    nsegment = 16
    nbit = 8
    cpu_index = faiss.IndexIVFPQ(
                        quantizer,
                        embedding_dim,
                        nlist, nsegment, nbit,
                        faiss.METRIC_INNER_PRODUCT
                        )

    gpu_index = faiss.index_cpu_to_all_gpus(
        cpu_index
    )

    # ------------------------------------------------------------------
    # Encode the corpus in large batches and add it to the index.
    # ------------------------------------------------------------------
    pool = model.start_multi_process_pool()
    batch_size = 2**19
    # Ceiling division without materialising the text column.
    total_batches = (num_sentences + batch_size - 1) // batch_size
    try:
        for i in tqdm(range(0, num_sentences, batch_size), total=total_batches, desc="Processing Batches"):
            # Slice rows first, then pick the column: indexing the column first
            # would decode all rows of the corpus on every iteration.
            batch_texts = processed_wiki_dataset[i:i + batch_size]['text']
            batch_embeddings = model.encode_multi_process(
                                        batch_texts,
                                        pool,
                                        normalize_embeddings=True
                                        )
            # Train exactly once (on the first batch). The coarse centroids and
            # PQ codebooks must stay fixed once vectors have been added,
            # otherwise the codes already stored no longer match the codebooks.
            if not gpu_index.is_trained:
                gpu_index.train(batch_embeddings)
            gpu_index.add(batch_embeddings)
            del batch_embeddings
            gc.collect()
    finally:
        # Always release the encoder worker processes, even if indexing fails.
        model.stop_multi_process_pool(pool)

    # NOTE(review): if index_cpu_to_all_gpus returned a multi-GPU wrapper rather
    # than a single GPU IVF index, confirm this attribute reaches the replicas.
    gpu_index.nprobe = 1_000

    def search_wiki_articles(question):
        """Return up to five corpus sentences closest to ``question``.

        Args:
            question: A single question string.

        Returns:
            list[str]: Matching rows of ``processed_wiki_dataset['text']``,
            in the order the index returned them.
        """
        # Encode a one-element *list*. Handing a bare string to a batch encoder
        # makes it iterate over characters, so only the first character's hits
        # were returned -- consistent with the recorded results all keying on "W".
        # The in-process encoder is used so the function does not depend on the
        # worker pool, which has already been stopped at this point.
        question_embedding = model.encode([question], normalize_embeddings=True)
        _, indices = gpu_index.search(question_embedding, k=5)
        # FAISS pads with -1 when fewer than k neighbours are found; skip those
        # so they are not interpreted as "last row of the dataset".
        return [processed_wiki_dataset[int(i)]['text'] for i in indices[0] if i >= 0]

    questions = [
        'What is the Carnot engine?',
        'What is a virtual particle?',
        'What is Lorentz symmetry or Lorentz invariance in relativistic physics?',
        'What did Newton adopt after his correspondence with Hooke in 1679-1680?',
        'What is the difference between redshift due to the expansion of the universe and Doppler redshift?',
    ]

    relevant_article_chunks = [search_wiki_articles(question) for question in questions]

    print(f'Question:\n{questions[0]}')
    print(f'Relevant Wikipedia article chunks:\n{relevant_article_chunks[0]}')

Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-0c393fc0da9c1b19/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-0c393fc0da9c1b19/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8. Subsequent calls will reuse this data.
The Wikipedia corpus contains 7871825 sentences.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

The embedding dimension of the all-MiniLM-L6-v2 model is 384.
Max sequence lenght of the all-MiniLM-L6-v2 model is 256.




Processing Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Question:
What is the Carnot engine?
Relevant Wikipedia article chunks:
['He received the Wm.', 'W Network (often shortened to W) is a Canadian English language Category A specialty channel, owned by Corus Entertainment.', 'WNWS (1520 AM) is a radio station broadcasting a Soft Adult Contemporary format.', 'WWLW is owned and operated by West Virginia Radio Corporation.', 'WYOU Community Television, Inc (WYOU) is a nonprofit Public, educational, and government access (PEG) cable television station for the Madison, Wisconsin area.']


In [None]:
# Extractive question-answering reader; the same checkpoint name supplies both
# the model weights and the tokenizer.
qa_model_name = 'distilbert-base-cased-distilled-squad'
question_answerer = pipeline(
    task='question-answering',
    model=qa_model_name,
    tokenizer=qa_model_name,
)
qa_max_tokens = question_answerer.tokenizer.model_max_length
print(f'The maximum length of tokens the we can feed to the tokenizer before truncation is {qa_max_tokens}.')

def answer_question(question, article_chunks):
    """Run the QA pipeline on ``question`` against the joined chunks.

    Args:
        question: The question text.
        article_chunks: Iterable of strings; they are joined with single
            spaces to form the context passed to the pipeline.

    Returns:
        Whatever ``question_answerer`` returns for this question/context pair.
    """
    return question_answerer(question=question, context=' '.join(article_chunks))

# Answer every question from its own retrieved chunks.
results = []
for asked, chunks in zip(questions, relevant_article_chunks):
    results.append(answer_question(asked, chunks))

# Report each answer with its score and character span inside the context.
separator = '=' * 30
for asked, outcome in zip(questions, results):
    print(f'Question:\n{asked}')
    print(f"Answer:\n'{outcome['answer']}',\nscore:{round(outcome['score'], 4)}, start:{outcome['start']}, end:{outcome['end']}")
    print(separator)

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

The maximum length of tokens the we can feed to the tokenizer before truncation is 512.
Question:
What is the Carnot engine?
Answer:
'WNWS',
score:0.0615, start:144, end:148
Question:
What is a virtual particle?
Answer:
'Soft Adult Contemporary',
score:0.5277, start:193, end:216
Question:
What is Lorentz symmetry or Lorentz invariance in relativistic physics?
Answer:
'Soft Adult Contemporary',
score:0.0478, start:193, end:216
Question:
What did Newton adopt after his correspondence with Hooke in 1679-1680?
Answer:
'Wm. W Network',
score:0.1945, start:16, end:29
Question:
What is the difference between redshift due to the expansion of the universe and Doppler redshift?
Answer:
'Soft Adult Contemporary format',
score:0.078, start:193, end:223


In [None]:
"""Official evaluation script for SQuAD version 2.0.

In addition to basic functionality, we also compute additional statistics and
plot precision-recall curves if an additional na_prob.json file is provided.
This file is expected to map question IDs to the model's predicted probability
that a question is unanswerable.
"""
import argparse
import collections
import json
import numpy as np
import os
import re
import string
import sys

# Module-level placeholder, initialised to None. Nothing in the visible cells
# assigns or reads it -- presumably meant to hold the result of parse_args();
# TODO confirm against the rest of the script.
OPTS = None

def parse_args():
    """Define the evaluator's command-line options and parse ``sys.argv``.

    When the process was started without any arguments, the help text is
    printed and the process exits with status 1.

    Returns:
        argparse.Namespace: the parsed options.
    """
    cli = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.')
    register = cli.add_argument

    # Required positional inputs.
    register('data_file', metavar='data.json', help='Input data JSON file.')
    register('pred_file', metavar='pred.json', help='Model predictions.')

    # Optional outputs and no-answer handling.
    register('--out-file', '-o',
             metavar='eval.json',
             help='Write accuracy metrics to file (default is stdout).')
    register('--na-prob-file', '-n',
             metavar='na_prob.json',
             help='Model estimates of probability of no answer.')
    register('--na-prob-thresh', '-t',
             type=float,
             default=1.0,
             help='Predict "" if no-answer probability exceeds this (default = 1.0).')
    register('--out-image-dir', '-p',
             metavar='out_images',
             default=None,
             help='Save precision-recall curves to directory.')
    register('--verbose', '-v', action='store_true')

    invoked_without_arguments = len(sys.argv) == 1
    if invoked_without_arguments:
        cli.print_help()
        sys.exit(1)
    return cli.parse_args()

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
    """Split ``s`` into normalised whitespace-separated tokens.

    A falsy ``s`` (empty string, ``None``) yields an empty list without
    being normalised.
    """
    return normalize_answer(s).split() if s else []

def compute_exact(a_gold, a_pred):
    """Return 1 if the two answers are equal after normalisation, else 0."""
    gold_norm = normalize_answer(a_gold)
    pred_norm = normalize_answer(a_pred)
    return int(gold_norm == pred_norm)

def compute_f1(a_gold, a_pred):
    """Token-level F1 between a gold answer and a predicted answer.

    Both answers are tokenised with ``get_tokens``. If either token list is
    empty the result is 1 when both are empty and 0 otherwise. With no
    shared tokens the result is 0; otherwise it is the harmonic mean of
    precision and recall over the multiset token overlap.
    """
    reference = get_tokens(a_gold)
    candidate = get_tokens(a_pred)

    if not reference or not candidate:
        # An empty side only matches another empty side.
        return int(reference == candidate)

    shared = collections.Counter(reference) & collections.Counter(candidate)
    overlap = sum(shared.values())
    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(candidate)
    recall = 1.0 * overlap / len(reference)
    return (2 * precision * recall) / (precision + recall)

In [None]:
# Sanity-check the QA pipeline and the metric helpers on one SQuAD example.
# `dataset` is kept as an alias of `squad_dataset` for any code that uses it.
squad_dataset = dataset = load_dataset('squad', split='train')

# Fetch row 0 once. Indexing a column first (squad_dataset['question'][0])
# decodes that entire column just to read a single element.
first_example = squad_dataset[0]
question = first_example['question']
context = first_example['context']
answer = first_example['answers']

result = question_answerer(question=question, context=context)

# Score the prediction against the first listed gold answer.
gold_answer = answer['text'][0]
exact_score = compute_exact(a_gold=gold_answer, a_pred=result['answer'])
f1_score = compute_f1(a_gold=gold_answer, a_pred=result['answer'])

print(f'Question:\n{question}')
print(f"Predited Answer:\n{result['answer']}")
print(f"Proper answer accoding to SQuAD:\n{gold_answer}")
print(f'F1: {f1_score},\tE: {exact_score}')

Downloading builder script:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.
Question:
To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Predited Answer:
Saint Bernadette Soubirous
Proper answer accoding to SQuAD:
Saint Bernadette Soubirous
F1: 1.0,	E: 1
