# Simple Question-Answering Demo with distilbert and ArXiv `astro-ph` Dataset

### Utility Functions

In [1]:
import zipfile
import json
import pandas as pd
import io
import fsspec

def fetch_arxiv_dataset(zip_url: str) -> pd.DataFrame:
    """Load arXiv metadata records from a zipped JSON-lines file.

    The archive is opened through ``fsspec.open``, the first member of the
    zip is read line by line, and every non-blank line is parsed as one JSON
    document. Only the ``id``, ``title``, ``abstract`` and ``categories``
    fields of each document are kept.

    Args:
        zip_url: Location of the zip archive, handed to ``fsspec.open``
            (the notebook passes a local file path).

    Returns:
        A DataFrame with the columns ``id``, ``title``, ``abstract`` and
        ``categories``, one row per parsed line.

    Raises:
        IndexError: If the archive contains no members.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a document lacks one of the four expected fields.
    """
    cols = ['id', 'title', 'abstract', 'categories']
    rows = []

    # Distinct names for the two handles: the outer one is the zip file itself,
    # the inner one is the member being read.
    with fsspec.open(zip_url) as zip_handle:
        with zipfile.ZipFile(zip_handle) as archive:
            # infolist() is the documented accessor for the archive's members.
            first_member = archive.infolist()[0]
            with archive.open(first_member) as member_handle:
                # NOTE(review): latin-1 never fails to decode, but it would
                # mangle non-ASCII characters if the file is really UTF-8 --
                # confirm the dataset's encoding before relying on such text.
                text_lines = io.TextIOWrapper(member_handle, encoding="latin-1")
                for line in text_lines:
                    # Blank lines carry no record and would make json.loads raise.
                    if not line.strip():
                        continue
                    doc = json.loads(line)
                    rows.append([doc[col] for col in cols])

    return pd.DataFrame(data=rows, columns=cols)

# https://github.com/allenai/open-instruct/blob/main/eval/templates.py
def create_prompt_with_olmo_chat_format(messages, bos="|||IP_ADDRESS|||", eos="|||IP_ADDRESS|||", add_bos=True):
    """Render a list of chat messages as one Olmo-style prompt string.

    Each message is a mapping with a ``"role"`` and a ``"content"`` entry.
    System and user turns are written as ``<|role|>\\n`` + content + newline.
    Assistant turns have their content stripped and are followed by ``eos``
    and a newline. An open ``<|assistant|>\\n`` tag is appended at the end.

    Args:
        messages: Iterable of message mappings, processed in order.
        bos: Text placed in front of the whole prompt.
        eos: Text placed after every assistant turn.
        add_bos: Accepted but never consulted; ``bos`` is always prepended.

    Returns:
        The formatted prompt string.

    Raises:
        ValueError: If a message has a role other than ``system``, ``user``
            or ``assistant``.

    NOTE(review): the default ``bos``/``eos`` values look like placeholders
    rather than real special tokens -- confirm against the upstream template.
    """
    pieces = []
    for turn in messages:
        role = turn["role"]
        if role == "assistant":
            pieces.append("<|assistant|>\n" + turn["content"].strip() + eos + "\n")
        elif role == "system":
            pieces.append("<|system|>\n" + turn["content"] + "\n")
        elif role == "user":
            pieces.append("<|user|>\n" + turn["content"] + "\n")
        else:
            raise ValueError(
                "Olmo chat template only supports 'system', 'user' and 'assistant' roles. Invalid role: {}.".format(role)
            )

    # Leave the prompt open on an assistant tag.
    pieces.append("<|assistant|>\n")
    # bos is prepended unconditionally, regardless of add_bos.
    return bos + "".join(pieces)

# Post-processing
def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line."""
    contents = [entry.page_content for entry in docs]
    return "\n\n".join(contents)



### Retrieve documents (arXiv `astro-ph` abstracts)

This section retrieves the arXiv abstracts and creates documents
for loading into a vector database. You can skip running the following sections
if you have a local copy of the Qdrant Vector Database data ready to go.

In [2]:
from langchain_community.document_loaders import DataFrameLoader

Note: the `zip_url` below changes weekly.

If you are running the code for the first time or need the latest URL, please go to the arXiv Dataset website and download the latest dataset.

#### You can find the `zip_url` of the locally downloaded dataset by clicking info_directory -> more_info -> where_from.

If you have already downloaded the dataset, you can use its local file path directly (e.g. `file_path`).

In [4]:
import os

# The Kaggle download link is a time-limited signed URL, so it is not kept in
# the notebook. Paste a fresh one here only when fetching remotely, e.g.:
# zip_url = "https://storage.googleapis.com/kaggle-data-sets/<dataset>/bundle/archive.zip?<signed-query-string>"

# Location of the locally downloaded archive. Set the ARXIV_ZIP_PATH environment
# variable to point at your own copy; the fallback is the notebook's original path.
file_path = os.environ.get("ARXIV_ZIP_PATH", "/Users/amy/Desktop/archive.zip")

In [5]:
# Read every arXiv record from the archive at file_path
df_data = fetch_arxiv_dataset(file_path)
# Keep the rows whose categories mention astro-ph, renumber them from zero,
# and retain only the first 1000
astro_df = (
    df_data
    .loc[df_data["categories"].str.contains('astro-ph')]
    .reset_index(drop=True)
    .iloc[:1000]
)
print("Number of astro-ph papers: ", len(astro_df))

Number of astro-ph papers:  1000


In [6]:
# Eagerly load the dataframe of abstracts into memory as langchain
# Document objects; the "abstract" column becomes each Document's page content
loader = DataFrameLoader(astro_df, page_content_column="abstract")
documents = loader.load()

### Document Embeddings to Qdrant Vector Database

In [3]:
from langchain_community.vectorstores import Qdrant
from langchain.embeddings import HuggingFaceEmbeddings

In [4]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [5]:
import os

In [6]:
# Local path of the Qdrant store and the name of the collection kept in it;
# both are used below to either load the collection or create it
qdrant_path="./local_qdrant_first_1000_L6_v2"
qdrant_collection="arxiv_astro-ph_abstracts_first_1000_L6_v2"

In [7]:
# Set up the embedding model handed to the Qdrant vector store;
# we are using the sentence-transformers all-MiniLM-L6-v2 model here
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange


In [8]:
# Build the collection only when nothing exists at qdrant_path yet;
# otherwise reopen what is already stored there.
if not os.path.exists(qdrant_path):
    print(f"Creating new Qdrant collection '{qdrant_collection}' from {len(documents)} documents")

    # Embed the documents and store the collection locally under qdrant_path
    qdrant = Qdrant.from_documents(
        documents,
        embedding,
        path=qdrant_path,
        collection_name=qdrant_collection,
    )
else:
    print(f"Loading existing Qdrant collection '{qdrant_collection}'")
    from qdrant_client import QdrantClient

    # Open the local store and wrap the named collection as a vector store
    client = QdrantClient(path=qdrant_path)
    qdrant = Qdrant(client=client, collection_name=qdrant_collection, embeddings=embedding)
    

Loading existing Qdrant collection 'arxiv_astro-ph_abstracts_first_1000_L6_v2'


#### Build retriever

In [9]:
# Set up the retriever for the question-answering step,
# configured with search_type="mmr" and k=2
retriever = qdrant.as_retriever(search_type="mmr", search_kwargs={"k": 2})

### Question Answering

In [10]:
from pathlib import Path
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, pipeline

In [11]:
import torch

In [12]:
# Question-answering pipeline using the distilbert-base-cased-distilled-squad
# checkpoint, loaded with float32 weights
qa_model = pipeline("question-answering", model="distilbert/distilbert-base-cased-distilled-squad", torch_dtype=torch.float32)


In [27]:
import time

def get_run_time(func):
    """Call ``func`` once and print how many seconds the call took.

    Args:
        func: Zero-argument callable to time.

    Returns:
        None. The return value of ``func`` is discarded. If ``func`` raises,
        the exception propagates and nothing is printed.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() follows the wall clock, which can jump
    # when the system time is adjusted.
    time_start = time.perf_counter()
    func()
    time_end = time.perf_counter()
    print(time_end - time_start)
    
def question_answering_func(question):
    """Answer ``question`` from retrieved abstracts and print the result.

    The retriever is queried with the question, the page contents of the
    returned documents are joined with single spaces into one context string,
    and the question-answering pipeline is run on that context. The question,
    the answer, and the context (labelled "Explanation") are printed.

    Args:
        question: The question text.
    """
    print (f'Question: {question}\n')

    retrieved = retriever.invoke(question)
    context = " ".join(doc.page_content for doc in retrieved)

    qa_response = qa_model(question=question, context=context)
    answer = qa_response["answer"]

    print(f'\nAnswer: {answer}\n')
    print("Explanation:\n"+context)
    
# Sample questions for the demo. Index 0 is an empty string, so the ten
# questions sit at indices 1-10.
questions = [
    "",
    "What millimeter wavelength range is suggested as appropriate to look for dynamic signatures in the solar chromosphere according to the computations by Carlsson and Stein?",
    "How does the globular cluster mass function (GCMF) in the Milky Way vary with cluster half-mass density (rho_h)?",
    "What is the nature of the huge far-infrared luminosity of the Cloverleaf lensed QSO (H1413+117)?",
    "What is the behavior of the angular momentum Lz​ for nearly horizon-skimming orbits around a nearly extremal Kerr black hole, and how does this behavior compare to normal black hole orbits?",
    "What is the spatial relationship between the protostars and T-Tauri members in the IC 348 star cluster?",
    "What are the main observational findings regarding the X-ray and radio emissions from the Galactic non-thermal radio source G328.4+0.2?",
    "What are the unique advantages of radio astrometry in exoplanet discovery compared to other methods like radial velocity searches, coronagraphy, or optical interferometry?",
    "What is the determined contribution of the donor star in the H waveband in the spectrum of A0620-00?",
    "Which family of Jupiter Trojans in the L4 swarm is dominated by C- and P-type asteroids?",
    "What pattern in Faraday rotation measures (RMs) requires the presence of at least one large-scale magnetic reversal in the fourth Galactic quadrant?"
]

# Pick the question at index 10 and time the full retrieve-and-answer call
question = questions[10]
get_run_time(lambda: question_answering_func(question))


Question: What pattern in Faraday rotation measures (RMs) requires the presence of at least one large-scale magnetic reversal in the fourth Galactic quadrant?


Answer: counter-clockwise

Explanation:
  We present new Faraday rotation measures (RMs) for 148 extragalactic radio
sources behind the southern Galactic plane (253o < l < 356o, |b| < 1.5o), and
use these data in combination with published data to probe the large-scale
structure of the Milky Way's magnetic field. We show that the magnitudes of
these RMs oscillate with longitude in a manner that correlates with the
locations of the Galactic spiral arms. The observed pattern in RMs requries the
presence of at least one large-scale magnetic reversal in the fourth Galactic
quadrant, located between the Sagittarius- Carina and Scutum-Crux spiral arms.
To quantitatively compare our measurements to other recent studies, we consider
all available extragalactic and pulsar RMs in the region we have surveyed, and
jointly fit these data to