In [1]:
!pip -q install kagglehub
!pip -q install transformers
!pip -q install datasets
!pip install bert_score
!pip install evaluate

In [2]:
import kagglehub
import os
from datasets import load_dataset
import pandas as pd

# Step 1: Download the ArXiv dataset using kagglehub.
# The value returned by `dataset_download` is kept in `path` and is treated
# below as a directory to walk.
print("Downloading ArXiv dataset from Kaggle...")
path = kagglehub.dataset_download("Cornell-University/arxiv")
print(f"Dataset downloaded to: {path}")

# List files in the downloaded directory to confirm what we have.
# os.walk recurses into sub-directories, so nested files are listed as well.
print("\nFiles in the downloaded directory:")
for root, dirs, files in os.walk(path):
    for file in files:
        # Print the full path of every file, one per line.
        print(f" - {os.path.join(root, file)}")

Downloading ArXiv dataset from Kaggle...
Dataset downloaded to: /kaggle/input/arxiv

Files in the downloaded directory:
 - /kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json


In [3]:
import json
import os
from typing import Optional


def transform_paper_to_source(paper: dict) -> Optional[dict]:
    """Convert one paper metadata record into the source format.

    Args:
        paper: A single parsed metadata record. Only the ``abstract``,
            ``authors``, ``title`` and ``update_date`` keys are read.

    Returns:
        ``{'text': <stripped abstract>, 'metadata': {'authors', 'title',
        'update_date'}}``, or ``None`` when the record has no usable abstract
        (key missing, value ``None`` or not a string, or whitespace only).
    """
    raw_abstract = paper.get('abstract')
    # `.get('abstract', '')` only covers a *missing* key; a JSON `null` value
    # arrives as None and would crash on `.strip()`, so check the type first.
    abstract = raw_abstract.strip() if isinstance(raw_abstract, str) else ''
    if not abstract:
        return None
    return {
        'text': abstract,
        'metadata': {
            'authors': paper.get('authors', []),
            'title': paper.get('title', ''),
            'update_date': paper.get('update_date', '')
        }
    }


def create_demo_jsonl(input_path: str, demo_output_path: str, demo_count: int = 20) -> None:
    """
    Stream through the input JSONL and write only the first `demo_count` valid
    sources to a new JSONL file.

    Args:
        input_path: Path of the JSONL file to read (one JSON object per line).
        demo_output_path: Path of the JSONL file to create; an existing file is
            overwritten.
        demo_count: Maximum number of sources to write.

    The input is read line by line, so the whole file is never held in memory.
    Records for which `transform_paper_to_source` returns None are skipped and
    do not count towards `demo_count`. Blank lines are ignored.
    """
    demo_written = 0

    with open(input_path, 'r', encoding='utf-8') as infile, \
         open(demo_output_path, 'w', encoding='utf-8') as outfile_demo:

        for line in infile:
            if demo_written >= demo_count:
                break

            # Tolerate blank / whitespace-only lines instead of letting
            # json.loads raise on them.
            if not line.strip():
                continue

            paper = json.loads(line)
            source = transform_paper_to_source(paper)
            if source is None:
                continue

            # ensure_ascii=False keeps non-ASCII characters readable in the output.
            outfile_demo.write(json.dumps(source, ensure_ascii=False) + '\n')
            demo_written += 1

    print(f"Saved {demo_written} demo sources to {demo_output_path}")


# Build the 20-source demo corpus from the downloaded metadata snapshot.
# In a notebook kernel `__name__` is '__main__', so this runs when the cell runs.
if __name__ == '__main__':
    # Absolute path of the snapshot file listed by the download cell.
    # expanduser leaves a path without a leading '~' unchanged.
    input_path = os.path.expanduser(
        '/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json'
    )
    # Written relative to the current working directory.
    demo_output_path = 'arxiv_20_sources.jsonl'

    # Uses the default demo_count of create_demo_jsonl.
    create_demo_jsonl(input_path, demo_output_path)


Saved 20 demo sources to arxiv_20_sources.jsonl


In [4]:
import json
import os
from typing import List, Dict

def load_sources_jsonl(input_path: str) -> List[Dict]:
    """Load every record of a JSONL file into a list.

    Args:
        input_path: Path of a UTF-8 JSONL file (one JSON value per line).

    Returns:
        The parsed records, in file order. Blank or whitespace-only lines are
        skipped rather than passed to ``json.loads`` (which would raise).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

# Load the demo corpus written by create_demo_jsonl; the file name must match
# the `demo_output_path` used there ('arxiv_20_sources.jsonl').
corpus = "arxiv_20_sources.jsonl"
sources = load_sources_jsonl(corpus)
# Sanity check: report how many sources were read.
print(f"Loaded {len(sources)} sources")

Loaded 20 sources


In [5]:
## Some preparation steps -- if in google colab
!mkdir logs
import sys
# Clone the repository
!rm -rf Pleias-RAG-Library/
!git clone --quiet https://github.com/Pleias/Pleias-RAG-Library

# Install the cloned package in development mode
%cd Pleias-RAG-Library
!pip install -e . -q

%cd ..
sys.path.append('/content/Pleias-RAG-Library')

/content/Pleias-RAG-Library
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.1/294.1 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.4/98.4 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.0/111.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m74.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m100.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31

In [6]:
import dataset_processing as data
import handle_models as models
import importlib
import logging

# Clear existing handlers (Colab/Jupyter specific).
# Iterate over a copy (`[:]`) because removeHandler mutates the handler list.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

# Configure logging: with the root handlers removed above, basicConfig installs
# a fresh default handler at INFO level.
logging.basicConfig(level=logging.INFO)

# Re-import the local helper modules so edits made to their files are picked up
# without restarting the kernel.
importlib.reload(models)
# Last expression of the cell: the reloaded module object is shown as output.
importlib.reload(data)

<module 'dataset_processing' from '/content/dataset_processing.py'>

In [51]:
import json
import os
import time
import pickle
import logging
from typing import List, Dict, Optional, Tuple
import torch
from transformers import pipeline, Pipeline, AutoTokenizer
from pleias_rag_interface import RAGWithCitations

from typing import TypedDict, Any, Optional

logger = logging.getLogger(__name__)

class SingleResult(TypedDict, total=False):
    """Typed description of a single model-query result dict.

    ``total=False`` makes every key optional.
    NOTE(review): presumably describes the dicts returned by the query helpers
    (same key names); it is not referenced elsewhere in the visible code.
    """
    # Raw model output; type left open (Any).
    response: Any
    # Duration of the call -- presumably seconds; confirm against the producers.
    time: float
    # Error message text when the call failed.
    error: str

def load_qa_models(
    rag_model_name: str = "PleIAs/Pleias-RAG-350M",
    t5_model_name: str = "google/flan-t5-large",
    t5_task: str = "text2text-generation",
    device: int = 1,
    torch_dtype=torch.float16
) -> Tuple[Optional[RAGWithCitations], Optional[Pipeline]]:
    """Load the RAG-with-citations model and a T5 generation pipeline.

    Both loads are best-effort and independent: a failure is logged and the
    corresponding slot is returned as ``None`` instead of raising.

    Args:
        rag_model_name: Model path or hub name passed to ``RAGWithCitations``.
        t5_model_name: Hub name used for both the T5 tokenizer and the pipeline.
        t5_task: Task name passed to ``transformers.pipeline``.
        device: Device argument forwarded unchanged to ``pipeline``.
            NOTE(review): the default ``1`` looks like "second accelerator";
            confirm this is intended for single-GPU / CPU-only machines.
        torch_dtype: dtype forwarded unchanged to ``pipeline``.

    Returns:
        ``(rag, t5_pipeline)``; either element is ``None`` if its load failed.
    """
    rag = None
    t5_ppl = None

    # Load RAG model
    try:
        rag = RAGWithCitations(model_path_or_name=rag_model_name)
        print("-------RAG Loaded correctly-------")
        logger.info(f"Successfully loaded RAG model '{rag_model_name}'")
    except Exception as e:
        logger.error(f"Failed to load RAG model '{rag_model_name}': {e}")

    logger.info("-----------------------------------------------------------------")

    # Load T5 pipeline (instruction-tuned, deterministic beam search)
    try:
        tokenizer = AutoTokenizer.from_pretrained(t5_model_name)
        if tokenizer.pad_token_id is None:
            # add_special_tokens expects the token *string*; the previous code
            # passed the integer eos_token_id.
            tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

        t5_ppl = pipeline(
            task=t5_task,
            model=t5_model_name,
            # Hand over the tokenizer prepared above so the pad-token fallback
            # is actually used by the pipeline.
            tokenizer=tokenizer,
            torch_dtype=torch_dtype,
            device=device,
            # beam search, no sampling
            do_sample=False,
            num_beams=4,
            # Use the tokenizer's own pad token; it only equals eos when the
            # fallback above had to be applied.
            pad_token_id=tokenizer.pad_token_id
        )
        print("--------T5 Loaded correctly---------")
        logger.info(f"Loaded instruction-tuned T5 '{t5_model_name}' with beam search")
    except Exception as e:
        logger.error(f"Failed to load T5 pipeline '{t5_model_name}': {e}")
        t5_ppl = None

    return rag, t5_ppl

# Load both models when the cell is executed (`__name__` is "__main__" in a
# notebook kernel).
if __name__ == "__main__":
    # Configure logging at INFO level so the loader's logger.info messages show.
    logging.basicConfig(level=logging.INFO)

    # Default model names/device are used; either value may be None if its
    # load failed (load_qa_models logs the failure instead of raising).
    rag, t5 = load_qa_models()
    # model_check(rag, t5)

CUDA available: False
Loading model with transformers from PleIAs/Pleias-RAG-350M...


INFO:__main__:Successfully loaded RAG model 'PleIAs/Pleias-RAG-350M'
INFO:__main__:-----------------------------------------------------------------


Model loaded successfully with transformers
-------RAG Loaded correctly-------


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu
INFO:__main__:Loaded instruction-tuned T5 'google/flan-t5-large' with sampling


--------T5 Loaded correctly---------


In [56]:
import time
import logging
from typing import List, Dict, Any, Optional
from transformers import Pipeline
from pleias_rag_interface import RAGWithCitations

logger = logging.getLogger(__name__)

def query_rag(
    query: str,
    sources: List[Dict],
    rag: RAGWithCitations
    ) -> Dict[str, Any]:
    """
    Run only the RAG-with-citations model.

    Args:
        query: The question to answer.
        sources: Source dicts passed unchanged to ``rag.generate``.
        rag: Loaded model exposing ``generate(query, sources)``.

    Returns:
        ``{'response': <generate() result>, 'time': <elapsed seconds>}`` on
        success, or ``{'response': None, 'time': None, 'error': <message>}`` if
        generation raised. Exceptions are logged, never propagated.
    """
    try:
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments (unlike time.time()).
        start = time.perf_counter()
        rag_resp = rag.generate(query, sources)
        elapsed = time.perf_counter() - start
    except Exception as e:
        logger.error(f"Error querying RAG model: {e}")
        return {
            'response': None,
            'time': None,
            'error': str(e)
        }
    return {
        'response': rag_resp,
        'time': elapsed
    }

def query_t5(
    query: str,
    t5_ppl: Pipeline,
    prefix: str = "Answer the following question:",
    max_length: int = 200
) -> Dict[str, Any]:
    """
    Pure LLM call -- no retrieval context.

    Args:
        query: The question to answer.
        t5_ppl: Callable pipeline; expected to return a list whose first item
            is a dict with a ``"generated_text"`` entry.
        prefix: Instruction placed before the question in the prompt.
        max_length: Forwarded to the pipeline call as ``max_length``
            (default 200, the previously hard-coded value).

    Returns:
      {
        'response': <str generated_text or None>,
        'time':    <float seconds, or None on error>,
        'error':   <str, only present if an error occurred>
      }
    """
    try:
        prompt = f"{prefix}\n\nQuestion: {query}\nAnswer:"
        # perf_counter is monotonic, so the duration is unaffected by system
        # clock adjustments (unlike time.time()).
        start = time.perf_counter()
        out = t5_ppl(prompt, max_length=max_length)     # returns [ { "generated_text": ... } ]
        elapsed = time.perf_counter() - start

        gen = out[0].get("generated_text", "").strip()
        return {'response': gen, 'time': elapsed}
    except Exception as e:
        logger.error(f"T5 generation error: {e}")
        return {'response': None, 'time': None, 'error': str(e)}

In [53]:
# Question sent to both models in the cells below. The commented-out lines are
# alternative test questions; swap one in to try it.
# query = "Does the dark matter field fluid model agree very well with the current behavior of the Earth–Moon system?"
query = "Is the evolution of Earth-Moon system described by the dark matter field?"
# query = "What is the capital of France?"

In [58]:
#  ——— RAG with citations ——————————————————————
# Ask the RAG model and show either the error or the cleaned answer, its
# citations and the elapsed time. (The leftover debug print was removed and the
# citations line is now labelled as such.)
rag_result = query_rag(query, sources, rag)
if rag_result.get('error'):
    print("RAG error:", rag_result['error'])
else:
    print("RAG answer:", rag_result['response']['processed']['clean_answer'])
    print("\n\n")
    print("RAG citations:", rag_result['response']['processed']['citations'])
    print(f"(took {rag_result['time']:.2f}s)")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


aaa


RAG answer: The evolution of Earth-Moon system presents an intriguing case study in astrophysics, particularly regarding dark matter. Here's what we know about its current understanding:

The Earth-Moon system has been studied through a detailed analysis of the dark matter field, which has been compared against geological and fossil evidence[1].

The dark matter component plays a crucial role in this evolution. The dark matter field has been identified as a significant factor in the system's evolution, with its contribution being substantial. This is particularly evident in the case of the $k,£\alpha$ norm, where the dark matter field has been shown to be approximately $k,£\alpha$[2].

The dark matter component has been extensively studied, with researchers working on developing methods to calculate these parameters. These calculations have revealed that the dark matter field field has a specific behavior that differs from the general pattern observed in the Earth-Moon system[3].

In [59]:
#  ——— Pure T5 LLM ——————————————————————
# Ask the same question without any retrieval context, for comparison with the
# RAG answer.
t5_out = query_t5(query, t5)
# query_t5 only sets 'error' when generation failed.
if t5_out.get('error'):
    print("T5 error:", t5_out['error'])
else:
    print("T5 answer:", t5_out['response'])
    print(f"(took {t5_out['time']:.2f}s)")

T5 answer: no
(took 12.54s)


In [32]:
# import json
# from evaluate import load

# with open("eval_data.json", "r", encoding="utf-8") as f:
#     eval_data = json.load(f)

# # 2) generate answers
# results = []
# for item in eval_data:
#     q = item["query"]
#     # rag_out = query_rag(q, sources, rag)
#     t5_out = query_t5(q, t5)
#     # rag_ans = rag_out["response"]["processed"]["answer"] if rag_out.get("response") else ""
#     t5_ans = t5_out["response"] or ""
#     results.append((item["reference"], t5_ans))

# refs, t5_preds = zip(*results)

# # 4) compute BERTScore
# bertscore = load("bertscore")
# # b_rag = bertscore.compute(predictions=rag_preds, references=refs, lang="en")
# b_t5  = bertscore.compute(predictions=t5_preds, references=refs, lang="en")

# print("BERTScore F1: RAG =",
#       "T5 =", sum(b_t5["f1"])/len(b_t5["f1"]))


INFO:absl:Using default tokenizer.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ROUGE-L: RAG = T5 = 0.0
BERTScore F1: RAG = T5 = 0.8249756097793579
