# Introduction

The dataset is from the paper: 
Github repository code with the data files, including train and test datasets : 


In [None]:
! pip install anytree

In [None]:
! pip install chromadb sentence-transformers

# Imports

Choose HGR Environment

In [1]:
import pickle

# Dataset Loading

## Train Doc info - Test doc info

In [14]:
with open(r"../../../Data/MMVQA/train_doc_info_github_0910.pkl", 'rb') as file:
    train_docs = pickle.load(file)

In [56]:
with open(r"../../../Data/MMVQA/val_doc_info_github_0910.pkl", 'rb') as file:
    val_docs = pickle.load(file) #Outputs a dictionary

### Structure

Structure of the each value for key

```
train_docs["234434"] = {
  'page_info': {
    0: {
      'objects': {
        0: {'bbox': [...], 'text': '...', 'class': 'title'},
        1: {'bbox': [...], 'text': '...', 'class': 'abstract'},
        ...
      }
    },
    1: {
      'objects': {
        2: {...}, 3: {...}, ...
      }
    },
    ...
  }
}
```

In [26]:
from anytree import Node, RenderTree

def build_tree(doc_id, doc_data):
    root = Node(f"Document {doc_id}")
    page_nodes = {}
    for page_num, page_data in doc_data['page_info'].items():
        page_node = Node(f"Page {page_num}", parent=root)
        page_nodes[page_num] = page_node
        for obj_id, obj_data in page_data['objects'].items():
            text_preview = obj_data['text'][:30].replace('\n', ' ') + "..."
            obj_class = obj_data['class']
            Node(f"Obj {obj_id} [{obj_class}]: {text_preview}", parent=page_node)
    return root

tree = build_tree("PMC6366628", train_docs["PMC6366628"])
for pre, _, node in RenderTree(tree):
    print(f"{pre}{node.name}")


Document PMC6366628
├── Page 0
│   ├── Obj 0 [title]: Alternative Lengthening of Tel...
│   └── Obj 1 [abstract]: Alternative lengthening of tel...
├── Page 1
│   ├── Obj 2 [abstract]: Alternative lengthening of tel...
│   ├── Obj 3 [section]: INTRODUCTION ...
│   └── Obj 4 [paragraph]: The maintenance of telomeres i...
├── Page 2
│   ├── Obj 5 [paragraph]: remains largely unclear. A bet...
│   ├── Obj 6 [paragraph]: One of the hallmarks of ALT is...
│   ├── Obj 7 [paragraph]: In addition to APBs, ALT+ cell...
│   └── Obj 8 [paragraph]: ALT has been long speculated t...
├── Page 3
│   ├── Obj 9 [paragraph]: synthesis at ALT telomeres is ...
│   ├── Obj 10 [paragraph]: In this study, we developed an...
│   ├── Obj 11 [section]: RESULTS ...
│   ├── Obj 12 [section]: An Assay to Monitor DNA Synthe...
│   └── Obj 13 [paragraph]: To understand the process of A...
├── Page 4
│   ├── Obj 14 [paragraph]: established an assay to visual...
│   ├── Obj 15 [paragraph]: The ATSA assay allowed us to

In [23]:
import json

print(json.dumps(train_docs["PMC6366628"], indent=2))


{
  "page_info": {
    "0": {
      "objects": {
        "0": {
          "bbox": [
            90.0,
            164.0,
            486.0,
            127.0
          ],
          "text": "Alternative Lengthening of Telomeres through Two Distinct  Break-Induced Replication Pathways ",
          "class": "title"
        },
        "1": {
          "bbox": [
            107,
            528,
            502,
            363
          ],
          "text": "Alternative lengthening of telomeres (ALT) is a telomerase-independent but recombinationdependent pathway that maintains telomeres. Here, we describe an assay to visualize ALTmediated telomeric DNA synthesis in ALT-associated PML bodies (APBs) without DNAdamaging agents or replication inhibitors. Using this assay, we find that ALT occurs through two  distinct mechanisms. One of the ALT mechanisms requires RAD52, a protein implicated in breakinduced DNA replication (BIR). We demonstrate that RAD52 directly promotes telomeric D-loop  for

In [None]:
train_docs.get('PMC6366628').keys() # There is only one key 'page_info'
train_docs.get('PMC6366628').get('page_info')
train_docs.get('PMC6366628').get('page_info').keys()
train_docs.get('PMC6366628').get('page_info').get(0)

{'objects': {0: {'bbox': [90.0, 164.0, 486.0, 127.0],
   'text': 'Alternative Lengthening of Telomeres through Two Distinct  Break-Induced Replication Pathways ',
   'class': 'title'},
  1: {'bbox': [107, 528, 502, 363],
   'text': 'Alternative lengthening of telomeres (ALT) is a telomerase-independent but recombinationdependent pathway that maintains telomeres. Here, we describe an assay to visualize ALTmediated telomeric DNA synthesis in ALT-associated PML bodies (APBs) without DNAdamaging agents or replication inhibitors. Using this assay, we find that ALT occurs through two  distinct mechanisms. One of the ALT mechanisms requires RAD52, a protein implicated in breakinduced DNA replication (BIR). We demonstrate that RAD52 directly promotes telomeric D-loop  formation in vitro and is required for maintaining telomeres in ALT-positive cells. Unexpectedly,  however, RAD52 is dispensable for C-circle formation, a hallmark of ALT. In RAD52-knockout  ALT cells, C-circle formation and RAD5

In [32]:
train_docs.get('PMC6366628').get('page_info').get(0).get('objects')

{0: {'bbox': [90.0, 164.0, 486.0, 127.0],
  'text': 'Alternative Lengthening of Telomeres through Two Distinct  Break-Induced Replication Pathways ',
  'class': 'title'},
 1: {'bbox': [107, 528, 502, 363],
  'text': 'Alternative lengthening of telomeres (ALT) is a telomerase-independent but recombinationdependent pathway that maintains telomeres. Here, we describe an assay to visualize ALTmediated telomeric DNA synthesis in ALT-associated PML bodies (APBs) without DNAdamaging agents or replication inhibitors. Using this assay, we find that ALT occurs through two  distinct mechanisms. One of the ALT mechanisms requires RAD52, a protein implicated in breakinduced DNA replication (BIR). We demonstrate that RAD52 directly promotes telomeric D-loop  formation in vitro and is required for maintaining telomeres in ALT-positive cells. Unexpectedly,  however, RAD52 is dispensable for C-circle formation, a hallmark of ALT. In RAD52-knockout  ALT cells, C-circle formation and RAD52-independent AL

In [39]:
train_docs.get('PMC6366628').get('page_info').get(0).get('objects')

{0: {'bbox': [90.0, 164.0, 486.0, 127.0],
  'text': 'Alternative Lengthening of Telomeres through Two Distinct  Break-Induced Replication Pathways ',
  'class': 'title'},
 1: {'bbox': [107, 528, 502, 363],
  'text': 'Alternative lengthening of telomeres (ALT) is a telomerase-independent but recombinationdependent pathway that maintains telomeres. Here, we describe an assay to visualize ALTmediated telomeric DNA synthesis in ALT-associated PML bodies (APBs) without DNAdamaging agents or replication inhibitors. Using this assay, we find that ALT occurs through two  distinct mechanisms. One of the ALT mechanisms requires RAD52, a protein implicated in breakinduced DNA replication (BIR). We demonstrate that RAD52 directly promotes telomeric D-loop  formation in vitro and is required for maintaining telomeres in ALT-positive cells. Unexpectedly,  however, RAD52 is dispensable for C-circle formation, a hallmark of ALT. In RAD52-knockout  ALT cells, C-circle formation and RAD52-independent AL

In [71]:
train_docs.get('PMC6366628').get('page_info').get(0).get('objects').get(0)

{'bbox': [90.0, 164.0, 486.0, 127.0],
 'text': 'Alternative Lengthening of Telomeres through Two Distinct  Break-Induced Replication Pathways ',
 'class': 'title'}

In [47]:
for key in train_docs.get('PMC6366628').keys():
    print(key)

page_info


In [None]:
for doc_id, key in train_docs.items():
    for key_val in key.keys():
        if key_val !='page_info':
            print(key_val)
            
for doc_id, key in val_docs.items():
    for key_val in key.keys():
        if key_val !='page_info':
            print(key_val)
            
# there is only on key for all the documents that is 'page_info'
# under page_info key there is only one dictionary, that if 'objects'
# under 'objects' dictionary the keys are 0, 1, .....n 
# under those keys the dictionaries are 'bbox', 'class', 'text'
# under the class the categories are :
"""{'abstract',
 'figure',
 'figure caption',
 'list',
 'paragraph',
 'section',
 'table',
 'table caption',
 'title'}"""

In [87]:
figures = {}

for doc_id, doc in train_docs.items():
    for page_num, page_data in doc["page_info"].items():
        objects = page_data.get("objects", {})
        for obj_id, obj in objects.items():
            if obj.get("class") == "figure":
                if doc_id not in figures:
                    figures[doc_id] = {}
                figures[doc_id][obj_id] = obj  # includes bbox, text, class


In [None]:
figures # respective doc it, object od and the object for figures

{'PMC6366628': {78: {'bbox': [185, 61, 483, 493],
   'text': '',
   'class': 'figure'},
  80: {'bbox': [186, 60, 483, 494], 'text': '', 'class': 'figure'},
  82: {'bbox': [196, 63, 474, 493], 'text': '', 'class': 'figure'},
  84: {'bbox': [190, 60, 481, 495], 'text': '', 'class': 'figure'},
  86: {'bbox': [128, 61, 542, 495], 'text': '', 'class': 'figure'},
  88: {'bbox': [120, 61, 555, 385], 'text': '', 'class': 'figure'},
  90: {'bbox': [178, 62, 490, 493], 'text': '', 'class': 'figure'}},
 'PMC6974665': {26: {'bbox': [192, 500, 552, 702],
   'text': '',
   'class': 'figure'},
  40: {'bbox': [191, 88, 550, 258], 'text': '', 'class': 'figure'},
  48: {'bbox': [42, 88, 549, 276], 'text': '', 'class': 'figure'},
  54: {'bbox': [192, 88, 551, 277], 'text': '', 'class': 'figure'}},
 'PMC6353764': {23: {'bbox': [103, 77, 497, 384],
   'text': '',
   'class': 'figure'},
  31: {'bbox': [52, 438, 278, 614], 'text': '', 'class': 'figure'},
  72: {'bbox': [92, 68, 489, 375], 'text': '', 'class'

In [90]:
set(figures)

{'PMC6348514',
 'PMC6613398',
 'PMC9997075',
 'PMC9968548',
 'PMC9330100',
 'PMC9073893',
 'PMC6137322',
 'PMC2606939',
 'PMC7584054',
 'PMC8365320',
 'PMC6909274',
 'PMC7061158',
 'PMC8411060',
 'PMC6351431',
 'PMC6140486',
 'PMC7555572',
 'PMC6930706',
 'PMC4405942',
 'PMC6711115',
 'PMC7391895',
 'PMC7750813',
 'PMC7724518',
 'PMC6916416',
 'PMC7851460',
 'PMC6029850',
 'PMC6504999',
 'PMC7797250',
 'PMC9058671',
 'PMC4134506',
 'PMC9999093',
 'PMC7294844',
 'PMC5754864',
 'PMC7413733',
 'PMC5381255',
 'PMC7078568',
 'PMC6366481',
 'PMC3507038',
 'PMC8524316',
 'PMC7521958',
 'PMC9071406',
 'PMC9935049',
 'PMC6733775',
 'PMC7607087',
 'PMC6720536',
 'PMC8823907',
 'PMC7104257',
 'PMC8785110',
 'PMC4769425',
 'PMC6374573',
 'PMC5574444',
 'PMC5598325',
 'PMC8664647',
 'PMC7757747',
 'PMC10092164',
 'PMC6289212',
 'PMC6309152',
 'PMC4710825',
 'PMC3498666',
 'PMC2900026',
 'PMC6844267',
 'PMC9242090',
 'PMC6131382',
 'PMC6512556',
 'PMC9867920',
 'PMC7412872',
 'PMC7895514',
 'PMC6850

In [12]:
type(train_df.keys())

dict_keys

In [15]:
doc_lengths = [len(doc) for doc in train_docs.values()]
print("Min:", min(doc_lengths), "Max:", max(doc_lengths), "Average:", sum(doc_lengths)/len(doc_lengths))


Min: 1 Max: 1 Average: 1.0


In [7]:
data.get("PMC6449318").get("page_info").get(1)

AttributeError: 'NoneType' object has no attribute 'get'

## QA pairs - train - Evaluation

In [1]:
import pandas as pd

In [2]:
"Data\MMVQA\mmvqa_qa_pairs_val_github.csv"
df = pd.read_csv("../../../Data/MMVQA/mmvqa_qa_pairs_val_github.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,question,answer_objt_id,document_id,page_rage,context
0,0,0,What are some of the challenging conditions th...,[6],PMC6436297,"(0, 1)",One of the most important stages of life is ad...
1,1,1,Why are psychiatric disorders during adolescen...,[6],PMC6436297,"(0, 1)",One of the most important stages of life is ad...
2,2,2,What was the prevalence rate for mental disord...,[7],PMC6436297,"(0, 1)",One of the most important stages of life is ad...
3,3,3,What percentage of secondary school boys in Sa...,[7],PMC6436297,"(0, 1)",One of the most important stages of life is ad...
4,4,4,What percentage of adult sufferers of anxiety ...,"[8, 9]",PMC6436297,"(0, 2)",One of the most important stages of life is ad...


# Vector Store

## Chroma DB

In [33]:
# ! pip install chromadb sentence-transformers

Collecting chromadb
  Using cached chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting build>=1.0.3 (from chromadb)
  Using cached build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting pydantic>=1.9 (from chromadb)
  Using cached pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Using cached chroma_hnswlib-0.7.6-cp39-cp39-win_amd64.whl.metadata (262 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Using cached fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Using cached uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.21.0-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Using cached onnxruntime-1.19.2-cp39-cp39-win_amd64.whl.me

In [35]:
import chromadb
from sentence_transformers import SentenceTransformer
import uuid

In [None]:
# Step 4: Initialize ChromaDB (in-memory or persistent)
import chromadb.config
from chromadb.config import Settings

client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet", persist_directory="./chroma_store"))

collection = client.get_or_create_collection(name="pdf_docs")

In [None]:
# Step 1: Load your train_docs dictionary (already done)
# Example: train_docs = pickle.load(open("train_doc_info.pkl", "rb"))

# Step 2: Flatten the structure
flat_chunks = []
for doc_id, doc in train_docs.items():
    for page_num, page in doc["page_info"].items():
        for obj_id, obj in page["objects"].items():
            flat_chunks.append({
                "id": str(uuid.uuid4()),  # unique ID for ChromaDB
                "text": obj["text"],
                "metadata": {
                    "doc_id": doc_id,
                    "page": int(page_num),
                    "object_id": int(obj_id),
                    "class": obj["class"]
                }
            })

# Step 3: Use SentenceTransformer (CPU will be used by default)
model = SentenceTransformer("all-MiniLM-L6-v2")  # small + fast
texts = [chunk["text"] for chunk in flat_chunks]
embeddings = model.encode(texts, show_progress_bar=True)



# Step 5: Add to vector store
collection.add(
    documents=texts,
    embeddings=embeddings.tolist(),
    metadatas=[chunk["metadata"] for chunk in flat_chunks],
    ids=[chunk["id"] for chunk in flat_chunks]
)


In [38]:
query = "What is alternative lengthening of telomeres?"
query_vector = model.encode([query])[0]

results = collection.query(
    query_embeddings=[query_vector],
    n_results=5
)

for i, doc in enumerate(results["documents"][0]):
    print(f"\nResult {i+1}")
    print("Text:", doc[:300], "...")
    print("Metadata:", results["metadatas"][0][i])


NameError: name 'collection' is not defined

In [None]:
client = chromadb.Client(Settings(persist_directory="./chroma_store"))
collection = client.get_collection("pdf_docs")