# ArXiv retrieval

In [None]:
!pip install arxiv

Collecting arxiv
  Downloading arxiv-2.1.3-py3-none-any.whl.metadata (6.1 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading arxiv-2.1.3-py3-none-any.whl (11 kB)
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6047 sha256=390ed0ed0e3a6879e6fcbfea2ee54e687ca26628d52ab8a3b22872784a05a10f
  Stored in directory: /root/.cache/pip/wheels/f0/69/93/a47e9d621be168e9e33c7ce60524393c0b92ae83cf6c6e89c5
Successfully built sgmllib3k
Installing collected packag

In [None]:
!pip install arxiv pandas scikit-learn

import arxiv
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

query = '("Multimodal Large Language Model*" OR MLLM* OR MM-LLM* OR "Information Fusion" OR "Multimodal Learn*" OR "Joint Learn*" OR "Cross Learn*") AND (Healthcare OR Medicine OR Health)'

def fetch_all_arxiv_results(query, total_results=800, results_per_page=100):
    """Fetch up to ``total_results`` arXiv records matching ``query``.

    A single ``arxiv.Search`` capped at ``total_results`` is handed to an
    ``arxiv.Client``, which pages through the API ``results_per_page`` records
    at a time. Issuing the same un-offset search once per page would return
    the first page over and over and fill the list with duplicates.

    Args:
        query: arXiv API search expression.
        total_results: Maximum number of records to collect.
        results_per_page: Page size the client requests per API call.

    Returns:
        list[dict]: One dict per paper with the keys ``Database``, ``DOI``,
        ``Title``, ``Abstract``, ``Year`` and ``Authors``. Records collected
        before an error are still returned.
    """
    all_results = []
    client = arxiv.Client(page_size=results_per_page)
    search = arxiv.Search(
        query=query,
        max_results=total_results,
        sort_by=arxiv.SortCriterion.Relevance
    )

    try:
        # Client.results() replaces the deprecated Search.results() and
        # handles the paging offsets internally.
        for result in client.results(search):
            all_results.append({
                "Database": "arXiv",
                # Last path segment of the entry URL, i.e. the arXiv
                # identifier with version suffix (e.g. 2412.14660v1).
                "DOI": result.entry_id.split('/')[-1],
                "Title": result.title,
                "Abstract": result.summary,
                # Year of the most recent update, not of first submission.
                "Year": result.updated.year,
                "Authors": ", ".join([author.name for author in result.authors])
            })
    except Exception as e:
        # Best effort: report where the stream failed and keep what we have.
        print(f"Error fetching results from start={len(all_results)}: {e}")

    return all_results

def extract_keywords(texts, num_keywords=5):
    """Return the vocabulary a TF-IDF vectorizer keeps for ``texts``.

    Args:
        texts: Iterable of documents (strings) to fit the vectorizer on.
        num_keywords: Passed as ``max_features``, capping the vocabulary size.

    Returns:
        The feature names reported by the fitted vectorizer, with English
        stop words excluded.
    """
    tfidf = TfidfVectorizer(max_features=num_keywords, stop_words='english')
    # Only the fitted vocabulary is needed; the TF-IDF matrix is discarded.
    tfidf.fit_transform(texts)
    return tfidf.get_feature_names_out()


# Run the arXiv search, tag every paper with keywords, and save the table.
print("Fetching results for Multimodal papers in Healthcare from arXiv...")
results = fetch_all_arxiv_results(query, total_results=800, results_per_page=100)

if results:
    df = pd.DataFrame(results)

    # Keywords are mined from title and abstract together; the combined text
    # is only a temporary input and is not kept as a column.
    title_and_abstract = df["Title"] + " " + df["Abstract"]
    df["Keywords"] = [", ".join(extract_keywords([text])) for text in title_and_abstract]

    csv_filename = "multimodal_healthcare_complete_results.csv"
    df.to_csv(csv_filename, index=False)
    print(f"Results saved to {csv_filename}")

    print(df.head())
else:
    print("No results found. Please check the query syntax or try different keywords.")


Collecting arxiv
  Downloading arxiv-2.1.3-py3-none-any.whl.metadata (6.1 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading arxiv-2.1.3-py3-none-any.whl (11 kB)
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6047 sha256=23db6e8e3a1bfb63f7304cc75d5e6982d7834950587114c1dd75ee0d732b352c
  Stored in directory: /root/.cache/pip/wheels/f0/69/93/a47e9d621be168e9e33c7ce60524393c0b92ae83cf6c6e89c5
Successfully built sgmllib3k
Installing collected packag

  for result in search.results():


Results saved to multimodal_healthcare_complete_results.csv
  Database           DOI                                              Title  \
0    arXiv  2412.14660v1  Unveiling Uncertainty: A Deep Dive into Calibr...   
1    arXiv  2405.08603v2  A Comprehensive Survey of Large Language Model...   
2    arXiv  2412.00103v1  MLLM-Search: A Zero-Shot Approach to Finding P...   
3    arXiv  2410.01812v5  From Text to Multimodality: Exploring the Evol...   
4    arXiv  2406.19280v4  HuatuoGPT-Vision, Towards Injecting Medical Vi...   

                                            Abstract  Year  \
0  Multimodal large language models (MLLMs) combi...  2024   
1  Since the release of ChatGPT and GPT-4, large ...  2024   
2  Robotic search of people in human-centered env...  2024   
3  Large Language Models (LLMs) have rapidly evol...  2024   
4  The rapid development of multimodal large lang...  2024   

                                             Authors  \
0  Zijun Chen, Wenbo Hu, Guande He, 

In [None]:
!pip install arxiv pandas scikit-learn

import arxiv
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

query = '("Multimodal" OR "Multi-modal" OR "Information Fusion" OR "Multimodal Learning") AND (Healthcare OR Medicine OR Health)'

def fetch_all_arxiv_results(query, results_per_page=100, max_results=None):
    """Fetch every arXiv record matching ``query`` (or the first ``max_results``).

    One ``arxiv.Search`` is streamed through an ``arxiv.Client``, which pages
    through the API and stops when the result set is exhausted. Re-running an
    identical, un-offset search inside ``while True`` never yields an empty
    page, so such a loop cannot terminate.

    Args:
        query: arXiv API search expression.
        results_per_page: Page size the client requests per API call.
        max_results: Optional cap on the number of records; ``None`` means
            every record the API returns for the query.

    Returns:
        list[dict]: One dict per paper with the keys ``Database``, ``DOI``,
        ``Title``, ``Abstract``, ``Year`` and ``Authors``. Records collected
        before an error are still returned.
    """
    all_results = []
    client = arxiv.Client(page_size=results_per_page)
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )

    try:
        # Client.results() replaces the deprecated Search.results() and
        # advances the paging offset itself.
        for result in client.results(search):
            all_results.append({
                "Database": "arXiv",
                # Last path segment of the entry URL (arXiv id with version).
                "DOI": result.entry_id.split('/')[-1],
                "Title": result.title,
                "Abstract": result.summary,
                # Year of the most recent update, not of first submission.
                "Year": result.updated.year,
                "Authors": ", ".join([author.name for author in result.authors])
            })
    except Exception as e:
        # Best effort: keep whatever was collected before the failure.
        print(f"Error fetching results: {e}")

    return all_results

def extract_keywords(texts, num_keywords=5):
    """Fit a TF-IDF vectorizer on ``texts`` and return its retained terms.

    Args:
        texts: Iterable of documents (strings).
        num_keywords: Upper bound on the vocabulary size (``max_features``).

    Returns:
        The vectorizer's feature names; English stop words are filtered out.
    """
    options = {"stop_words": 'english', "max_features": num_keywords}
    fitted = TfidfVectorizer(**options)
    fitted.fit_transform(texts)  # the resulting matrix is not needed
    return fitted.get_feature_names_out()

# Fetch the broader "multimodal" result set, add keywords, and write it to CSV.
print("Fetching all results for Multimodal papers in Healthcare from arXiv...")
results = fetch_all_arxiv_results(query)

if not results:
    print("No results found for the given query.")
else:
    df = pd.DataFrame(results)

    # Title and abstract are joined only to feed the keyword extractor;
    # the joined text is not stored in the frame.
    keyword_source = df["Title"] + " " + df["Abstract"]
    df["Keywords"] = keyword_source.map(lambda text: ", ".join(extract_keywords([text])))

    csv_filename = "arxiv_all_multimodal_healthcare_results_with_keywords.csv"
    df.to_csv(csv_filename, index=False)
    print(f"All papers saved to {csv_filename}")
    print(df.head())

Fetching all results for Multimodal papers in Healthcare from arXiv...


  page_results = list(search.results())


# PubMed retrieval

In [None]:
!pip install metapub pandas tqdm


import pandas as pd
from metapub import PubMedFetcher
from tqdm import tqdm

def pubmed_search(search, n_articles=1000):
    """Run a PubMed query and return the matching articles as a DataFrame.

    Articles are fetched one PMID at a time. A PMID whose fetch fails is
    reported and skipped, so one bad record does not discard everything
    retrieved so far.

    Args:
        search: PubMed query string.
        n_articles: Maximum number of PMIDs to request (passed as ``retmax``).

    Returns:
        pandas.DataFrame: Columns ``doi``, ``title``, ``abstract``, ``year``
        and ``authors``, one row per successfully fetched article. The columns
        are present even when no article was retrieved.
    """
    columns = ['doi', 'title', 'abstract', 'year', 'authors']

    fetch = PubMedFetcher()
    pmids = fetch.pmids_for_query(search, retmax=n_articles)

    # Keyed by PMID so a repeated PMID keeps a single (latest) record.
    records = {}

    # Loop through PMIDs with a progress bar
    for pmid in tqdm(pmids, desc="Fetching PubMed articles"):
        if pmid is None:
            print(f"Couldn't retrieve pmid {pmid}")
            continue

        try:
            article = fetch.article_by_pmid(pmid)
        except Exception as e:
            # Skip this record only; the rest of the batch is still useful.
            print(f"Couldn't retrieve pmid {pmid}: {e}")
            continue

        records[pmid] = {
            'doi': article.doi,
            'title': article.title,
            'abstract': article.abstract,
            'year': article.year,
            'authors': article.authors,
        }

    return pd.DataFrame(list(records.values()), columns=columns)

In [None]:
# Search expression shared by the PubMed, IEEE Xplore and Web of Science cells:
# multimodal / fusion-learning terms AND healthcare terms.
# The triple-quoted literal keeps its leading and trailing newline.
QUERY = """
("Multimodal Large Language Model*" OR MLLM* OR MM-LLM* OR "Information Fusion" OR "Multimodal Learn*" OR "Joint Learn*" OR "Cross Learn*") AND (Healthcare OR Medicine OR Health)
"""

# Fetch the matching PubMed records into a DataFrame (default cap: 1000 PMIDs).
pubmed = pubmed_search(QUERY)

# Scopus retrieval

In [None]:
!pip install pybliometrics pandas

import pybliometrics
from pybliometrics.scopus import ScopusSearch
import pandas as pd


def scopus_search(query, verbose=True, subscriber=False, api_key=None):
  """Run a Scopus search and return the results in the notebook's column layout.

  The API key is read from the ``api_key`` argument or the ``SCOPUS_API_KEY``
  environment variable, so no credential is stored in the notebook.

  Args:
    query: Scopus search expression.
    verbose: Forwarded to ``ScopusSearch``.
    subscriber: Forwarded to ``ScopusSearch``. Non-subscriber searches
      cannot retrieve more than 5000 papers.
    api_key: Scopus API key; falls back to ``SCOPUS_API_KEY`` when omitted.

  Returns:
    pandas.DataFrame: Columns ``Doi``, ``Title``, ``Abstract``, ``Year``,
    ``Authors``, ``Keywords`` and ``Database``. The frame is empty, but
    keeps these columns, when the search returns nothing.

  Raises:
    RuntimeError: If no API key is supplied by either route.
  """
  import os  # local import: the cell's import block does not provide os

  # Scopus field name -> column name used in this notebook.
  column_map = {
    "doi": "Doi",
    "title": "Title",
    "description": "Abstract",
    "coverDisplayDate": "Year",
    "author_names": "Authors",
    "authkeywords": "Keywords",
  }

  api_key = api_key or os.environ.get("SCOPUS_API_KEY")
  if not api_key:
    raise RuntimeError(
      "No Scopus API key: pass api_key=... or set the SCOPUS_API_KEY environment variable."
    )

  # `keys` is documented as a list of keys, not a bare string.
  pybliometrics.scopus.init(keys=[api_key])

  r = ScopusSearch(query, verbose=verbose, subscriber=subscriber)
  df = pd.DataFrame(r.results)
  if df.empty:
    # Nothing to select from; return the expected layout instead of
    # raising KeyError on the column selection below.
    return pd.DataFrame(columns=[*column_map.values(), "Database"])

  df = df[list(column_map)].rename(columns=column_map)
  # coverDisplayDate is free text such as "1 January 2025"; keep the last token.
  df["Year"] = df["Year"].str.split(" ").str[-1]
  df["Database"] = "Scopus"

  return df

In [None]:
# Scopus search expression: the shared term list plus a PUBYEAR = 2025 filter.
QUERY1 = """
("Multimodal Large Language Model*" OR MLLM* OR MM-LLM* OR "Information Fusion" OR "Multimodal Learn*" OR "Joint Learn*" OR "Cross Learn*") AND (Healthcare OR Medicine OR Health) AND PUBYEAR = 2025
""" # restricted to 2025 only: per the author's note, 2024 alone has over 16k papers, which exceeds the non-subscriber limits

# Run the search and keep the result table for later use.
scopus = scopus_search(QUERY1)

# IEEE Xplore retrieval

In [None]:
!pip install ieeexplore-api

import pandas as pd
from ieeexplore_api import IEEExploreAPI

def ieee_search(query, api_key=None):
    """Search IEEE Xplore and return the records as a DataFrame.

    The API key is read from the ``api_key`` argument or the ``IEEE_API_KEY``
    environment variable, so no credential is stored in the notebook.

    Args:
        query: Search expression passed to the API client.
        api_key: IEEE Xplore API key; falls back to ``IEEE_API_KEY``.

    Returns:
        pandas.DataFrame with the columns ``Database``, ``DOI``, ``Title``,
        ``Abstract``, ``Year``, ``Authors`` and ``Keywords``; or ``None`` when
        the search returns nothing or any error occurs (the error is printed).
    """
    import os  # local import: the cell's import block does not provide os

    try:
        api_key = api_key or os.environ.get("IEEE_API_KEY")
        if not api_key:
            raise RuntimeError(
                "no API key: pass api_key=... or set the IEEE_API_KEY environment variable"
            )

        api = IEEExploreAPI(api_key)
        results = api.search(query)

        if not results:
            print("No results found for the query.")
            return None

        data = []
        for record in results['records']:
            # `or []` also covers fields that are present but set to None,
            # which str.join would reject.
            authors = ', '.join(record.get('authors') or [])
            keywords = ', '.join(record.get('keywords') or [])
            data.append({
                'Database': 'IEEE Xplore',
                'DOI': record.get('doi'),
                'Title': record.get('title'),
                'Abstract': record.get('abstract'),
                'Year': record.get('publication_year'),
                'Authors': authors,
                'Keywords': keywords,
            })

        return pd.DataFrame(data)
    except Exception as e:
        print(f"Error during IEEE Xplore search: {e}")
        return None

In [None]:
# Reuses QUERY, the search expression defined in the PubMed cell.
# ieee_search returns None when the search fails or finds nothing.
ieee = ieee_search(QUERY)

# Web of Science (WoS) retrieval

In [None]:
!pip install wosplus

from wosplus import WosClient
import pandas as pd

def wos_search(query):
    """Query Web of Science and tabulate the returned records.

    Args:
        query: Search expression handed to the client's ``query`` method.

    Returns:
        pandas.DataFrame with the columns ``Database``, ``DOI``, ``Title``,
        ``Abstract``, ``Year`` and ``Authors``; or ``None`` when the query
        returns nothing or any error occurs (the error is printed).
    """
    try:
        # Placeholder key; the original note says approval is still pending.
        client = WosClient(key="API_KEY")
        hits = client.query(query)

        if not hits:
            print("No results found for the query.")
            return None

        # Each record is read through its two-letter field tags.
        rows = [
            {
                'Database': 'Web of Science',
                'DOI': hit.get('DI'),
                'Title': hit.get('TI'),
                'Abstract': hit.get('AB'),
                'Year': hit.get('PY'),
                'Authors': hit.get('AU')
            }
            for hit in hits
        ]
        return pd.DataFrame(rows)

    except Exception as err:
        print(f"Error during Web of Science search: {err}")
        return None

In [None]:
# Reuses QUERY, the search expression defined in the PubMed cell.
# wos_search returns None when the search fails or finds nothing.
wos = wos_search(QUERY)

In [None]:
!pip install arxiv pandas PyPDF2 scikit-learn tqdm

# Import required libraries
import arxiv
import pandas as pd
import requests
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from PyPDF2 import PdfReader
import os

# Define search query
query = (
    '("Multimodal Large Language Model*" OR MLLM* OR MM-LLM* OR "Information Fusion" '
    'OR "Multimodal Learn*" OR "Cross-modal" OR "Cross-modal Learning" OR "Cross-modal Fusion") '
    'AND (Healthcare OR Medicine OR Health)'
)

# Directory to store downloaded PDFs
pdf_dir = "arxiv_pdfs"
os.makedirs(pdf_dir, exist_ok=True)

# Function to fetch results from ArXiv
def fetch_all_arxiv_results(query, results_per_page=100, max_results=None):
    """Fetch every arXiv record matching ``query``, including its PDF link.

    One ``arxiv.Search`` is streamed through an ``arxiv.Client``, which pages
    through the API and stops when the result set is exhausted. Re-running an
    identical search inside ``while True`` never yields an empty page, so
    such a loop cannot terminate.

    Args:
        query: arXiv API search expression.
        results_per_page: Page size the client requests per API call.
        max_results: Optional cap on the number of records; ``None`` means
            every record the API returns for the query.

    Returns:
        list[dict]: One dict per paper with the keys ``Title``, ``Abstract``,
        ``Authors``, ``Year`` and ``PDF_URL``. Records collected before an
        error are still returned.
    """
    all_results = []
    client = arxiv.Client(page_size=results_per_page)
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )

    try:
        # Client.results() replaces the deprecated Search.results() and
        # advances the paging offset itself.
        for result in client.results(search):
            all_results.append({
                "Title": result.title,
                "Abstract": result.summary,
                "Authors": ", ".join([author.name for author in result.authors]),
                # Year of the most recent update, not of first submission.
                "Year": result.updated.year,
                "PDF_URL": result.pdf_url
            })
    except Exception as e:
        # Best effort: keep whatever was collected before the failure.
        print(f"Error fetching results: {e}")

    return all_results

# Function to download PDFs
def download_pdf(pdf_url, filename, timeout=30):
    """Download ``pdf_url`` to ``filename``.

    Args:
        pdf_url: URL of the PDF to fetch.
        filename: Local path the response body is written to.
        timeout: Seconds to wait for the server before giving up, so one
            stalled connection cannot hang the whole batch.

    Returns:
        bool: ``True`` if the file was written. ``False`` if the request
        failed, timed out, or returned an HTTP error status (the reason is
        printed).
    """
    try:
        response = requests.get(pdf_url, timeout=timeout)
        # Reject 4xx/5xx responses before touching the disk; otherwise an
        # HTML error page would be saved as a ".pdf" and reported as success.
        response.raise_for_status()
        with open(filename, "wb") as f:
            f.write(response.content)
        return True
    except Exception as e:
        print(f"Failed to download {pdf_url}: {e}")
        return False

def extract_text_from_pdf(filepath):
    """Return the concatenated text of every page in the PDF at ``filepath``.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        str: Page texts joined in page order with no separator, or ``""``
        when the file cannot be read or parsed (the error is printed).
    """
    try:
        document = PdfReader(filepath)
        return "".join(page.extract_text() for page in document.pages)
    except Exception as err:
        print(f"Error reading {filepath}: {err}")
        return ""

def extract_keywords(texts, num_keywords=5):
    """Return the vocabulary a TF-IDF vectorizer keeps for ``texts``.

    Args:
        texts: Iterable of documents (strings) to fit the vectorizer on.
        num_keywords: Passed as ``max_features``, capping the vocabulary size.

    Returns:
        Array of feature names with English stop words excluded. An empty
        array is returned when the texts contain no usable terms, e.g. the
        empty string stored for a PDF that failed to download.
    """
    import numpy as np  # local import; numpy is already a scikit-learn dependency

    vectorizer = TfidfVectorizer(stop_words="english", max_features=num_keywords)
    try:
        # Only the fitted vocabulary is needed, so the matrix is not built.
        vectorizer.fit(texts)
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary; treat that
        # as "no keywords" instead of aborting the whole keyword step.
        return np.array([], dtype=object)
    return vectorizer.get_feature_names_out()

# Pipeline: search arXiv, download each PDF, mine keywords from the full text,
# and write everything to a CSV file.
print("Fetching all results from ArXiv...")
results = fetch_all_arxiv_results(query)

if not results:
    print("No results found.")
else:
    df = pd.DataFrame(results)

    # One local PDF per row, named after the row index. Rows whose download
    # fails get an empty string so the list stays aligned with the frame.
    print("Downloading and processing PDFs...")
    full_texts = []
    for index, row in tqdm(df.iterrows(), total=len(df)):
        pdf_filename = os.path.join(pdf_dir, f"{index}.pdf")
        fetched = download_pdf(row["PDF_URL"], pdf_filename)
        full_texts.append(extract_text_from_pdf(pdf_filename) if fetched else "")

    df["Full_Text"] = full_texts

    # Keywords are computed per paper from its own full text.
    print("Extracting keywords from full text...")
    df["Keywords"] = df["Full_Text"].map(lambda text: ", ".join(extract_keywords([text])))

    csv_filename = "arxiv_full_text_results.csv"
    df.to_csv(csv_filename, index=False)
    print(f"Results saved to {csv_filename}")