# Parse ChemRxiv

## Download papers from ChemRxiv

In [None]:
import os

import chemrxiv
import pandas as pd

# Upper bound on the number of papers requested from ChemRxiv. 29950 was the
# total number of papers as of June 5 2025; the notebook previously kept 2995
# as a commented-out smaller alternative.
PAPER_LIMIT = 29950

client = chemrxiv.Client()
search = chemrxiv.Search(
    limit=PAPER_LIMIT,
    sort=chemrxiv.SortCriterion.CITATION_COUNT_DESC,
)
# Materialise every result into a list up front.
# NOTE: this takes roughly 1.5 hours to run.
results = list(client.results(search))

In [None]:
# Output directory for the generated CSV. The path is relative, so it is
# resolved against the current working directory at run time.
directory = "../../../data/"


def safe_convert_to_string(value):
    """Render ``value`` as a flat string.

    ``None`` becomes ``""``; lists and tuples become the ``str()`` form of
    their items joined by ``"; "``; everything else (dicts included) is passed
    through ``str()``.
    """
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        return "; ".join(map(str, value))
    # Dicts and scalars alike fall back to their plain str() representation
    # (json.dumps would be an alternative for nicer dict formatting).
    return str(value)


def extract_authors(authors_list):
    """Build a "; "-separated string of author full names.

    Args:
        authors_list: Sequence of author records. Only ``dict`` entries are
            considered; their ``"firstName"`` and ``"lastName"`` values are
            read. ``None`` or an empty sequence yields ``""``.

    Returns:
        The full names ("first last", stripped of surrounding whitespace)
        joined with ``"; "``. Authors whose name parts are both missing or
        empty are skipped.
    """
    if not authors_list:
        return ""
    author_names = []
    for author in authors_list:
        if not isinstance(author, dict):
            continue
        # `or ""` also covers keys that are present but null; a plain
        # .get(key, "") would return None there and the f-string would embed
        # the literal text "None" in the name.
        first_name = author.get("firstName") or ""
        last_name = author.get("lastName") or ""
        full_name = f"{first_name} {last_name}".strip()
        if full_name:
            author_names.append(full_name)
    return "; ".join(author_names)


def extract_categories(categories_list):
    """Join the ``"name"`` of every category record with ``"; "``.

    Entries that are not dicts, or that lack a ``"name"`` key, are ignored.
    A missing or empty ``categories_list`` yields ``""``.
    """
    if not categories_list:
        return ""
    return "; ".join(
        entry["name"]
        for entry in categories_list
        if isinstance(entry, dict) and "name" in entry
    )


def extract_metric_value(metrics_list, description):
    """Return the value of the first metric whose description matches.

    Scans ``metrics_list`` for the first dict whose ``"description"`` equals
    ``description`` and returns ``str()`` of its ``"value"`` (``""`` when the
    key is absent). Returns ``""`` when the list is empty/``None`` or no
    entry matches.
    """
    if not metrics_list:
        return ""
    hit = next(
        (
            entry
            for entry in metrics_list
            if isinstance(entry, dict)
            and entry.get("description") == description
        ),
        None,
    )
    if hit is None:
        return ""
    return str(hit.get("value", ""))


# Column order of the output table / CSV.
COLUMNS = [
    "id",
    "title",
    "authors",
    "abstract",
    "doi",
    "published_date",
    "updated_date",
    "categories",
    "license",
    "pdf_url",
    "views_count",
    "read_count",
    "citation_count",
    "keywords",
    "text_paper",
    "text_si",
]


def _paper_to_row(paper_raw):
    """Flatten one raw paper record (a dict) into a row dict keyed by COLUMNS."""
    # License: use its "name" when it is a dict, otherwise stringify as-is.
    license_info = paper_raw.get("license", {})
    license_val = (
        license_info.get("name", "")
        if isinstance(license_info, dict)
        else safe_convert_to_string(license_info)
    )

    # The PDF URL lives under asset -> original -> url. A missing or non-dict
    # level yields "" instead of raising (which would drop the whole paper).
    asset_info = paper_raw.get("asset", {})
    original_asset = (
        asset_info.get("original") if isinstance(asset_info, dict) else None
    )
    pdf_url = (
        original_asset.get("url", "")
        if isinstance(original_asset, dict)
        else ""
    )

    metrics = paper_raw.get("metrics", [])

    return {
        "id": safe_convert_to_string(paper_raw.get("id", "")),
        "title": safe_convert_to_string(paper_raw.get("title", "")),
        "authors": extract_authors(paper_raw.get("authors", [])),
        "abstract": safe_convert_to_string(paper_raw.get("abstract", "")),
        "doi": safe_convert_to_string(paper_raw.get("doi", "")),
        "published_date": safe_convert_to_string(
            paper_raw.get("publishedDate", "")
        ),
        # NOTE(review): "updated_date" is filled from "submittedDate" --
        # confirm this mapping is intended.
        "updated_date": safe_convert_to_string(
            paper_raw.get("submittedDate", "")
        ),
        "categories": extract_categories(paper_raw.get("categories", [])),
        "license": license_val,
        "pdf_url": pdf_url,
        "views_count": extract_metric_value(metrics, "Abstract Views"),
        # Content downloads are used as the read count.
        "read_count": extract_metric_value(metrics, "Content Downloads"),
        "citation_count": extract_metric_value(metrics, "Citations"),
        "keywords": safe_convert_to_string(paper_raw.get("keywords", [])),
        # Full-text columns are left empty at this stage.
        "text_paper": "",
        "text_si": "",
    }


os.makedirs(directory, exist_ok=True)

# Collect rows in a plain list and build the DataFrame once at the end:
# growing a DataFrame one row at a time via df.loc[len(df)] gets increasingly
# slow over tens of thousands of papers.
rows = []
for paper in results:
    try:
        # Use the raw data directly (underscore-prefixed attribute of the
        # result object).
        row = _paper_to_row(paper._raw)
    except Exception as e:
        # Best effort: report the failure and move on to the next paper.
        print(f"Error processing paper: {e}")
        continue
    rows.append(row)
    print(f"Added: {row['title'][:50]}...")

df = pd.DataFrame(rows, columns=COLUMNS)

# Save to CSV
csv_path = os.path.join(directory, "arxiv_papers.csv")
df.to_csv(csv_path, index=False)
print(f"\nSaved {len(df)} papers to {csv_path}")
# Display the DataFrame
df

# The dataset is uploaded to Hugging Face via the CLI

After running the code block above, run `huggingface-cli upload <name-of-dataset> ./data/arxiv_papers.csv --repo-type=dataset`. To update the repository later, set the data filter so that only papers dated after the previous parsing date are included.

## Loading the raw dataset from Hugging Face

In [None]:
from datasets import load_dataset

# Load the "train" split of the `magdaroni/chemrxiv-dev` dataset.
dataset = load_dataset("magdaroni/chemrxiv-dev", split="train")

# Bare expression: as the last statement of a notebook cell it displays the
# dataset's representation.
dataset