In [None]:
import csv
from itertools import batched

from dotenv import load_dotenv
from tqdm.notebook import tqdm

from bigger_picker.rayyan import RayyanManager

load_dotenv()

True

In [2]:
def get_csv_ratings(file_path):
    """Return the set of normalised DOIs found in the ``doi`` column of a CSV file.

    Each DOI has a leading ``https://doi.org/`` removed, is lower-cased and has
    surrounding whitespace stripped. Rows without a usable DOI (a short row whose
    ``doi`` field is missing, or an empty / whitespace-only value) are ignored so
    that they can neither crash the load nor add an empty string to the set.

    Args:
        file_path: Path of a CSV file whose header row contains ``doi``.

    Returns:
        set[str]: The distinct normalised DOIs.

    Raises:
        KeyError: If the file has data rows but no ``doi`` column.
    """
    dois = set()
    # newline="" is what the csv module asks for so it can handle line endings itself.
    with open(file_path, newline="") as csvfile:
        for row in csv.DictReader(csvfile):
            raw_doi = row["doi"]
            if raw_doi is None:
                # DictReader fills fields missing from a short row with None.
                continue
            doi = raw_doi.removeprefix("https://doi.org/").lower().strip()
            if doi:
                dois.add(doi)
    return dois

In [3]:
# Load one DOI set per screening-outcome CSV.
included_ft_dois = get_csv_ratings("Sanders - included.csv")
excluded_ft_dois = get_csv_ratings("Sanders - excluded-ft.csv")
excluded_ab_dois = get_csv_ratings("Sanders - excluded-ab.csv")
failed_ft_dois = get_csv_ratings("Sanders - failed-ft.csv")
missing_ft_dois = get_csv_ratings("Sanders - missing-ft.csv")

# Anything that reached the full-text stage (whatever happened there) was
# included at the abstract level, so the abstract-level set is the union of
# those four outcomes. `set().union(...)` builds a fresh set that shares no
# state with its inputs.
included_ab_dois = set().union(
    included_ft_dois,
    excluded_ft_dois,
    failed_ft_dois,
    missing_ft_dois,
)

print(f"Included at abstract level: {len(included_ab_dois)}")
print(f"Included at full text level: {len(included_ft_dois)}")

Included at abstract level: 5520
Included at full text level: 1983


In [4]:
# Rayyan client used by the cells below (`rayyan.review`, `rayyan.review_id`).
# It is constructed from "../rayyan_tokens.json", a path relative to the current
# working directory — presumably the API token file; confirm against RayyanManager.
rayyan = RayyanManager("../rayyan_tokens.json")

In [5]:
# Push the CSV screening decisions onto the Rayyan review as "OttoSR: ..." labels,
# paging through the review `batch_size` records at a time.
n_articles = 29610  # NOTE(review): hard-coded size of the review — confirm it is current
batch_size = 1000
batches = batched(range(0, n_articles), batch_size)
total_batches = (n_articles + batch_size - 1) // batch_size  # ceiling division

# DOIs of records that were customised during this run. They are removed from the
# CSV sets only after the whole pass has finished, because discarding inside the
# loop meant that
#   * a second Rayyan record with the same DOI got no plan (and could even have
#     its existing labels removed), and
#   * re-running after an interruption stripped labels from records that had
#     already been processed.
# NOTE: after a *successful* run the sets below hold only the unmatched DOIs, so
# reload the CSVs before executing this cell again.
matched_dois = set()

with tqdm(total=total_batches, desc="Overall Progress") as overall_pbar:
    for batch_idx, batch in enumerate(batches):
        results_params = {"start": batch[0], "length": len(batch)}
        articles = rayyan.review.results(rayyan.review_id, results_params)
        articles_data = articles["data"]

        # Iterating over the bar advances it for every record, skipped or not.
        with tqdm(
            articles_data,
            desc=f"Batch {batch_idx + 1}/{total_batches}",
            leave=False,
        ) as batch_pbar:
            for article in batch_pbar:
                raw_doi = article["doi"]
                if not isinstance(raw_doi, str):
                    # No DOI so skip
                    continue

                article_doi = (
                    raw_doi.removeprefix("https://doi.org/").lower().strip()
                )
                if not article_doi:
                    # A blank DOI is treated like a missing one
                    continue

                plan = {}

                if article_doi in included_ft_dois:
                    plan["OttoSR: Include Full-Text"] = 1
                    plan["OttoSR: Include Abstract"] = 1
                if article_doi in included_ab_dois:
                    plan["OttoSR: Include Abstract"] = 1
                if article_doi in excluded_ft_dois:
                    plan["OttoSR: Exclude Full-Text"] = 1

                # Check if we can avoid processing
                existing_labels = article["customizations"].get("labels", {})

                # NOTE(review): the existing-label check and the removal default
                # below use "OttoSR: Include", while the plan writes
                # "OttoSR: Include Abstract" / "OttoSR: Include Full-Text".
                # Looks like clean-up of an older label — confirm this is intended.
                if (
                    existing_labels.get("OttoSR: Include") is None
                    and existing_labels.get("OttoSR: Exclude Full-Text") is None
                    and plan.get("OttoSR: Include Abstract") is None
                    and plan.get("OttoSR: Exclude Full-Text") is None
                ):
                    # There is no existing label or plan for this article
                    continue

                # If we haven't set the plan, we default to removing existing labels
                plan.setdefault("OttoSR: Include", -1)
                plan.setdefault("OttoSR: Exclude Full-Text", -1)

                rayyan.review.customize(rayyan.review_id, article["id"], plan)
                matched_dois.add(article_doi)

        overall_pbar.update(1)

# Whatever is left in these sets afterwards was never found in Rayyan.
for doi_set in (included_ab_dois, included_ft_dois, excluded_ft_dois):
    doi_set.difference_update(matched_dois)

Overall Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Batch 1/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 2/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 3/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 4/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 5/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 6/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 7/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 8/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 9/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 10/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 11/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 12/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 13/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 14/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 15/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 16/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 17/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 18/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 19/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 20/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 21/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 22/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 23/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 24/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 25/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 26/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 27/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 28/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 29/30:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 30/30:   0%|          | 0/610 [00:00<?, ?it/s]

In [7]:
# Report how many DOIs remain in each set after the Rayyan pass.
for caption, remaining in (
    ("Included abstract DOIs left:", included_ab_dois),
    ("Included full-text DOIs left:", included_ft_dois),
    ("Excluded DOIs left:", excluded_ft_dois),
):
    print(caption, len(remaining))


Included abstract DOIs left: 77
Included full-text DOIs left: 19
Excluded DOIs left: 18


In [None]:
def get_leftover_dois(csv_paths: list[str], leftover_dois: set) -> list[dict]:
    """Collect the CSV rows whose normalised DOI is in ``leftover_dois``.

    The ``doi`` field of every row is normalised by removing a leading
    ``https://doi.org/``, lower-casing and stripping whitespace before the
    membership test. Rows are returned in file order, one entry per matching
    row, so a DOI that occurs in several rows or files yields several entries.

    Args:
        csv_paths: CSV files to scan; each needs a ``doi`` column.
        leftover_dois: Normalised DOIs to look for.

    Returns:
        The matching rows as produced by ``csv.DictReader``.

    Raises:
        KeyError: If a file has data rows but no ``doi`` column.
    """
    leftover_rows = []

    for csv_path in csv_paths:
        # newline="" is what the csv module asks for so it can handle line endings itself.
        with open(csv_path, newline="") as csvfile:
            for row in csv.DictReader(csvfile):
                raw_doi = row["doi"]
                if raw_doi is None:
                    # Short row: DictReader fills the missing field with None,
                    # which has no DOI to compare.
                    continue
                normalised = raw_doi.removeprefix("https://doi.org/").lower().strip()
                if normalised in leftover_dois:
                    leftover_rows.append(row)

    return leftover_rows


# Fetch the full CSV rows behind the DOIs that are still in each set.
# (Despite the `_dois` suffix these variables hold lists of row dicts.)
# The paths must be the exact files the ratings were loaded from; the included
# file is "Sanders - included.csv" (with spaces around the dash).
leftover_ab_dois = get_leftover_dois(
    [
        "Sanders - included.csv",
        "Sanders - excluded-ft.csv",
        "Sanders - failed-ft.csv",
        "Sanders - missing-ft.csv",
    ],
    included_ab_dois,
)
assert len(leftover_ab_dois) == len(included_ab_dois), (
    f"abstract level: {len(leftover_ab_dois)} rows for "
    f"{len(included_ab_dois)} leftover DOIs"
)

leftover_ft_dois = get_leftover_dois(
    ["Sanders - included.csv"],
    included_ft_dois,
)
assert len(leftover_ft_dois) == len(included_ft_dois), (
    f"full-text level: {len(leftover_ft_dois)} rows for "
    f"{len(included_ft_dois)} leftover DOIs"
)

leftover_excludedft_dois = get_leftover_dois(
    ["Sanders - excluded-ft.csv"],
    excluded_ft_dois,
)
assert len(leftover_excludedft_dois) == len(excluded_ft_dois), (
    f"excluded full-text: {len(leftover_excludedft_dois)} rows for "
    f"{len(excluded_ft_dois)} leftover DOIs"
)


In [None]:
# Print the rows behind the abstract-level DOIs that are still left over.
for article in leftover_ab_dois:
    summary = f"""
Article DOI: {article["doi"]}
Title: {article["title"]}
Authors: {article["authors"]}
Year: {article["date"]}
"""
    print(summary)

In [None]:
# Print the rows behind the full-text-included DOIs that are still left over.
for article in leftover_ft_dois:
    summary = f"""
Article DOI: {article["doi"]}
Title: {article["title"]}
Authors: {article["authors"]}
Year: {article["date"]}
"""
    print(summary)

In [None]:
# Print the rows behind the full-text-excluded DOIs that are still left over.
for article in leftover_excludedft_dois:
    summary = f"""
Article DOI: {article["doi"]}
Title: {article["title"]}
Authors: {article["authors"]}
Year: {article["date"]}
"""
    print(summary)