In [None]:
from itertools import batched

import rispy
from dotenv import load_dotenv
from tqdm.notebook import tqdm

from bigger_picker.airtable import AirtableManager
from bigger_picker.asana import AsanaManager
from bigger_picker.batchtracker import BatchTracker
from bigger_picker.integration import IntegrationManager
from bigger_picker.openai import OpenAIManager
from bigger_picker.rayyan import RayyanManager

load_dotenv()

True

In [2]:
# Create one manager object per external service used by this notebook.
airtable = AirtableManager()
asana = AsanaManager()
openai = OpenAIManager()
# The Rayyan manager is given the path to a tokens JSON file one directory up.
rayyan = RayyanManager("../rayyan_tokens.json")
batchtracker = BatchTracker()
# Bundle the managers into a single IntegrationManager; later cells reach the
# services through `integration.rayyan` and `integration.airtable`.
integration = IntegrationManager(
    asana_manager=asana,
    airtable_manager=airtable,
    openai_manager=openai,
    rayyan_manager=rayyan,
    batch_tracker=batchtracker,
    # NOTE(review): the effect of debug mode is defined inside
    # IntegrationManager and is not visible here — confirm before a real run.
    debug=True,
)

In [3]:
# The Mental Health search was exported from Scopus in two parts; concatenate
# them (in export order) into a single RIS file for the later cells.
mh_entries = []
for part_path in ("Scopus Mental Health.ris", "Scopus Mental Health_1.ris"):
    with open(part_path) as risfile:
        mh_entries += rispy.load(risfile)

with open("Scopus Mental Health Combined.ris", "w") as risfile:
    rispy.dump(mh_entries, risfile)

In [24]:
# Search label -> Scopus RIS export holding that search's results.
ris_files = {
    "Academic achievement": "Scopus Academic.ris",
    "Cognition": "Scopus Cognition.ris",
    "Mental health": "Scopus Mental Health Combined.ris",
    "Wellbeing": "Scopus Wellbeing.ris",
}

doi_dict = {}  # normalised DOI -> set of search labels whose export contains it
no_match = {}  # search label -> entries that carry no DOI at all

for label, filename in ris_files.items():
    with open(filename) as risfile:
        entries = rispy.load(risfile)

    for entry in entries:
        article_doi = entry.get("doi")
        if article_doi:
            # Same normalisation as the other cells: drop the resolver
            # prefix, lower-case, then trim surrounding whitespace.
            article_doi = (
                article_doi.removeprefix("https://doi.org/").lower().strip()
            )
            doi_dict.setdefault(article_doi, set()).add(label)
        else:
            # Entries without a DOI cannot be matched by DOI; keep them per label.
            no_match.setdefault(label, []).append(entry)

print("DOIs with matches:", len(doi_dict))

DOIs with matches: 44512


In [12]:
# Report, per search label, how many exported entries had no DOI.
for key in no_match:
    entries = no_match[key]
    print(f"{key}: {len(entries)} entries without DOI")

Academic achievement: 1267 entries without DOI
Cognition: 546 entries without DOI
Mental health: 655 entries without DOI
Wellbeing: 482 entries without DOI


In [13]:
# Fetch a small first page of the review; only the `recordsTotal` field of the
# response is used here and by the batching cell further down.
results_params = dict(start=0, length=10)
results = integration.rayyan.review.results(rayyan.review_id, results_params)
print(f"Total records in Rayyan review: {results['recordsTotal']}")

Total records in Rayyan review: 31639


In [14]:
# Index the Airtable "Articles" records by their "Rayyan ID" field so the
# labelling loop can look up the Airtable record for a given Rayyan article.
airtable_articles = integration.airtable.tables["Articles"].all()
articles_dict = {}
for airtable_record in airtable_articles:
    # Airtable omits empty fields from a record's "fields" mapping, so a
    # record without a Rayyan ID has no such key; skip it rather than raising
    # KeyError and aborting the whole cell.
    rayyan_id = airtable_record["fields"].get("Rayyan ID")
    if rayyan_id is None:
        continue
    articles_dict[rayyan_id] = airtable_record

In [15]:
# Page through every record of the Rayyan review and, for each record whose
# DOI appears in the Scopus exports (`doi_dict`), add the corresponding search
# labels in Rayyan and append them to the "Search" field of the matching
# Airtable record (`articles_dict`).
#
# Depends on `results`, `doi_dict` and `articles_dict` from earlier cells.
# Side effect: every DOI that is matched is removed from `doi_dict`, so that
# afterwards `doi_dict` only holds DOIs that are NOT yet in Rayyan.
assert integration.rayyan and integration.airtable
n_articles = int(results["recordsTotal"])
batch_size = 1000
batches = batched(range(0, n_articles), batch_size)
total_batches = (n_articles + batch_size - 1) // batch_size  # ceiling division

# Rayyan IDs of articles for which a Rayyan or Airtable update raised.
failures = []
# Normalised DOI -> labels for DOIs already matched during this run. Matched
# DOIs are popped from `doi_dict` immediately, so without this lookup a second
# Rayyan record carrying the same DOI (a duplicate) would never be labelled.
matched_labels = {}

with tqdm(total=total_batches, desc="Overall Progress") as overall_pbar:
    for batch_idx, batch in enumerate(batches):
        # `batch` is a tuple of consecutive record offsets; only its first
        # element and its size are needed to request the page.
        results_params = {"start": batch[0], "length": len(batch)}
        articles = rayyan.review.results(rayyan.review_id, results_params)
        articles_data = articles["data"]

        with tqdm(
            total=len(articles_data),
            desc=f"Batch {batch_idx + 1}/{total_batches}",
            leave=False,
        ) as batch_pbar:
            for rayyan_article in articles_data:
                article_doi = rayyan_article["doi"]
                try:
                    # Same normalisation as used when building `doi_dict`.
                    article_doi = (
                        article_doi.removeprefix("https://doi.org/").lower().strip()
                    )
                except AttributeError:
                    # No DOI so skip
                    batch_pbar.update(1)
                    continue

                labels = matched_labels.get(article_doi)
                if labels is None and article_doi in doi_dict:
                    # First hit for this DOI: move it out of `doi_dict`.
                    labels = matched_labels[article_doi] = doi_dict.pop(article_doi)

                if labels is not None:
                    article_id = rayyan_article["id"]
                    plan = dict.fromkeys(labels, 1)

                    try:
                        integration.rayyan.update_article_labels(article_id, plan)
                    except Exception as e:
                        print(f"Error updating article {article_id}: {e}")
                        failures.append(article_id)

                    if article_id in articles_dict:
                        airtable_record = articles_dict[article_id]
                        existing = airtable_record["fields"].get("Search", [])
                        # Append the new labels without introducing duplicates
                        # (keeps existing order; new labels in sorted order).
                        merged = list(dict.fromkeys([*existing, *sorted(labels)]))
                        if merged != existing:
                            payload = {"Search": merged}
                            try:
                                integration.airtable.update_record(
                                    "Articles",
                                    airtable_record["id"],
                                    payload,
                                )
                            except Exception as e:
                                # Mirror the Rayyan error handling so one bad
                                # record does not abort the whole run.
                                print(
                                    "Error updating Airtable record for article "
                                    f"{article_id}: {e}"
                                )
                                if article_id not in failures:
                                    failures.append(article_id)

                batch_pbar.update(1)

        overall_pbar.update(1)

Overall Progress:   0%|          | 0/32 [00:00<?, ?it/s]

Batch 1/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 2/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 3/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 4/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 5/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 6/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 7/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 8/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 9/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 10/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 11/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 12/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 13/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 14/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 15/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 16/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 17/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 18/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 19/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 20/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 21/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 22/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 23/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 24/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 25/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 26/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 27/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 28/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 29/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 30/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 31/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 32/32:   0%|          | 0/639 [00:00<?, ?it/s]

In [None]:
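# Plan:
# 1. Iterate through the Rayyan results and check whether each DOI from the
#    Scopus exports is already in Rayyan.
# 2. If so, update the Rayyan record with the new labels, and update Airtable
#    (TODO: decide whether Asana needs updating as well).
# 3. If not, add the entry to the unmatched set, keyed by the combination of
#    labels it has (the labels must be sorted first so the key is stable).
# 4. Create a new RIS file for each combination of labels for manual upload.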

In [18]:
# Group every Scopus entry that is still unmatched into `full_dict`, keyed by
# the file stem it will be written to:
#   * entries without a DOI go under the label of the export they came from;
#   * entries whose DOI is still in `doi_dict` go under the "_"-joined, sorted
#     combination of all labels recorded for that DOI.
# Relies on `doi_dict` having had the DOIs already present in Rayyan removed
# by the labelling cell above.
full_dict = {}
# A DOI listed under several labels appears once in each of those exports (and
# may repeat within one export). Remember which DOIs were already emitted so
# each article is written once instead of once per occurrence.
seen_dois = set()

for label, filename in ris_files.items():
    with open(filename) as risfile:
        entries = rispy.load(risfile)

    for entry in entries:
        article_doi = entry.get("doi", None)
        if not article_doi:
            full_dict.setdefault(label, []).append(entry)
            continue
        article_doi = article_doi.removeprefix("https://doi.org/").lower().strip()
        if article_doi not in doi_dict or article_doi in seen_dois:
            continue
        seen_dois.add(article_doi)
        # Sort so the same label combination always yields the same key.
        key = "_".join(sorted(doi_dict[article_doi]))
        full_dict.setdefault(key, []).append(entry)

In [21]:
# Write one RIS file per group in `full_dict`, named after the group's key,
# for manual upload.
for key in full_dict:
    entries = full_dict[key]
    with open(f"{key} unmatched.ris", "w") as risfile:
        rispy.dump(entries, risfile)