In [2]:
%reload_ext autoreload
%autoreload 2

import json
from datetime import datetime
from itertools import batched

import rispy
from dotenv import load_dotenv
from rich.console import Console
from rich.table import Table
from tqdm.notebook import tqdm

import bigger_picker.config as config
import bigger_picker.utils as utils
from bigger_picker.airtable import AirtableManager
from bigger_picker.asana import AsanaManager
from bigger_picker.batchtracker import BatchTracker
from bigger_picker.datamodels import Article, ArticleLLMExtract
from bigger_picker.integration import IntegrationManager
from bigger_picker.openai import OpenAIManager
from bigger_picker.rayyan import RayyanManager

load_dotenv()

True

In [3]:
# One client object per external service. They are created in this order and
# kept as notebook-level names because later cells call `rayyan` directly.
airtable = AirtableManager()
asana = AsanaManager()
openai = OpenAIManager()
# NOTE(review): the path is relative to the notebook's working directory and
# presumably points at stored Rayyan credentials — confirm before moving the notebook.
rayyan = RayyanManager("../rayyan_tokens.json")
batchtracker = BatchTracker()
console = Console()

# Hand every client to the integration layer, with debug mode switched on.
integration_kwargs = {
    "asana_manager": asana,
    "airtable_manager": airtable,
    "openai_manager": openai,
    "rayyan_manager": rayyan,
    "batch_tracker": batchtracker,
    "console": console,
    "debug": True,
}
integration = IntegrationManager(**integration_kwargs)

In [4]:
# Parse the Scopus RIS export of the SDQ search; later cells read each entry's
# "doi" key. The encoding is pinned so the result does not depend on the
# platform's locale default (e.g. cp1252 on Windows would garble non-ASCII text).
with open("Scopus SDQ Updated.ris", encoding="utf-8") as risfile:
    sdq_entries = rispy.load(risfile)

In [5]:
# Map each normalised DOI from the RIS export to a "seen in Rayyan" flag.
# Every flag starts at 0; the matching cell flips it to 1 when the DOI is found.
sdq_dois = {}
for entry in sdq_entries:
    if "doi" not in entry:
        # Entries without a DOI cannot be matched, so they are left out.
        continue
    # Keep this normalisation identical to the one applied to Rayyan DOIs in
    # the matching cell, otherwise lookups will silently miss.
    cleaned_doi = entry["doi"].removeprefix("https://doi.org/").lower().strip()
    sdq_dois[cleaned_doi] = 0

In [6]:
# Walk every article in the Rayyan review in pages of `batch_size` and make its
# "SDQ" label agree with the DOIs loaded from the RIS export: attach the label
# when the DOI is in `sdq_dois`, remove it when it is not, and make no API call
# when the article is already in the desired state. Each RIS DOI that is found
# in Rayyan (and ends up correctly labelled) is flagged with 1 in `sdq_dois`.
n_articles = 31639  # NOTE(review): hard-coded review size — confirm it is still current
batch_size = 1000
batches = batched(range(0, n_articles), batch_size)
total_batches = (n_articles + batch_size - 1) // batch_size  # ceiling division

with tqdm(total=total_batches, desc="Overall Progress") as overall_pbar:
    for batch_idx, batch in enumerate(batches):
        # `batch` is a tuple of consecutive offsets, so its first element and
        # its length describe the page to request.
        results_params = {"start": batch[0], "length": len(batch)}
        articles = rayyan.review.results(rayyan.review_id, results_params)
        articles_data = articles["data"]

        # Iterating through tqdm advances the bar once per article, including
        # the ones skipped with `continue`.
        for article in tqdm(
            articles_data,
            desc=f"Batch {batch_idx + 1}/{total_batches}",
            leave=False,
        ):
            article_doi = article["doi"]
            if not isinstance(article_doi, str):
                # No DOI so skip
                continue
            # Same normalisation as the one used to build `sdq_dois`.
            article_doi = article_doi.removeprefix("https://doi.org/").lower().strip()

            in_sdq = article_doi in sdq_dois
            desired_label = 1 if in_sdq else None

            # Check if we can avoid processing
            existing_labels = article["customizations"].get("labels", {})
            needs_update = existing_labels.get("SDQ") != desired_label

            if needs_update:
                # 1 attaches the label; -1 removes a label that should not be there.
                plan = {"SDQ": 1 if in_sdq else -1}
                try:
                    rayyan.review.customize(rayyan.review_id, article["id"], plan)
                except Exception as e:
                    # Best effort: report and move on. A failed SDQ DOI keeps
                    # its 0 flag so it shows up as unmatched afterwards.
                    print(f"Error updating {article['id']}: {e}")
                    continue

            # Only flag DOIs that came from the RIS export. Writing a flag for
            # any other DOI would add it to `sdq_dois`, and a later duplicate
            # article (or a re-run of this cell) would then be labelled SDQ.
            if in_sdq:
                sdq_dois[article_doi] = 1

        overall_pbar.update(1)

Overall Progress:   0%|          | 0/32 [00:00<?, ?it/s]

Batch 1/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 2/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 3/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 4/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 5/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 6/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 7/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 8/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 9/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 10/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 11/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 12/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 13/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 14/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 15/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 16/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 17/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 18/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 19/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 20/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 21/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 22/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 23/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 24/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 25/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 26/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 27/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 28/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 29/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 30/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 31/32:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 32/32:   0%|          | 0/639 [00:00<?, ?it/s]

In [9]:
# RIS DOIs whose flag is still 0 were not found (or not successfully labelled)
# in Rayyan by the matching cell.
unmatched_dois = [doi for doi, seen_flag in sdq_dois.items() if seen_flag == 0]
len(unmatched_dois)

0

In [None]:
# Fetch review results using the "SDQ" user-label parameter and keep the set of
# returned article IDs for membership tests.
# NOTE(review): no start/length is passed here — confirm the response is not
# truncated to a default page size.
results_params = {"extra[user_labels][]": "SDQ"}
rayyan_sdq = rayyan.review.results(rayyan.review_id, results_params)
rayyan_sdq_data = rayyan_sdq["data"]
rayyan_sdq_ids = {sdq_article["id"] for sdq_article in rayyan_sdq_data}

In [None]:
# Count the returned SDQ articles that also have a truthy
# "Included: AI Extracted" label.
included_count = sum(
    1
    for sdq_article in rayyan_sdq_data
    if sdq_article["customizations"].get("labels", {}).get("Included: AI Extracted")
)

print(included_count)


571


In [38]:
# Pull every record from the Airtable "Articles" table. Both names are kept at
# notebook level: `at_articles` is iterated by the cross-check cell below.
articles_table = integration.airtable.tables["Articles"]
at_articles = articles_table.all()

In [39]:
# Number of records fetched from the Airtable "Articles" table.
len(at_articles)

1432

In [None]:
# Cross-check Airtable against Rayyan: for every Airtable article whose
# "Rayyan ID" is among the SDQ-labelled Rayyan IDs, count whether its "Search"
# field mentions "SDQ" (count_in) or not (count_not).
count_not = 0
count_in = 0
for record in tqdm(at_articles):
    fields = record["fields"]
    # Airtable leaves empty fields out of a record entirely, so read them with
    # .get() rather than indexing; a missing "Rayyan ID" is never in the set.
    if fields.get("Rayyan ID") not in rayyan_sdq_ids:
        continue
    # This article is in the SDQ set. A missing "Search" field counts as
    # "SDQ not recorded".
    if "SDQ" in fields.get("Search", ()):
        count_in += 1
    else:
        count_not += 1
print(count_not, count_in)

  0%|          | 0/1432 [00:00<?, ?it/s]

0 568
