In [2]:
import os
import json
import time
import pandas as pd
import anthropic
from anthropic import APIError

# BILLING: https://www.anthropic.com/pricing#anthropic-api

DATASET = 'data/abstracts-500'
GET_DOI = False


MODEL_VERSION = 'claude-3-5-sonnet-20240620'
# MODEL_VERSION = 'claude-3-7-sonnet-20250219'
PROMPT = 'v1_prompt'

instruction_file = f'./{PROMPT}.txt'
dataset_file = f'../{DATASET}.xlsx'
output_file = f'./results/claude_2025/{DATASET}_1_output.xlsx'

In [3]:
client = anthropic.Anthropic(api_key="<API-KEY-HERE>")

df = pd.read_excel(dataset_file)

with open(instruction_file, 'r') as file:
    instructions = file.read()

# Print the instructions that will be fed to Claude
print(instructions)

I am screening papers for a systematic literature review.
The topic of the systematic review is assessing links between urban greenspaces and mental health in low- and middle-income countries. The general urban population of upper/lower-middle-income and low-income countries, as defined by OECD’s Development Assistance Committee (DAC) is included. Studies from high-income countries are excluded.
The study should focus exclusively on this topic.

Decide if the following article should be included or excluded from the systematic review. I give the title and abstract of the article as input.

Please respond with a plain JSON, without any formatting or backticks, that adheres to the following format:
{
  "verdict": "<your verdict here, either 'include' or 'exclude'>",
  "explanation": "<detailed explanation to justify your verdict here>",
  "confidence": "<confidence level of your decision here>"
}

Be lenient. I prefer including papers by mistake rather than excluding them by mistake.


In [4]:
def classify_paper(paper: str, retries=5, base_delay=2):
    for attempt in range(retries):
        try:
            message = client.messages.create(
                model=MODEL_VERSION,
                max_tokens=1024,
                temperature=0,
                system=instructions,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": paper
                            }
                        ]
                    }
                ]
            )
            return message.content[0].text
        except APIError as e:
            if "overloaded" in str(e).lower() and attempt < retries - 1:
                delay = base_delay * (2 ** attempt)
                print(f"Server overloaded. Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                raise e

In [5]:
def update_output_file(output_file, new_row):

    # Define the columns for the DataFrame
    columns = ["ID", "Include/exclude", "Claude verdict", "Claude explanation", "Claude confidence", "Title", "Authors", "Abstract"]
    
    # Check if the file exists
    if not os.path.exists(output_file):
        # Create a new DataFrame with the headers and save it to a new Excel file
        df = pd.DataFrame(columns=columns)

        # Append the new row to the DataFrame using concat
        df = pd.concat([df, new_row], ignore_index=True)
        
        df.to_excel(output_file, index=False)
        print(f"Created new file: {output_file} and added the first row.")
    else:
        # Load the existing file into a DataFrame
        df = pd.read_excel(output_file)

        # Append the new row to the DataFrame using concat
        df = pd.concat([df, new_row], ignore_index=True)
    
        # Save the updated DataFrame back to the Excel file
        df.to_excel(output_file, index=False)
        # print("New row added to the existing file.")


In [6]:
true_positives = 0
true_negatives = 0
false_positives = 0
false_negatives = 0

# Record the start time
start_time = time.time()
total_rows_number = len(df)

# Iterate over the rows and print the content
for index, row in df.iterrows():

    r = row.to_dict()
    paper = f"Authors: {r['AUTHOR']}\n\nTitle: {r['TITLE']}\n\nAbstract: {r['ABSTRACT']}"
    # print(f"Paper ID {r['ID']}: {r['Title']}\n")
    print(f"Screening paper {index + 1}/{total_rows_number}")


    # Attempt to classify the paper and handle potential issues
    try:
        response_content = classify_paper(paper)
        # Attempt to parse the JSON response
        answer_dict = json.loads(response_content)
        
    except json.JSONDecodeError as e:
        # print(f"Failed to decode JSON response for paper ID {r['ID']}: {e}")
        # continue  # Skip this row and move to the next one if JSON parsing fails
        # raise RuntimeError(f"Failed to decode JSON response for paper ID {r['ID']}: {e}")

        new_row = pd.DataFrame([{
            "ID": r['ID'],
            "Include/exclude": r['Include/exclude'],
            "Claude verdict": "",
            "Claude explanation": "",
            "Claude confidence": "",
            "Title": r['TITLE'],
            "Authors": r['AUTHOR'],
            "Abstract": r['ABSTRACT']
        }])

        update_output_file(output_file, new_row)

        print("ERROR Failed to decode JSON response for paper ID {r['ID']}: {e}")

        continue
    
    new_row = pd.DataFrame([{
        "ID": r['ID'],
        "Include/exclude": r['Include/exclude'],
        "Claude verdict": answer_dict['verdict'],
        "Claude explanation": answer_dict['explanation'],
        "Claude confidence": answer_dict['confidence'],
        "Title": r['TITLE'],
        "Authors": r['AUTHOR'],
        "Abstract": r['ABSTRACT']
    }])

    update_output_file(output_file, new_row)

    if r['Include/exclude'] == answer_dict['verdict'] == "include":
        true_positives += 1
    elif r['Include/exclude'] == answer_dict['verdict'] == "exclude":
        true_negatives += 1
    elif r['Include/exclude'] == "include" and answer_dict['verdict'] == "exclude":
        false_negatives += 1
    elif r['Include/exclude'] == "exclude" and answer_dict['verdict'] == "include":
        false_positives += 1
        

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

print(f"\n\nThis paper screening took {elapsed_time:.1f} seconds")
print(f"The screening included {true_positives + true_negatives + false_positives + false_negatives} papers, out of which {true_positives + true_negatives} were correctly classified")
print(f"True positives: {true_positives}")
print(f"True negatives: {true_negatives}")
print(f"False positives: {false_positives}")
print(f"False negatives: {false_negatives}")


Screening paper 1/1
Created new file: ./max/results/claude_2025/correct-abstracts-1_1_output.xlsx and added the first row.
ERROR Failed to decode JSON response for paper ID {r['ID']}: {e}


This paper screening took 5.8 seconds
The screening included 0 papers, out of which 0 were correctly classified
True positives: 0
True negatives: 0
False positives: 0
False negatives: 0
