In [1]:
import os
import re
import json
import time
import pandas as pd
import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig

# BILLING: https://ai.google.dev/pricing

# Dataset path without its extension; it is interpolated into both the input
# path (dataset_file) and the output path (output_file) below.
DATASET = 'data/abstracts-500'
# NOTE(review): GET_DOI is not referenced in any of the visible cells — confirm it is still needed.
GET_DOI = False

# Model identifier handed to GenerativeModel(model_name=...).
MODEL_VERSION = "gemini-2.0-flash"
# Base name (no extension) of the text file holding the system instructions.
PROMPT = 'v1_prompt'

# Derived paths. Because DATASET contains a '/', output_file resolves to a file
# inside a 'data' subdirectory of ./results/gemini_2025/.
instruction_file = f'./{PROMPT}.txt'
dataset_file = f'../{DATASET}.xlsx'
output_file = f'./results/gemini_2025/{DATASET}__{PROMPT}__output.xlsx'

In [2]:
PROJECT_ID = "<GCP-PROJECT-ID-HERE>"
vertexai.init(project=PROJECT_ID, location="europe-west1")

df = pd.read_excel(dataset_file)

# Fail fast, before any API call is made, if the spreadsheet lacks a column
# that the screening loop reads from each row.
required_columns = ["ID", "AUTHOR", "TITLE", "ABSTRACT", "Include/exclude"]
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    raise KeyError(f"{dataset_file} is missing required column(s): {missing_columns}")

# The prompt text contains non-ASCII characters (e.g. a typographic
# apostrophe), so decode it explicitly as UTF-8 rather than relying on the
# platform's default encoding.
with open(instruction_file, 'r', encoding='utf-8') as file:
    instructions = file.read()

# Print the instructions that will be fed to Gemini
print(instructions)

# The instructions are supplied as the system prompt; each paper is later sent
# as the user content.
model = GenerativeModel(model_name=MODEL_VERSION, system_instruction=instructions)

I am screening papers for a systematic literature review.
The topic of the systematic review is assessing links between urban greenspaces and mental health in low- and middle-income countries. The general urban population of upper/lower-middle-income and low-income countries, as defined by OECD’s Development Assistance Committee (DAC) is included. Studies from high-income countries are excluded.
The study should focus exclusively on this topic.

Decide if the following article should be included or excluded from the systematic review. I give the title and abstract of the article as input.

Please respond with a plain JSON, without any formatting or backticks, that adheres to the following format:
{
  "verdict": "<your verdict here, either 'include' or 'exclude'>",
  "explanation": "<detailed explanation to justify your verdict here>",
  "confidence": "<confidence level of your decision here>"
}

Be lenient. I prefer including papers by mistake rather than excluding them by mistake.


In [3]:
def classify_paper(paper):
    """Send one paper to the Gemini model and return its reply as text.

    Args:
        paper: Text describing the paper; passed unchanged to
            ``model.generate_content``.

    Returns:
        The response text with surrounding whitespace stripped and Markdown
        code-fence markers removed. The text is not parsed here; callers are
        expected to decode it themselves.
    """
    # Generation is requested with the temperature fixed at 0.0.
    config = GenerationConfig(temperature=0.0)
    raw_text = model.generate_content(paper, generation_config=config).text

    # Remove an opening ``` / ```json fence at the start of a line and a
    # closing ``` at the end of a line, if the model wrapped its answer in one.
    fence_pattern = r"^```(?:json)?\n|```$"
    return re.sub(fence_pattern, "", raw_text.strip(), flags=re.IGNORECASE | re.MULTILINE)

In [4]:
def update_output_file(output_file, new_row):
    """Append ``new_row`` to the Excel file at ``output_file``, creating it if needed.

    The whole workbook is rewritten on every call so that results obtained so
    far are already on disk if a later step fails.

    Args:
        output_file: Path of the ``.xlsx`` file to create or extend.
        new_row: DataFrame holding the row(s) to append.

    Side effects:
        Creates missing parent directories of ``output_file``, writes the
        workbook, and prints a message when a new file is created.
    """
    # Column order used when the file is created for the first time.
    columns = ["ID", "Include/exclude", "Gemini verdict", "Gemini explanation", "Gemini confidence", "Title", "Authors", "Abstract"]

    file_exists = os.path.exists(output_file)

    if file_exists:
        # Extend what is already stored on disk.
        df = pd.concat([pd.read_excel(output_file), new_row], ignore_index=True)
    else:
        # The output path may point into directories that do not exist yet
        # (e.g. when the dataset name itself contains a '/').
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Lay the first row out in the canonical column order, keeping any
        # additional columns after it. This avoids concatenating onto an empty
        # frame, which recent pandas versions warn about.
        extra_columns = [name for name in new_row.columns if name not in columns]
        df = new_row.reindex(columns=columns + extra_columns).reset_index(drop=True)

    df.to_excel(output_file, index=False)

    if not file_exists:
        print(f"Created new file: {output_file} and added the first row.")

In [5]:
# Confusion-matrix counters comparing the model verdict with the human label.
true_positives = 0
true_negatives = 0
false_positives = 0
false_negatives = 0
# Rows whose label/verdict pair is not a recognised include/exclude
# combination; tracked so they are not dropped from the summary unnoticed.
unscored = 0

# Keys the prompt asks the model to return in its JSON answer.
expected_keys = ("verdict", "explanation", "confidence")

# Record the start time
start_time = time.time()
total_rows_number = len(df)

# Screen every paper. `position` is the 1-based row count used for progress
# output; it does not depend on the DataFrame's index labels.
for position, (index, row) in enumerate(df.iterrows(), start=1):

    r = row.to_dict()
    paper = f"Authors: {r['AUTHOR']}\n\nTitle: {r['TITLE']}\n\nAbstract: {r['ABSTRACT']}"
    print(f"Screening paper {position}/{total_rows_number}")

    # Classify the paper and parse the reply; abort the run on an unusable
    # reply rather than silently skipping the paper.
    try:
        response_content = classify_paper(paper)
        answer_dict = json.loads(response_content)
    except json.JSONDecodeError as e:
        # Chain the original error so the traceback shows the decode failure.
        raise RuntimeError(f"Failed to decode JSON response for paper ID {r['ID']}: {e}") from e

    # Valid JSON is not enough: the reply must be an object with every key
    # that is read below.
    if not isinstance(answer_dict, dict):
        raise RuntimeError(f"Unexpected JSON response for paper ID {r['ID']}: expected an object, got {type(answer_dict).__name__}")
    missing_keys = [key for key in expected_keys if key not in answer_dict]
    if missing_keys:
        raise RuntimeError(f"JSON response for paper ID {r['ID']} is missing key(s): {missing_keys}")

    new_row = pd.DataFrame([{
        "ID": r['ID'],
        "Include/exclude": r['Include/exclude'],
        "Gemini verdict": answer_dict['verdict'],
        "Gemini explanation": answer_dict['explanation'],
        "Gemini confidence": answer_dict['confidence'],
        "Title": r['TITLE'],
        "Authors": r['AUTHOR'],
        "Abstract": r['ABSTRACT']
    }])

    # Persist after every paper so partial results survive a later failure.
    update_output_file(output_file, new_row)

    # Compare case- and whitespace-insensitively so that e.g. "Include" or
    # "exclude " still counts; the raw values are what gets written above.
    expected_label = str(r['Include/exclude']).strip().lower()
    model_verdict = str(answer_dict['verdict']).strip().lower()

    if expected_label == model_verdict == "include":
        true_positives += 1
    elif expected_label == model_verdict == "exclude":
        true_negatives += 1
    elif expected_label == "include" and model_verdict == "exclude":
        false_negatives += 1
    elif expected_label == "exclude" and model_verdict == "include":
        false_positives += 1
    else:
        unscored += 1


# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

print(f"\n\nThis paper screening took {elapsed_time:.1f} seconds")
print(f"The screening included {true_positives + true_negatives + false_positives + false_negatives} papers, out of which {true_positives + true_negatives} were correctly classified")
print(f"True positives: {true_positives}")
print(f"True negatives: {true_negatives}")
print(f"False positives: {false_positives}")
print(f"False negatives: {false_negatives}")
if unscored:
    print(f"Unscored (label or verdict was neither 'include' nor 'exclude'): {unscored}")

Screening paper 1/500
Created new file: ./max/results/gemini_2025/correct-abstracts-500__v1_balanced-prompt-without-examples__output.xlsx and added the first row.
Screening paper 2/500
Screening paper 3/500
Screening paper 4/500
Screening paper 5/500
Screening paper 6/500
Screening paper 7/500
Screening paper 8/500
Screening paper 9/500
Screening paper 10/500
Screening paper 11/500
Screening paper 12/500
Screening paper 13/500
Screening paper 14/500
Screening paper 15/500
Screening paper 16/500
Screening paper 17/500
Screening paper 18/500
Screening paper 19/500
Screening paper 20/500
Screening paper 21/500
Screening paper 22/500
Screening paper 23/500
Screening paper 24/500
Screening paper 25/500
Screening paper 26/500
Screening paper 27/500
Screening paper 28/500
Screening paper 29/500
Screening paper 30/500
Screening paper 31/500
Screening paper 32/500
Screening paper 33/500
Screening paper 34/500
Screening paper 35/500
Screening paper 36/500
Screening paper 37/500
Screening paper 3