# Creating target categories list

In [25]:
import json

with open('config/scraper_terms.json', 'r') as file:
    taxonomy = json.load(file)

In [26]:
taxonomy.keys()

dict_keys(['AI_Safety_Research_Mappings', 'technicalAiGovernance'])

In [27]:
taxonomy['AI_Safety_Research_Mappings'].keys()

dict_keys(['Mechanistic_Interpretability', 'Scalable_Oversight', 'Adversarial_Robustness', 'Agent_Foundations', 'Alignment_Theory', 'Evaluations_Dangerous_Capabilities', 'Value_Learning_Alignment', 'Cooperative_AI', 'AI_Governance_Policy', 'Compute_Governance'])

In [28]:
taxonomy['technicalAiGovernance'].keys()

dict_keys(['assessment', 'access', 'verification', 'security', 'operationalization', 'ecosystemMonitoring'])

In [29]:
paper_topics = list(taxonomy['AI_Safety_Research_Mappings'].keys())
paper_topics.append('Technical AI Governance')
paper_topics

['Mechanistic_Interpretability',
 'Scalable_Oversight',
 'Adversarial_Robustness',
 'Agent_Foundations',
 'Alignment_Theory',
 'Evaluations_Dangerous_Capabilities',
 'Value_Learning_Alignment',
 'Cooperative_AI',
 'AI_Governance_Policy',
 'Compute_Governance',
 'Technical AI Governance']

# Loading dataset

In [30]:
import pandas as pd

academic_data = pd.read_csv('data/ai_safety_papers.csv')

academic_data.head()

Unnamed: 0,paper_id,title,authors,year,abstract,url,pdf_url,scholar_url,venue,keywords,citations,title_hash,doi,arxiv_id,s2_fields,created_at,updated_at
0,210b0a3d76e93079cc51b03c4115fde545eea966,GPQA: A Graduate-Level Google-Proof Q&A Benchmark,"David Rein, Betty Li Hou, Asa Cooper Stickland...",2023,"We present GPQA, a challenging dataset of 448 ...",https://www.semanticscholar.org/paper/210b0a3d...,https://arxiv.org/pdf/2311.12022.pdf,https://www.semanticscholar.org/paper/210b0a3d...,arXiv.org,"Computer Science, Biology, Physics, Computer S...",1065,d2390e0e97b7199093a42b27a5cf32bc,,2311.12022,"[{'source': 'external', 'category': 'Computer ...",2025-09-30 00:46:58.124675+00:00,2025-09-30 00:46:58.124675+00:00
1,6edd112383ad494f5f2eba72b6f4ffae122ce61f,Interpretability in the Wild: a Circuit for In...,"Kevin Wang, Alexandre Variengien, Arthur Conmy...",2022,Research in mechanistic interpretability seeks...,https://www.semanticscholar.org/paper/6edd1123...,https://arxiv.org/pdf/2211.00593,https://www.semanticscholar.org/paper/6edd1123...,International Conference on Learning Represent...,"Computer Science, Computer Science",644,1ff47a5be9a68e64e23ad2359d220370,10.48550/arXiv.2211.00593,2211.00593,"[{'source': 'external', 'category': 'Computer ...",2025-09-29 20:01:28.298552+00:00,2025-09-29 20:02:03.480569+00:00
2,0893549771094fac547432cb4f84e9605c911a86,The imperative for regulatory oversight of lar...,"B. Meskó, E. Topol",2023,The rapid advancements in artificial intellige...,https://www.semanticscholar.org/paper/08935497...,https://www.nature.com/articles/s41746-023-008...,https://www.semanticscholar.org/paper/08935497...,npj Digit. Medicine,"Computer Science, Medicine, Medicine, Computer...",627,920cc7dbbd6a0bb608e11b65097d69ef,10.1038/s41746-023-00873-0,,"[{'source': 'external', 'category': 'Computer ...",2025-09-30 00:46:58.124675+00:00,2025-09-30 00:46:58.124675+00:00
3,f680d47a51a0e470fcb228bf0110c026535ead1b,Progress measures for grokking via mechanistic...,"Neel Nanda, Lawrence Chan, Tom Lieberum, Jess ...",2023,Neural networks often exhibit emergent behavio...,https://www.semanticscholar.org/paper/f680d47a...,http://arxiv.org/pdf/2301.05217,https://www.semanticscholar.org/paper/f680d47a...,International Conference on Learning Represent...,"Computer Science, Computer Science",517,953089e9556a8e0b37293683f8ff8807,10.48550/arXiv.2301.05217,2301.05217,"[{'source': 'external', 'category': 'Computer ...",2025-09-29 20:01:13.784216+00:00,2025-09-29 20:01:43.521903+00:00
4,eefbd8b384a58f464827b19e30a6920ba976def9,Towards Automated Circuit Discovery for Mechan...,"Arthur Conmy, Augustine N. Mavor-Parker, Aengu...",2023,"Through considerable effort and intuition, sev...",https://www.semanticscholar.org/paper/eefbd8b3...,https://arxiv.org/pdf/2304.14997,https://www.semanticscholar.org/paper/eefbd8b3...,Neural Information Processing Systems,"Computer Science, Computer Science, Engineering",356,a97a69c6234d51eeafeb50c9077b71ba,10.48550/arXiv.2304.14997,2304.14997,"[{'source': 'external', 'category': 'Computer ...",2025-09-29 20:01:14.252982+00:00,2025-09-29 20:01:44.864490+00:00


In [31]:
academic_data.shape

(759, 17)

# Categorization function

In [32]:
#!pip install -q -U google-genai

In [33]:
paper_topics.append('None')

In [34]:
print(paper_topics)

['Mechanistic_Interpretability', 'Scalable_Oversight', 'Adversarial_Robustness', 'Agent_Foundations', 'Alignment_Theory', 'Evaluations_Dangerous_Capabilities', 'Value_Learning_Alignment', 'Cooperative_AI', 'AI_Governance_Policy', 'Compute_Governance', 'Technical AI Governance', 'None']


In [35]:
from pydantic import BaseModel, Field
from typing import Literal

class PaperCategory(BaseModel):
    category: Literal[*paper_topics]= Field(description='The best category fit for AI safety. If no category matches, then None is used.')

In [36]:
PaperCategory.model_json_schema()

{'properties': {'category': {'description': 'The best category fit for AI safety. If no category matches, then None is used.',
   'enum': ['Mechanistic_Interpretability',
    'Scalable_Oversight',
    'Adversarial_Robustness',
    'Agent_Foundations',
    'Alignment_Theory',
    'Evaluations_Dangerous_Capabilities',
    'Value_Learning_Alignment',
    'Cooperative_AI',
    'AI_Governance_Policy',
    'Compute_Governance',
    'Technical AI Governance',
    'None'],
   'title': 'Category',
   'type': 'string'}},
 'required': ['category'],
 'title': 'PaperCategory',
 'type': 'object'}

In [1]:
from google import genai
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
client = genai.Client()

In [39]:
def categorizar_paper(title:str, authors:str, abstract:str, keywords:str) -> str:
    prompt = f"""
    You should assign a single category to the paper given the informantion provided.
    Our goal is to categorize papers related to AI Safety.
    If the paper is not related to AI Safety, asign 'None' to its category.

    # Paper Info

    * title: {title}
    * authors: {authors}
    * abstract: {abstract}
    * keywords: {keywords}

    Categories available: {paper_topics}

    Provide the response as JSON following this Pydantic schema: {PaperCategory.model_json_schema()}

    """
    
    response = client.models.generate_content(
        model = 'gemini-2.5-flash',
        contents = prompt,
            config={
            "response_mime_type": "application/json",
            "response_schema": PaperCategory,
        },
    )

    return response.parsed.category
    

In [40]:
r = categorizar_paper(
    'GPQA: A Graduate-Level Google-Proof Q&A Benchmark',
    'David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, Samuel R. Bowman',
    'We present GPQA, a challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry. We ensure that the questions are high-quality and extremely difficult: experts who have or are pursuing PhDs in the corresponding domains reach 65% accuracy (74% when discounting clear mistakes the experts identified in retrospect), while highly skilled non-expert validators only reach 34% accuracy, despite spending on average over 30 minutes with unrestricted access to the web (i.e., the questions are"Google-proof"). The questions are also difficult for state-of-the-art AI systems, with our strongest GPT-4 based baseline achieving 39% accuracy. If we are to use future AI systems to help us answer very hard questions, for example, when developing new scientific knowledge, we need to develop scalable oversight methods that enable humans to supervise their outputs, which may be difficult even if the supervisors are themselves skilled and knowledgeable. The difficulty of GPQA both for skilled non-experts and frontier AI systems should enable realistic scalable oversight experiments, which we hope can help devise ways for human experts to reliably get truthful information from AI systems that surpass human capabilities.',
    'Computer Science, Biology, Physics, Computer Science, Chemistry'
)

In [41]:
academic_data_categorizada = academic_data.copy()

In [42]:
#academic_data_categorizada['AI Safety Category'] = academic_data.apply(
#    lambda x: categorizar_paper(x['title'], x['authors'], x['abstract'], x['keywords']),
#    axis = 1
#)

In [43]:
academic_data_categorizada

Unnamed: 0,paper_id,title,authors,year,abstract,url,pdf_url,scholar_url,venue,keywords,citations,title_hash,doi,arxiv_id,s2_fields,created_at,updated_at
0,210b0a3d76e93079cc51b03c4115fde545eea966,GPQA: A Graduate-Level Google-Proof Q&A Benchmark,"David Rein, Betty Li Hou, Asa Cooper Stickland...",2023,"We present GPQA, a challenging dataset of 448 ...",https://www.semanticscholar.org/paper/210b0a3d...,https://arxiv.org/pdf/2311.12022.pdf,https://www.semanticscholar.org/paper/210b0a3d...,arXiv.org,"Computer Science, Biology, Physics, Computer S...",1065,d2390e0e97b7199093a42b27a5cf32bc,,2311.12022,"[{'source': 'external', 'category': 'Computer ...",2025-09-30 00:46:58.124675+00:00,2025-09-30 00:46:58.124675+00:00
1,6edd112383ad494f5f2eba72b6f4ffae122ce61f,Interpretability in the Wild: a Circuit for In...,"Kevin Wang, Alexandre Variengien, Arthur Conmy...",2022,Research in mechanistic interpretability seeks...,https://www.semanticscholar.org/paper/6edd1123...,https://arxiv.org/pdf/2211.00593,https://www.semanticscholar.org/paper/6edd1123...,International Conference on Learning Represent...,"Computer Science, Computer Science",644,1ff47a5be9a68e64e23ad2359d220370,10.48550/arXiv.2211.00593,2211.00593,"[{'source': 'external', 'category': 'Computer ...",2025-09-29 20:01:28.298552+00:00,2025-09-29 20:02:03.480569+00:00
2,0893549771094fac547432cb4f84e9605c911a86,The imperative for regulatory oversight of lar...,"B. Meskó, E. Topol",2023,The rapid advancements in artificial intellige...,https://www.semanticscholar.org/paper/08935497...,https://www.nature.com/articles/s41746-023-008...,https://www.semanticscholar.org/paper/08935497...,npj Digit. Medicine,"Computer Science, Medicine, Medicine, Computer...",627,920cc7dbbd6a0bb608e11b65097d69ef,10.1038/s41746-023-00873-0,,"[{'source': 'external', 'category': 'Computer ...",2025-09-30 00:46:58.124675+00:00,2025-09-30 00:46:58.124675+00:00
3,f680d47a51a0e470fcb228bf0110c026535ead1b,Progress measures for grokking via mechanistic...,"Neel Nanda, Lawrence Chan, Tom Lieberum, Jess ...",2023,Neural networks often exhibit emergent behavio...,https://www.semanticscholar.org/paper/f680d47a...,http://arxiv.org/pdf/2301.05217,https://www.semanticscholar.org/paper/f680d47a...,International Conference on Learning Represent...,"Computer Science, Computer Science",517,953089e9556a8e0b37293683f8ff8807,10.48550/arXiv.2301.05217,2301.05217,"[{'source': 'external', 'category': 'Computer ...",2025-09-29 20:01:13.784216+00:00,2025-09-29 20:01:43.521903+00:00
4,eefbd8b384a58f464827b19e30a6920ba976def9,Towards Automated Circuit Discovery for Mechan...,"Arthur Conmy, Augustine N. Mavor-Parker, Aengu...",2023,"Through considerable effort and intuition, sev...",https://www.semanticscholar.org/paper/eefbd8b3...,https://arxiv.org/pdf/2304.14997,https://www.semanticscholar.org/paper/eefbd8b3...,Neural Information Processing Systems,"Computer Science, Computer Science, Engineering",356,a97a69c6234d51eeafeb50c9077b71ba,10.48550/arXiv.2304.14997,2304.14997,"[{'source': 'external', 'category': 'Computer ...",2025-09-29 20:01:14.252982+00:00,2025-09-29 20:01:44.864490+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
754,f1ecc468bc42de25ccd71dc84a6b7a8dafab6ed2,Mechanistic Interpretability of GPT-like Model...,Anurag Mishra,2025,Mechanistic interpretability research seeks to...,https://www.semanticscholar.org/paper/f1ecc468...,https://arxiv.org/pdf/2505.17073.pdf,https://www.semanticscholar.org/paper/f1ecc468...,arXiv.org,"Computer Science, Computer Science",0,3771d70ffd60f9c3f692d0e8f989f74d,10.48550/arXiv.2505.17073,2505.17073,"[{'source': 'external', 'category': 'Computer ...",2025-09-29 20:01:25.786419+00:00,2025-09-29 20:01:55.195754+00:00
755,663292eaef24c22c0692f1b4a9120d24662d7fc7,Causal Intervention Framework for Variational ...,Dip Roy,2025,Mechanistic interpretability of deep learning ...,https://www.semanticscholar.org/paper/663292ea...,https://arxiv.org/pdf/2505.03530.pdf,https://www.semanticscholar.org/paper/663292ea...,arXiv.org,"Computer Science, Computer Science",0,2f8d578153eefbc0b11361f9e71a0194,10.48550/arXiv.2505.03530,2505.03530,"[{'source': 'external', 'category': 'Computer ...",2025-09-29 20:01:25.332786+00:00,2025-09-29 20:01:57.450585+00:00
756,3a910119666673ce6d77894055fd356f600ca5e4,Mechanistic Interpretability in the Presence o...,"Marcos Florencio, Thomas Barton",2025,"Architectural obfuscation - e.g., permuting hi...",https://www.semanticscholar.org/paper/3a910119...,https://arxiv.org/pdf/2506.18053.pdf,https://www.semanticscholar.org/paper/3a910119...,arXiv.org,"Computer Science, Computer Science",0,89c5f7d7ddb26766a6eff262d5e0aa34,10.48550/arXiv.2506.18053,2506.18053,"[{'source': 'external', 'category': 'Computer ...",2025-09-29 20:01:24.877895+00:00,2025-09-29 20:01:55.416994+00:00
757,49072764553763f1686121fd03e3dadda259f273,Mechanistic Interpretability of Binary and Ter...,Jason Li,2024,"Recent research (arXiv:2310.11453, arXiv:2402....",https://www.semanticscholar.org/paper/49072764...,https://arxiv.org/pdf/2405.17703.pdf,https://www.semanticscholar.org/paper/49072764...,arXiv.org,"Computer Science, Computer Science",0,81612dabd9dc5de68fc08c32d1ed9a14,10.48550/arXiv.2405.17703,2405.17703,"[{'source': 'external', 'category': 'Computer ...",2025-09-29 20:01:19.685040+00:00,2025-09-29 20:01:48.957507+00:00


# Creating a batch approach

In [44]:
for index, row in academic_data.iterrows():
    print(row['title'])
    break

GPQA: A Graduate-Level Google-Proof Q&A Benchmark


In [45]:
requests = []

for index, row in academic_data.iterrows():
    single_request = {}

    single_request['contents']  = [{
        'parts': [{
            'text': f"""
                # Paper Info

                * title: {row['title']}
                * authors: {row['authors']}
                * abstract: {row['abstract']}
                * keywords: {row['keywords']}
                """
        }]
    }]

    single_request['config'] = {
        'system_instruction': {'parts': [{
            'text': """
                You should assign a single category to the paper given the informantion provided.
                Our goal is to categorize papers related to AI Safety.
                If the paper is not related to AI Safety, asign 'None' to its category.

                Categories available: {paper_topics}

                Provide the response as JSON following this Pydantic schema: {PaperCategory.model_json_schema()}
            """
        }]},
        'response_mime_type': 'application/json',
        'response_schema': PaperCategory
    }
    
    requests.append(single_request)

In [46]:
print(requests[0])

{'contents': [{'parts': [{'text': '\n                # Paper Info\n\n                * title: GPQA: A Graduate-Level Google-Proof Q&A Benchmark\n                * authors: David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, Samuel R. Bowman\n                * abstract: We present GPQA, a challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry. We ensure that the questions are high-quality and extremely difficult: experts who have or are pursuing PhDs in the corresponding domains reach 65% accuracy (74% when discounting clear mistakes the experts identified in retrospect), while highly skilled non-expert validators only reach 34% accuracy, despite spending on average over 30 minutes with unrestricted access to the web (i.e., the questions are"Google-proof"). The questions are also difficult for state-of-the-art AI systems, with our strongest GPT-4 based baseline achi

In [47]:
import time

name='batches/6vazui0un6lzsfyhlrg1qf3ipvzpfry4k04n' display_name='structured-output-job-1' state=<JobState.JOB_STATE_PENDING: 'JOB_STATE_PENDING'> error=None create_time=datetime.datetime(2025, 10, 13, 12, 23, 33, 822312, tzinfo=TzInfo(0)) start_time=None end_time=None update_time=datetime.datetime(2025, 10, 13, 12, 23, 33, 822312, tzinfo=TzInfo(0)) model='models/gemini-2.5-flash' src=None dest=None
name='batches/f1mikco318ugvyhiiec01ssbppc1f6dmb8vg' display_name='structured-output-job-1' state=<JobState.JOB_STATE_PENDING: 'JOB_STATE_PENDING'> error=None create_time=datetime.datetime(2025, 10, 13, 9, 55, 49, 929575, tzinfo=TzInfo(0)) start_time=None end_time=None update_time=datetime.datetime(2025, 10, 13, 9, 55, 49, 929575, tzinfo=TzInfo(0)) model='models/gemini-2.5-flash' src=None dest=None
name='batches/qrwe9llbxk3hex83alggi4xjiwchfk2pj98x' display_name='structured-output-job-1' state=<JobState.JOB_STATE_PENDING: 'JOB_STATE_PENDING'> error=None create_time=datetime.datetime(2025, 10

In [7]:
client.batches.get(name="batches/structured-output-job-1")

ClientError: 400 INVALID_ARGUMENT. {'error': {'code': 400, 'message': 'Could not parse the batch name', 'status': 'INVALID_ARGUMENT'}}

In [None]:
inline_batch_job = client.batches.create(
    model="models/gemini-2.5-flash",
    src=requests,
    config={
        'display_name': "structured-output-job-1"
    },
)



Polling status for job: batches/6vazui0un6lzsfyhlrg1qf3ipvzpfry4k04n
Job not finished. Current state: JOB_STATE_PENDING. Waiting 30 seconds...


KeyboardInterrupt: 

In [3]:
# wait for the job to finish
job_name = inline_batch_job.name
print(f"Polling status for job: {job_name}")

while True:
    batch_job_inline = client.batches.get(name=job_name)
    if batch_job_inline.state.name in ('JOB_STATE_SUCCEEDED', 'JOB_STATE_FAILED', 'JOB_STATE_CANCELLED', 'JOB_STATE_EXPIRED'):
        break
    print(f"Job not finished. Current state: {batch_job_inline.state.name}. Waiting 30 seconds...")
    time.sleep(30)

print(f"Job finished with state: {batch_job_inline.state.name}")

# print the response
for i, inline_response in enumerate(batch_job_inline.dest.inlined_responses, start=1):
    print(f"\n--- Response {i} ---")

    # Check for a successful response
    if inline_response.response:
        # The .text property is a shortcut to the generated text.
        print(inline_response.response.text)

NameError: name 'inline_batch_job' is not defined

In [16]:
for job in client.batches.list():
    print(job)

name='batches/6vazui0un6lzsfyhlrg1qf3ipvzpfry4k04n' display_name='structured-output-job-1' state=<JobState.JOB_STATE_SUCCEEDED: 'JOB_STATE_SUCCEEDED'> error=None create_time=datetime.datetime(2025, 10, 13, 12, 23, 33, 822312, tzinfo=TzInfo(0)) start_time=None end_time=datetime.datetime(2025, 10, 14, 23, 37, 27, 675244, tzinfo=TzInfo(0)) update_time=datetime.datetime(2025, 10, 14, 23, 37, 27, 675244, tzinfo=TzInfo(0)) model='models/gemini-2.5-flash' src=None dest=None
name='batches/f1mikco318ugvyhiiec01ssbppc1f6dmb8vg' display_name='structured-output-job-1' state=<JobState.JOB_STATE_SUCCEEDED: 'JOB_STATE_SUCCEEDED'> error=None create_time=datetime.datetime(2025, 10, 13, 9, 55, 49, 929575, tzinfo=TzInfo(0)) start_time=None end_time=datetime.datetime(2025, 10, 14, 23, 14, 19, 899276, tzinfo=TzInfo(0)) update_time=datetime.datetime(2025, 10, 14, 23, 14, 19, 899276, tzinfo=TzInfo(0)) model='models/gemini-2.5-flash' src=None dest=None
name='batches/qrwe9llbxk3hex83alggi4xjiwchfk2pj98x' displ