In [4]:
# Structured-output schema for one classified research paper. It is passed as
# `response_schema` to the Gemini request configuration later in this file.
#
# - Every field is required (`...`) and typed as plain `str`, so the option
#   lists in the descriptions are instructions only: validation does not
#   reject a label outside the list.
# - The `description` strings are runtime text; presumably they are forwarded
#   to the model as part of the schema (TODO confirm), so edit them with the
#   same care as a prompt.
# - NOTE(review): this is a `#` comment rather than a class docstring on
#   purpose. Pydantic copies a model's docstring into its JSON schema
#   description, which would change the schema sent to the model.
class ResearchPapersMetadataOutput(BaseModel):
    # Free-text, one-line technical summary of the paper; the description
    # carries four example summaries showing the desired tone.
    summary: str = Field(
        ...,
        description="""
        Summarize the research paper in a concise, yet technical manner. Always include relevant details about the technical implementation (if available). 

        Please return the summary as short one line description of the paper.
        
        Examples of summaries to illustrate how I want them to look like:
        - The paper critiques traditional industry approaches and offers a refined, audience-specific definition of explainability. Its influence lies in the structured visual taxonomies and conceptual frameworks it provides across model types - making it a foundational reference for ongoing XAI research.

        - The paper tracks the evolution of face recognition, from shallow to deep methods, identifying key technical advancements through network architectures, loss functions and data processing/augmentation techniques.

        - The paper surveys the shift from traditional face recognition methods to deep learning approaches that achieve state-of-the-art performance across complex scenarios. Its influence stems from a systematic categorization of network architectures and a comprehensive taxonomy.

        - BLOOM is an open-access response to proprietary, English-centric language models developed by a small pool of tech companies. The model's training data spans academic texts, local news sources, community contributions, and low-resource languages.
       """
    )  
    # Single label: the paper's core function or intent.
    paper_type: str = Field(
        ...,
        description="""
        What is the paper's core function or intent?
        Select one of the following:
        - Technical Innovation/Methodological Proposal
        - Benchmark/Empirical Evaluation
        - Literature Review
        - Survey/Taxonomy
        - Conceptual/Theoretical Discussion
        - Engineering/Systems Description
        - Case Study/Application Report
        - Dataset Release
        - Tool/Library Introduction
        - Other
        """
    )
    # Single label: the domain the research is applied to. The description
    # also carries disambiguation notes (NLP vs. multilingual NLP, agents vs.
    # code generation).
    application_domain: str = Field(
        ...,
        description="""
        What domain or field is the research applied to?
        Note: Use "Multilingual/Cross-Lingual NLP" when the paper's primary focus is on models, datasets, or evaluation across multiple languages, or transfer between languages.
        If the paper addresses language tasks in a single language context (e.g., English sentiment classification), select "NLP" instead.
        Note: If the paper focuses on the autonomous coordination of tasks, tools, or decisions by a goal-directed system, classify it under AI Agents/Agent-Based Systems.
        If it focuses on generating, repairing, or synthesizing code, classify it under Code Generation/Software Engineering in Application Domain or Code Generation in Research Theme if it's the technical focus.
         
        Select one of the following:
        - Natural Language Processing (NLP)
        - Multilingual/Cross-Lingual NLP
        - Computer Vision
        - Time Series/Forecasting
        - Multimodal (e.g., vision-language)
        - Robotics/Control
        - Recommender Systems
        - Financial Modelling/Risk Analysis
        - Scientific Discovery
        - Healthcare/Biomedicine
        - Knowledge Management/Retrieval
        - Document Processing
        - Automation/Workflow Optimization
        - Social/Ethical Implications
        - General-Purpose/Domain-Agnostic
        - Code Generation/Software Engineering
        - AI Agents/Agent-Based Systems
        - Other
        """
    )
    # Single label: the machine learning paradigm or model class used.
    method: str = Field(
        ...,
        description="""
        Which machine learning paradigm or model class is used?

        Select one of the following:
        - Transformer-based Models
        - Graph Neural Networks
        - Convolutional Neural Networks
        - Recurrent Neural Networks
        - Probabilistic Models/Bayesian Inference
        - Evolutionary Algorithms/Heuristics
        - Reinforcement Learning (RL)
        - RLHF (Reinforcement Learning from Human Feedback)
        - Generative Models (e.g., GANs, VAEs, Diffusion)
        - Symbolic/Logic-based Systems
        - Retrieval-Augmented Generation (RAG)
        - Foundation Models/Large Language Models
        - Unsupervised Learning (e.g., clustering, dimensionality reduction)
        - Supervised Learning (e.g., classification, regression)
        - Other
        """
    )
    # Single label: the primary aim of the research effort.
    purpose: str = Field(
        ...,
        description="""
        What is the primary aim of the paper's research effort?

        Select one of the following:
        - Model Benchmarking/Performance Comparison
        - Scaling Law Analysis
        - Training Efficiency/Optimization
        - Model Capabilities/Theory-Led Innovation
        - Interpretability/Explainability
        - Fairness/Bias Analysis
        - Robustness/Adversarial Evaluation
        - Human-AI Interaction
        - Model Risk/Governance
        - Prompt Engineering/Instruction Tuning
        - Other
        """
    )
    # Single label: the part of the AI pipeline the paper focuses on.
    model_stage_focus: str = Field(
        ...,
        description="""
        What part of the AI pipeline does the paper focus on?

        Select one of the following:
        - Model Architecture/Design
        - Training Algorithms/Objectives
        - Fine-Tuning/Adaptation
        - Inference Optimization
        - Evaluation/Metrics
        - Deployment/Integration
        - Other

        """
    )
    # Single label: the main technical theme. Unlike the other categorical
    # fields, this list offers no "Other" option.
    research_main_theme: str = Field(
        ...,
        description="""
        What is the paper's most relevant technical theme?
        
        Select one of the following:
        - Model Architecture Based Efficiency
        - Training Strategies
        - Novel Model Architectures
        - Evaluation & Robustness
        - Infrastructure & Deployment
        - Governance & Policy
        - Reasoning & Emerging Behavior
        """
    )
    # Sub-theme, chosen from the group listed under the selected main theme,
    # or "Other (explain)".
    # NOTE(review): the line "Note: Use the Other (explain)" in the description
    # looks truncated - confirm the intended wording.
    # NOTE(review): under "Infrastructure & Deployment", the entries from
    # "Data-Centric Research" onwards look like a separate data-focused group
    # whose heading was flattened into a bullet - confirm.
    research_sub_theme: str = Field(
        ...,
        description="""
        What is the paper's most relevant sub-technical theme based on the main theme selected?

        If no sub-theme fits, use Other (explain)
        Note: Use the Other (explain)
        
        Select one of the sub-themes under the main categories below or other:
        Model Architecture Based Efficiency:
        - Sparse Attention/Efficient Attention
        - Low-Rank Adaptation (LoRA)/Weight-Decomposed Low-Rank Adaptation (DoRA)
        - Quantization/Pruning
        - Mixture of Experts (MoE)
        - Neural Architecture Search (NAS)
        
        
        Training Strategies:
        - Instruction Tuning/Preference Modelling
        - Policy Gradient Methods
        - Contrastive Learning
        - Self-Supervised Learning
        - Meta-Learning
        - Knowledge Distillation
        - Curriculum Learning
        - Few-Shot/Zero-Shot Learning
        - Regularizers/Schedulers
        
        
        Novel Model Architectures:
        - Vision Transformers
        - Mamba
        - ResNet
        - Perceiver IO
        - Diffusion Transformers
        - Gated CNNs
        
        
        Evaluation & Robustness:
        - Explainability/Interpretability
        - Adversarial Robustness
        - Calibration/Uncertainty Quantification
        - Algorithmic Fairness
        - Evaluation Frameworks/Benchmarks
        
        
        Infrastructure & Deployment:
        - Compiler Optimizations (e.g., XLA, TVM)
        - Hardware-Aware Training/Inference
        - Federated Learning Systems
        - Edge/On-Device Deployment
        - Model Compression
        - Data-Centric Research
        - Synthetic Data Generation
        - Data Augmentation Strategies
        - Label Noise/Quality Analysis
        - Active Learning/Human-in-the-Loop
        - Semi-Supervised Data Strategies
        
        
        Governance & Policy:
        - Responsible AI Governance
        - AI Alignment
        - Catastrophic AI Risk
        - Auditing/Red Teaming
        - Regulatory Compliance/Standardization
        
        
        Reasoning & Emerging Behavior:
        - Multimodal Reasoning
        - Chain-of-Thought Reasoning
        - Emergent Behaviors in LLMs
        - Tool Use/Programmatic Reasoning
        - Long-Horizon Planning
        """
    )


# System instruction for the classifier model; it is passed as
# `system_instruction` in the request configuration later in this file.
# The text is runtime prompt content, so whitespace and wording matter.
#
# NOTE(review): under "Classification Categories" only "Paper Type" is spelled
#   out; the other categories appear only in the ResearchPapersMetadataOutput
#   field descriptions - confirm this is intentional and not a truncation.
# NOTE(review): the "Research Theme" instruction asks for a single
#   "Main Theme → Sub-theme" value, while the response schema defines separate
#   `research_main_theme` and `research_sub_theme` fields - confirm which
#   format is wanted.
# NOTE(review): "financial services.You" is missing a space after the period.
research_papers_prompt = """
You are an expert research assistant tasked with classifying AI research papers. These papers are written by authors affiliated with financial services, but you must not assume that their subject matter is related to financial services.You are classifying the core research contribution, not its institutional origin or setting.
Your goal is to assign one label per category based on the technical content of each paper, using the predefined taxonomy below. Please return your output in a JSON format. 

General Instructions:
- Assign only one label per category.
- Do not use "Other" unless the paper clearly falls outside all listed categories or spans multiple equally without a dominant focus. However, do not 'force fit' the paper into a category if it doesn't meaningfully apply.
- If multiple labels seem appropriate, choose the one most emphasized in the abstract or framed as the paper's novel contribution.
- 'Scene setting' or background context does not necessarily indicate the main technical focus.
- If you select "Other" in any category, include a brief rationale. Avoid using "Other" in more than one category unless absolutely necessary - and justify each use separately.
- Consider cross-category dependencies: e.g., if the paper introduces a novel architecture like ResNet, the method should likely correspond to Convolutional Neural Networks.
- Some foundational or general-purpose papers may not map neatly to a single domain. In those cases, choose General-Purpose/Domain-Agnostic under Application Domain.
- For Research Theme, return both the Main Theme and the Sub-theme in the format:
 Main Theme → Sub-theme
 If no sub-theme fits, write:
 Main Theme → Other (explain)

Summary:

Please return the summary as short one line description of the paper.
        
        Examples of summaries to illustrate how I want them to look like:
        - The paper critiques traditional industry approaches and offers a refined, audience-specific definition of explainability. Its influence lies in the structured visual taxonomies and conceptual frameworks it provides across model types - making it a foundational reference for ongoing XAI research.

        - The paper tracks the evolution of face recognition, from shallow to deep methods, identifying key technical advancements through network architectures, loss functions and data processing/augmentation techniques.

        - The paper surveys the shift from traditional face recognition methods to deep learning approaches that achieve state-of-the-art performance across complex scenarios. Its influence stems from a systematic categorization of network architectures and a comprehensive taxonomy.

        - BLOOM is an open-access response to proprietary, English-centric language models developed by a small pool of tech companies. The model's training data spans academic texts, local news sources, community contributions, and low-resource languages.

Classification Categories:

Paper Type:
What is the paper's core function or intent?
- Technical Innovation/Methodological Proposal
- Benchmark/Empirical Evaluation
- Literature Review
- Survey/Taxonomy
- Conceptual/Theoretical Discussion
- Engineering/Systems Description
- Case Study/Application Report
- Dataset Release
- Tool/Library Introduction
- Other
"""

In [5]:
import time
import pandas as pd

from google import genai
from google.genai import types, errors
from pydantic import BaseModel, Field
from typing import overload, Union


# Identifier of the Gemini model that every request is sent to.
model_name = "google/gemini-2.5-flash"

# Shared Gen AI client, set up for Vertex AI with a fixed project and region.
google_genai_client = genai.Client(vertexai=True, project="evident-data-dev", location="europe-west1")

# Request settings reused for each call: the classification prompt is the
# system instruction, and the reply is requested as JSON matching the
# ResearchPapersMetadataOutput schema.
model_configuration = types.GenerateContentConfig(
    response_schema=ResearchPapersMetadataOutput,
    response_mime_type="application/json",
    system_instruction=research_papers_prompt,
)

def classify_dataframe(text_df: pd.DataFrame, text_column: str, identifiable_column: str) -> pd.DataFrame:
    """Classify every row of ``text_df`` with the configured Gemini model.

    Each value of ``text_column`` is substituted into the ``user_prompt``
    template (as ``{text}``) and sent to the model. The structured response
    is flattened into one output row per input row.

    Args:
        text_df: Input rows, one document per row.
        text_column: Name of the column holding the text to classify.
        identifiable_column: Name of the column identifying each row; its
            values are copied to the output under the same column name.

    Returns:
        A DataFrame with the columns ``index`` (0-based row position),
        ``identifiable_column``, ``body`` (the input text) and one column per
        metadata field read from the parsed response.

    Raises:
        ValueError: If the API call fails (chained from the original
            ``APIError``), or if the response could not be parsed into the
            response schema.
    """
    # Attributes read from the parsed response; they double as output column
    # names so the row values and the header cannot drift apart.
    metadata_fields = (
        "summary",
        "paper_type",
        "application_domain",
        "method",
        "purpose",
        "model_stage_focus",
        "research_main_theme",
        "research_sub_theme",
    )

    total = len(text_df)
    rows = []
    documents = zip(text_df[identifiable_column], text_df[text_column])
    for position, (identifier, body) in enumerate(documents, start=1):
        # Pause before every 100th request - presumably to stay under an API
        # rate limit (TODO confirm the quota this is tuned for).
        if position % 100 == 0:
            print("Waiting 1min 30s")
            time.sleep(90)

        try:
            # `user_prompt` is not defined in the visible part of this file;
            # it is expected to be a template with a `{text}` placeholder.
            response = google_genai_client.models.generate_content(
                model=model_name,
                config=model_configuration,
                contents=user_prompt.format(text=body),
            )
        except errors.APIError as e:
            # One formatted message (not three separate args) and explicit
            # chaining so the original API error stays in the traceback.
            raise ValueError(f"Code: {e.code}\nMessage: {e.message}") from e

        parsed = response.parsed
        if parsed is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                f"Response for row {position - 1} ({identifier!r}) could not be "
                "parsed into the response schema"
            )

        rows.append(
            [position - 1, identifier, body]
            + [getattr(parsed, field_name) for field_name in metadata_fields]
        )

        if position % 50 == 0 or position == total:
            print(f"Progress: {position / total:.2%}")

    return pd.DataFrame(
        rows,
        columns=["index", identifiable_column, "body", *metadata_fields],
    )