In [1]:
import pandas as pd
import numpy as np
import time

from google import genai
from google.genai import types, errors
from pydantic import BaseModel, Field
from typing import overload, Union

# Load the spreadsheet of papers to be reviewed.
not_in_new_df = pd.read_excel("output/papers_not_in_new.xlsx")

# Companies whose papers are kept for classification.
two_banks = ("ING", "HSBC")

# Select those companies' rows and only the columns used downstream,
# in a single label-based lookup.
two_banks_df = not_in_new_df.loc[
    not_in_new_df["company_name"].isin(two_banks),
    ["company_name", "title", "description", "paper_url"],
]

In [2]:
# Structured-output schema for the relevance classifier: it is passed as
# `response_schema` in the generation config, so the model's JSON reply is
# expected to carry exactly one required field, `relevance`.
# NOTE(review): documentation is kept in `#` comments on purpose - pydantic
# normally copies a class docstring into the generated JSON schema, which would
# change what is sent to the model. Confirm before converting this to a docstring.
class ResearchPapersRelevanceOutput(BaseModel):
    # Declared as a plain `str`: the Yes / No / Unsure restriction exists only in
    # the description text below and is not enforced by validation.
    relevance: str = Field(
        ...,
        description="""
         Is this paper about AI? Only use the values Yes, No or Unsure
       """
    )  

# System instruction for the relevance classifier; it is attached to the
# generation config and therefore accompanies every request.
relevance_papers_prompt = """
You are an expert in AI and you will be given a list of research papers. Your task is to determine if the paper is relevant to AI or not. Output a one word answer regarding its relevance: Yes, No or Unsure.
"""

In [3]:
# Identifier of the Gemini model every request is sent to.
model_name = "google/gemini-2.5-flash"

# Per-request user message; `{text}` is replaced with the text to classify.
user_prompt = """
        Here is your input:
        {text}
        """

# Vertex AI client bound to the project and region used by this notebook.
google_genai_client = genai.Client(
    location="europe-west1",
    project="evident-data-dev",
    vertexai=True,
)

# Generation settings: the system prompt, plus JSON output shaped by the
# ResearchPapersRelevanceOutput schema.
model_configuration = types.GenerateContentConfig(
    response_schema=ResearchPapersRelevanceOutput,
    response_mime_type="application/json",
    system_instruction=relevance_papers_prompt,
)

def classify_dataframe(text_df: pd.DataFrame, text_column: str, identifiable_column: str) -> pd.DataFrame:
    """
    Classify every row of a DataFrame with the Gemini model, one request per row.

    Each value of ``text_column`` is substituted into the module-level
    ``user_prompt`` template and sent through ``google_genai_client`` using
    ``model_name`` and ``model_configuration``. The ``relevance`` field of the
    parsed response is recorded for that row.

    Args:
        text_df (pd.DataFrame): rows to classify.
        text_column (str): name of the column holding the text sent to the model.
        identifiable_column (str): name of the column copied into the result so
            each verdict can be traced back to its source row.

    Returns:
        pd.DataFrame: one row per input row with the columns ``index`` (0-based
        position within ``text_df``), ``identifiable_column``, ``body`` (the text
        that was sent) and ``relevance``. ``relevance`` is ``None`` for a row
        whose response carried no parsed object, so a single malformed reply
        does not discard the rows already classified.

    Raises:
        ValueError: if a request fails with ``errors.APIError``; the message holds
            the API error code and message, and the original error is chained as
            ``__cause__``.
    """
    total = len(text_df)
    classification_list = []

    # Walk both columns in lockstep instead of doing two positional lookups per row.
    rows = zip(text_df[identifiable_column], text_df[text_column])
    for position, (identifier, text) in enumerate(rows, start=1):
        # Crude rate limiting: pause before every 100th request.
        if position % 100 == 0:
            print("Waiting 1min 30s")
            time.sleep(90)

        try:
            response = google_genai_client.models.generate_content(
                model=model_name,
                config=model_configuration,
                contents=user_prompt.format(text=text),
            )
        except errors.APIError as e:
            # Build ONE message string; passing several positional arguments to
            # ValueError would render the message as a tuple repr.
            raise ValueError(f"Code:{e.code}\nMessage: {e.message}") from e

        # `parsed` is None when the reply could not be turned into the schema object.
        parsed = response.parsed
        relevance = parsed.relevance if parsed is not None else None

        classification_list.append([position - 1, identifier, text, relevance])

        # Report progress every 50 rows and once more after the final row.
        if position % 50 == 0 or position == total:
            print(f"Progress: {position / total:.2%}")

    return pd.DataFrame(
        classification_list,
        columns=["index", identifiable_column, "body", "relevance"],
    )

In [4]:
# Classify the selected papers by their description, keyed by paper URL.
relevance_df = classify_dataframe(
    text_df=two_banks_df,
    text_column="description",
    identifiable_column="paper_url",
)

# Preview the first few verdicts.
relevance_df.head()



Progress: 100.00%


Unnamed: 0,index,paper_url,body,relevance
0,0,/citations?view_op=view_citation&hl=en&user=5o...,Supersymmetric Yang-Mills quantum mechanics by...,No
1,1,/citations?view_op=view_citation&hl=en&user=6d...,Background In the competitive data driven busi...,No
2,2,/citations?view_op=view_citation&hl=en&user=SE...,With the rapid advancement of artificial intel...,Yes
3,3,/citations?view_op=view_citation&hl=en&user=Yh...,Quantum networks are going to disrupt how we p...,Yes
4,4,/citations?view_op=view_citation&hl=en&user=_U...,• A robust deep learning-based stock price pre...,Yes
