Check the available models (the listing below is filtered to the `gemini-2.5` family)

In [8]:
import os
from dotenv import load_dotenv
from google import genai

# Read the API key from the environment (load_dotenv() is called first so a
# local .env file, if any, can supply it) and fail fast when it is missing.
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY is not set.")

client = genai.Client(api_key=GOOGLE_API_KEY)

# Print "<model id> | <display name>" for every model in the chosen family.
for model in client.models.list():
    model_id = model.name.replace("models/", "")
    # Swap the prefix for 'gemma' to list the Gemma family instead.
    if not model_id.startswith('gemini-2.5'):
        continue
    display_name = getattr(model, 'display_name', 'Unknown')
    print(f"{model_id:<50} | {display_name}")

gemini-2.5-flash                                   | Gemini 2.5 Flash
gemini-2.5-pro                                     | Gemini 2.5 Pro
gemini-2.5-flash-preview-tts                       | Gemini 2.5 Flash Preview TTS
gemini-2.5-pro-preview-tts                         | Gemini 2.5 Pro Preview TTS
gemini-2.5-flash-lite                              | Gemini 2.5 Flash-Lite
gemini-2.5-flash-image                             | Nano Banana
gemini-2.5-flash-preview-09-2025                   | Gemini 2.5 Flash Preview Sep 2025
gemini-2.5-flash-lite-preview-09-2025              | Gemini 2.5 Flash-Lite Preview Sep 2025
gemini-2.5-computer-use-preview-10-2025            | Gemini 2.5 Computer Use Preview 10-2025
gemini-2.5-flash-native-audio-latest               | Gemini 2.5 Flash Native Audio Latest
gemini-2.5-flash-native-audio-preview-09-2025      | Gemini 2.5 Flash Native Audio Preview 09-2025
gemini-2.5-flash-native-audio-preview-12-2025      | Gemini 2.5 Flash Native Audio Preview 12-2025


This code checks the keyword/attribute extraction results. Specifically, for each extracted field it reports how often the value is the schema's last (catch-all) option, such as "Other" or "Not Applicable".

In [None]:
import pandas as pd 
import numpy as np
import ast
import re
import json

def parse_schema_value(val):
    """Parse one raw ``schema`` cell into a dict.

    The cell is expected to hold a JSON object (or a Python dict literal),
    optionally wrapped in a markdown code fence (three backticks, with or
    without a ``json`` tag).

    Args:
        val: Raw cell value: NaN/None, a string, or an already-parsed dict.

    Returns:
        dict: The parsed mapping. ``{}`` is returned when the value is
        missing, cannot be parsed, or parses to something that is not a dict
        (so every row can safely be fed to ``pd.DataFrame``).
    """
    # An already-parsed mapping needs no string round-trip.
    if isinstance(val, dict):
        return val
    # Only test scalars for missingness: pd.isna() on a list-like returns an
    # array whose truth value is ambiguous.
    if val is None or (not isinstance(val, (list, tuple, set)) and pd.isna(val)):
        return {}

    text = str(val).strip()
    # Strip a surrounding markdown code fence, if present.
    if text.startswith('```'):
        text = re.sub(r'^```(?:json)?\s*', '', text)
        text = re.sub(r'\s*```$', '', text)
    text = text.strip()

    try:
        parsed = json.loads(text)
    except (json.JSONDecodeError, RecursionError):
        # Fall back to Python literal syntax (e.g. single-quoted dicts).
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Only parsing failures are treated as "no data"; anything else
            # (e.g. KeyboardInterrupt) is allowed to propagate.
            return {}

    # Lists, numbers, strings etc. are not usable as a schema record.
    return parsed if isinstance(parsed, dict) else {}

def _split_schema_options(options_str):
    """Split an option string on ``|``, ignoring ``|`` inside ``(...)`` or ``*...*``."""
    options = []
    current = []
    paren_depth = 0
    in_emphasis = False  # True while between a pair of '*' markers
    for char in options_str:
        if char == '|' and paren_depth == 0 and not in_emphasis:
            piece = ''.join(current).strip()
            if piece:
                options.append(piece)
            current = []
            continue
        if char == '(':
            paren_depth += 1
        elif char == ')':
            # Clamp at zero so a stray ')' cannot go negative and disable
            # splitting for the remainder of the string.
            paren_depth = max(paren_depth - 1, 0)
        elif char == '*':
            in_emphasis = not in_emphasis
        current.append(char)
    tail = ''.join(current).strip()
    if tail:
        options.append(tail)
    return options


def parse_schema_file(schema_path):
    """Parse a schema file and return each field's last listed option.

    Two line formats are recognised:
      1. ``"**field_name**": "<option1|option2|...|last_option>"``
      2. ``"field_description": "<option1|option2|...|last_option>"``
    Format 2 is only tried when format 1 matches nothing in the file.

    Args:
        schema_path: Path of the schema text file (read as UTF-8).

    Returns:
        dict: Maps field name to the last option in its ``<...>`` list.
        Fields whose option list is empty are omitted.
    """
    # Explicit encoding: the schema may contain non-ASCII text and must not
    # depend on the platform's default encoding.
    with open(schema_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Format 1 (field name wrapped in **).
    matches = re.findall(r'"?\*\*([^*]+)\*\*"?\s*:\s*"<([^>]+)>"', content)
    # Format 2 (plain quoted field name) as a fallback.
    if not matches:
        matches = re.findall(r'"([^"]+)"\s*:\s*"<([^>]+)>"', content)

    field_last_options = {}
    for field_name, options_str in matches:
        options = _split_schema_options(options_str)
        if options:
            field_last_options[field_name] = options[-1]
    return field_last_options

def _normalize_field_name(name):
    """Lower-case a field/column label and turn underscores into spaces."""
    # str() guards against non-string column labels (e.g. integers).
    return str(name).lower().replace('_', ' ')


def _find_schema_field(col, schema_last_options):
    """Return ``(field, last_option)`` for a column, or ``(None, None)`` if no field matches."""
    col_key = _normalize_field_name(col)
    if not col_key:
        # An empty name is a substring of everything; never match on it.
        return None, None
    candidates = [
        (_normalize_field_name(field), field, last_opt)
        for field, last_opt in schema_last_options.items()
    ]
    # Exact matches win over substring matches, regardless of schema order.
    for key, field, last_opt in candidates:
        if key == col_key:
            return field, last_opt
    for key, field, last_opt in candidates:
        if key and (col_key in key or key in col_key):
            return field, last_opt
    return None, None


def _mentions_catch_all(value):
    """True when a string value contains 'other' or 'not applicable' (case-insensitive)."""
    if not isinstance(value, str):
        return False
    lowered = value.lower()
    return 'other' in lowered or 'not applicable' in lowered


def analyze_schema_stats(csv_filename, schema_filename,
                         data_dir='data/biorxiv', schema_dir='annotation/schema'):
    """Report how often each extracted field takes its schema's catch-all option.

    The CSV's ``schema`` column is parsed row by row with
    ``parse_schema_value`` and expanded into one column per extracted field.
    Each column is matched to a schema field (exact name first, then
    substring). When that field's last option contains "other" or
    "applicable", the number and percentage of matching rows are printed.

    Rows whose ``schema`` cell cannot be parsed still count towards the
    denominator.

    Args:
        csv_filename: CSV file name, looked up under ``data_dir``.
        schema_filename: Schema file name, looked up under ``schema_dir``.
        data_dir: Directory holding the CSV (default ``data/biorxiv``).
        schema_dir: Directory holding the schema (default ``annotation/schema``).

    Raises:
        KeyError: If the CSV has no ``schema`` column.
    """
    # Load the data.
    total_df = pd.read_csv(f'{data_dir}/{csv_filename}')
    if 'schema' not in total_df.columns:
        raise KeyError(f"'schema' column not found in {data_dir}/{csv_filename}")

    # Load the schema.
    schema_last_options = parse_schema_file(f'{schema_dir}/{schema_filename}')
    print("Schema字段及其最后一个选项:")
    for field, last_opt in schema_last_options.items():
        print(f"  {field}: {last_opt}")
    print()

    # Expand the schema column into one column per extracted field.
    schema_df = pd.DataFrame(total_df['schema'].apply(parse_schema_value).tolist())

    # Per-column statistics.
    print("统计结果:")
    print("-" * 60)
    total_rows = len(schema_df)

    for col in schema_df.columns:
        _, matched_last_option = _find_schema_field(col, schema_last_options)

        print(f"{col}:")
        if not matched_last_option:
            print("  (未在schema中找到匹配字段)")
            print()
            continue

        print(f"  最后选项: {matched_last_option}")
        last_opt_lower = matched_last_option.lower()
        if 'other' in last_opt_lower or 'applicable' in last_opt_lower:
            # Prefer exact equality with the schema option; if nothing matches
            # exactly, fall back to a keyword search. Non-string cells
            # (NaN, bool, numbers, ...) never count as a match.
            count = (schema_df[col] == matched_last_option).sum()
            if count == 0:
                count = schema_df[col].map(_mentions_catch_all).sum()
            percentage = count / total_rows * 100
            print(f"  匹配数量: {count} / {total_rows} ({percentage:.2f}%)")
        else:
            print("  (跳过统计 - 不包含'other'或'applicable')")
        print()


In [11]:
# Catch-all option statistics for the "noexample" schema run (per the file names).
analyze_schema_stats('biorxiv_schema_noexample_valid.csv', 'schema_noexample_test.txt')

Schema字段及其最后一个选项:
  primary_biological_domain: Other
  experimental_approach_focus: Other
  key_molecular_target_class: Not Applicable (e.g.|purely behavioral study)
  level_of_abstraction_(system_scale): Other
  research_type/contribution_category: Data Set/Resource Release
  level_of_technical_formality_(text_complexity_proxy): Mixed/Introductory Focus
  disease_relevance_flag: Other Human Condition (e.g.|Aging|Neurological Disorder)
  bioinformatics_tool_mentioned_(conditional): Custom Tool/Code Release

统计结果:
------------------------------------------------------------
primary_biological_domain:
  最后选项: Other
  匹配数量: 167 / 3000 (5.57%)

experimental_approach_focus:
  最后选项: Other
  匹配数量: 109 / 3000 (3.63%)

key_molecular_target_class:
  最后选项: Not Applicable (e.g.|purely behavioral study)
  匹配数量: 901 / 3000 (30.03%)

level_of_abstraction_(system_scale):
  最后选项: Other
  匹配数量: 10 / 3000 (0.33%)

research_type/contribution_category:
  最后选项: Data Set/Resource Release
  (跳过统计 - 不包含'other'

In [12]:
# Catch-all option statistics for the "example50" schema run (per the file names).
analyze_schema_stats('biorxiv_schema_example50_valid.csv', 'schema_example50_test.txt')

Schema字段及其最后一个选项:
  captures_the_broad_biological_sub-field_addressed_by_the_paper.: Other.
  identifies_the_primary_organism_or_system_used_for_the_study.: *Other Organism*.
  categorizes_the_primary_type_of_methodology_employed.: Other Experimental Methods.
  distinguishes_between_papers_focusing_on_*how*_a_process_works_(mechanistic_depth)_versus_papers_focused_on_*mapping*_a_system_or_phenotype.: Tool/Method Development.
  a_general_feature_assessing_the_writing_style_and_structure,_often_indicative_of_the_paper's_type_(e.g.,_methods_vs._hypothesis-driven).: Other.
  focuses_on_the_level_of_resolution_where_the_main_discovery_occurs.: Population/Ecosystem Level.
  separates_studies_performed_in_a_living_organism_from_those_performed_in_controlled,_simplified_systems.: Computational/Theoretical Only.
  a_feature_specifically_targeting_crucial_cross-cutting_themes_often_present_in_biorxiv,_particularly_relating_to_disease_or_environment.: Not Applicable.

统计结果:
----------------------