In [1]:
import pandas as pd
import sys
import os
import json
from tqdm import tqdm
sys.path.append('/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience')
import asyncio
from call_api import call_gemini, async_call_gemini
from tqdm.asyncio import tqdm_asyncio

# Field being extracted. Used as the CSV file stem below and, later, by
# request_and_store_async when it builds its output directory and file name.
field_name = "STEM-OPT"
# NOTE(review): absolute, machine-specific path — the notebook will not run on
# another machine without editing this line.
field_path = f"/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_csv/{field_name}.csv"
field_df = pd.read_csv(field_path)

# Display (rows, columns) of the loaded table as the cell output.
field_df.shape

(351, 9)

In [5]:
# Prompt sent to Gemini for every program row. It is filled with str.format(),
# so it must contain exactly these placeholders and no other curly braces:
#   {university}, {department}, {degree}, {program},
#   {admissions_url}, {program_url}
# The model is asked to answer with one of two literal strings. The format
# instruction tells it to copy the option verbatim (the earlier wording asked
# for "lowercase", which contradicted the capitalised second option).
prompt_template = """
You are an assistant whose only task is to decide whether the graduate program below is **officially designated as STEM-OPT eligible**.

────────────────────────────────────────────────────────
Context (for guidance, NOT for guessing)  
• Engineering, Computer Science, Mathematics, and Natural Sciences (physics, chemistry, biology) programs are almost always STEM;  
  Humanities, arts, law, traditional business, and most social-science programs are usually not.  
• Nevertheless, return **stem** **only** when the university’s *.edu* pages explicitly state STEM or STEM-OPT eligibility.  
• If the official site is silent, output **Not stated in official website**.

────────────────────────────────────────────────────────
How to verify  

1. **Primary *.edu* sources only**  
   • Check the Admissions and Program URLs provided.  
   • You may open other pages on the same *.edu* domain (e.g., “STEM Designation”, “International Students”, “OPT Extension”, CIP code listings).  
   ⛔  Ignore non-*.edu* sites, blogs, or rankings.

2. **Optional Google search**  
   Single query:  
   "{university} {department} {degree} {program} STEM OPT site:.edu"  
   Review *.edu* results only.

3. **Decision rules**  
   • If you find wording such as “This program is STEM-designated” or “Eligible for the 24-month STEM OPT extension”, output **stem**.  
   • Otherwise output **Not stated in official website**.  
   • Never infer or invent.

────────────────────────────────────────────────────────
⚠️  Output format (exactly one line, copied verbatim from the two options below, no quotes, no extra text)  

stem  
Not stated in official website  

────────────────────────────────────────────────────────
Pages to consult first:  
• Admissions URL: {admissions_url}  
• Program URL:    {program_url}

Is this program STEM-OPT eligible?
"""


In [6]:
import os
import json
import asyncio
from tqdm.asyncio import tqdm_asyncio

# Async Gemini wrapper
from call_api import async_call_gemini

# ---------------------------------------------------------------------------
# Concurrency guard – avoid hitting rate-limits
# process_row acquires one slot and keeps it while all of its votes run, so
# the limit is on rows: up to 4 rows (4 × num_vote Gemini calls) in flight.
# ---------------------------------------------------------------------------
semaphore = asyncio.Semaphore(4)            # max concurrent rows

# ---------------------------------------------------------------------------
# Per-row worker
# ---------------------------------------------------------------------------
def _describe_grounding(candidate, label: str, *attr_names: str) -> str:
    """Return "<label>: <value>" for the first readable grounding attribute, else "Not used"."""
    try:
        metadata = candidate.grounding_metadata
    except Exception:
        return "Not used"
    for attr_name in attr_names:
        try:
            return f"{label}: {getattr(metadata, attr_name)}"
        except Exception:
            # Attribute missing (or metadata is None): try the next spelling.
            continue
    return "Not used"


async def process_row(row, prompt_template, num_vote: int, model_name: str):
    """
    Query Gemini `num_vote` times for one dataframe row and collect every vote.

    1. Format the prompt for this row
    2. Launch `num_vote` Gemini calls in parallel
    3. Capture BOTH normal answers *and* every error case: raised exceptions,
       "Error:" strings, responses without candidates, unparseable responses
    4. Return a serialisable record

    Args:
        row: object exposing `.to_dict()` (the rows yielded by
            `DataFrame.iterrows()` in the caller) with the columns read below.
        prompt_template: `str.format` template with the placeholders
            university, degree, program, department, admissions_url, program_url.
        num_vote: number of independent Gemini calls made for this row.
        model_name: forwarded to `async_call_gemini`.

    Returns:
        dict: a copy of the row plus the key "llm_reponses" (spelling kept
        because `majority_vote` reads that exact key), mapping "response N"
        to either an error entry or the answer text with its metadata.
    """
    async with semaphore:
        row    = row.to_dict()
        prompt = prompt_template.format(
            university     = row["大学英文名称"],
            degree         = row["学位"],
            program        = row["专业英文名称"],
            department     = row["所属院系（英文）"],
            admissions_url = row["招生网址"],
            program_url    = row["专业网址"],
        )

        record: dict = row.copy()
        record["llm_reponses"] = {}

        # -------- launch Gemini calls in parallel --------------------
        tasks = [
            async_call_gemini(
                prompt,
                model_name=model_name,
                use_search=True,
                url_context=True
            )
            for _ in range(num_vote)
        ]
        # return_exceptions=True: an exception raised by one call becomes that
        # vote's result instead of aborting this row and the whole batch.
        responses = await asyncio.gather(*tasks, return_exceptions=True)

        # -------- post-process each response -------------------------
        for i, response in enumerate(responses):
            resp_key = f"response {i+1}"

            # -- 0. Exceptions raised inside the Gemini call
            if isinstance(response, BaseException):
                record["llm_reponses"][resp_key] = {
                    "error": f"Exception: {type(response).__name__}: {response}"
                }
                continue

            # -- 1. Transport / server-side errors (string starting "Error:")
            if isinstance(response, str) and response.startswith("Error:"):
                record["llm_reponses"][resp_key] = {
                    "error": response                       # e.g. "Error: 429 Rate limit …"
                }
                continue

            # -- 2. Empty / malformed response objects
            if not hasattr(response, "candidates") or not response.candidates:
                record["llm_reponses"][resp_key] = {
                    "error": "No candidates returned",
                    "raw_response": str(response)
                }
                continue

            candidate = response.candidates[0]

            # -- 3. Extract main answer text
            try:
                text = candidate.content.parts[0].text
            except Exception as e:
                record["llm_reponses"][resp_key] = {
                    "error": f"Cannot parse text: {e}",
                    "raw_response": str(response)
                }
                continue

            # -- 4. Extract additional metadata (best-effort)
            try:
                url_context = str(candidate.url_context_metadata)
            except Exception:
                url_context = "Not used"

            search_pages   = _describe_grounding(candidate, "Search Chunks", "grounding_chunks")
            search_queries = _describe_grounding(candidate, "Search Query", "web_search_queries")
            # The sibling fields are snake_case; the camelCase spelling used
            # before is kept only as a fallback.
            search_support = _describe_grounding(
                candidate, "Search Supports", "grounding_supports", "groundingSupports"
            )

            # -- 5. Store normal answer + metadata + raw object
            record["llm_reponses"][resp_key] = {
                "response_text": text,
                "url_context": url_context,
                "search_queries": search_queries,
                "search_pages": search_pages,
                "search_support": search_support,
                "raw_response": str(response)             # keep for deep-debugging
            }

        return record

# ---------------------------------------------------------------------------
# Batch orchestrator with tqdm progress bar
# ---------------------------------------------------------------------------
async def request_and_store_async(prompt_template,
                                  field_df,
                                  num_vote: int,
                                  model_name: str,
                                  start_from: int = 0,
                                  end_at: int = -1):
    """
    Runs `process_row` over the dataframe slice asynchronously,
    shows a live tqdm bar, and dumps the results to JSON.

    Args:
        prompt_template: format template forwarded to `process_row`.
        field_df: dataframe whose rows are processed.
        num_vote: number of Gemini calls per row.
        model_name: Gemini model name; also part of the output file name.
        start_from: positional index of the first row to process.
        end_at: positional index one past the last row to process. The
            default -1 (or None) means "through the last row"; other values
            follow normal Python slice semantics.

    Returns:
        list: one record per processed row, in dataframe order.

    Side effects:
        Writes the records to
        ../fields_records/<field_name>/<field_name>_<model_name>_<start_from>_<end_at>.json
        where `field_name` is the notebook-level global.
    """
    # -1 is the "no upper bound" sentinel. Used literally as a slice stop it
    # would silently drop the last row, so translate it to an open-ended slice.
    stop = None if end_at is None or end_at == -1 else end_at
    df = field_df.iloc[start_from:stop]

    # Spawn tasks for every row in the slice
    tasks = [
        process_row(row, prompt_template, num_vote, model_name)
        for _, row in df.iterrows()
    ]

    # tqdm_asyncio.gather gives us progress updates as tasks complete
    response_records = await tqdm_asyncio.gather(*tasks)

    # Persist to disk ------------------------------------------------
    output_dir = f"../fields_records/{field_name}"
    os.makedirs(output_dir, exist_ok=True)
    # The file name keeps the caller's original end_at (e.g. "_0_-1") so
    # existing naming stays stable.
    output_path = f"{output_dir}/{field_name}_{model_name}_{start_from}_{end_at}.json"

    # ensure_ascii=False writes non-ASCII text as-is, so pin the encoding
    # instead of relying on the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(response_records, f, ensure_ascii=False, indent=2)

    return response_records

In [7]:
import nest_asyncio

# Patch asyncio so that asyncio.run() can be called from this cell
# (only needed in Jupyter).
nest_asyncio.apply()

# Run configuration: votes per row, row window, and model.
num_vote, start_from, end_at = 3, 0, 30
model_name = "gemini-2.5-flash"

# Process the selected rows and keep the records in memory as well as on disk.
response_records = asyncio.run(
    request_and_store_async(
        prompt_template=prompt_template,
        field_df=field_df,
        num_vote=num_vote,
        model_name=model_name,
        start_from=start_from,
        end_at=end_at,
    )
)

100%|██████████| 30/30 [02:34<00:00,  5.14s/it]


In [None]:
def majority_vote(records, field_key="同申互斥", default_answer="no explicit constraints"):
    """
    Collapse each record's LLM votes into a single answer stored under `field_key`.

    Policy (unchanged, despite the function name): the first vote whose text
    does not contain `default_answer` wins; if every usable vote contains it,
    or there is no usable vote, `default_answer` is stored.

    A vote may be a dict with a "response_text" key or a plain string. The
    following are ignored: error entries (dicts without "response_text"),
    strings starting with "Error:", and empty or non-string answers. Answer
    text is stripped of surrounding whitespace before use.

    Args:
        records: list of dicts, each with an "llm_reponses" mapping of votes.
        field_key: key under which the chosen answer is written.
        default_answer: text marking an uninformative answer; also the
            fallback value.

    Returns:
        The same `records` list, with each record updated in place.
    """
    for rec in records:
        answers = []
        for vote in rec["llm_reponses"].values():
            # Votes are dicts; testing `in` on the dict itself would check its
            # keys, so pull out the answer text first.
            text = vote.get("response_text") if isinstance(vote, dict) else vote
            if not isinstance(text, str):
                continue
            text = text.strip()
            if not text or text.startswith("Error:"):
                continue
            answers.append(text)

        candidate_answers = [a for a in answers if default_answer not in a]
        rec[field_key] = candidate_answers[0] if candidate_answers else default_answer
    return records
# Post-processing configuration.
# NOTE(review): rebinding `field_name` here also changes the global that
# request_and_store_async reads when it builds its output path — confirm this
# cell is meant to switch the notebook to a different field.
num_vote = 3
model_name = "gemini-2.5-flash"
field_name = "同申互斥"
accuracy = "median_accuracy"
# NOTE(review): this reads ../fields_records/<field>_<model>_0_-1.json, while
# request_and_store_async writes into ../fields_records/<field>/ — verify the
# input path before running.
# encoding="utf-8": the records hold non-ASCII text and are dumped with
# ensure_ascii=False, so do not depend on the platform default encoding.
with open(f"../fields_records/{field_name}_{model_name}_0_-1.json", "r", encoding="utf-8") as f:
    records = json.load(f)
records = majority_vote(records)
with open(f"../fields_records/{field_name}_{model_name}_{accuracy}.json", "w", encoding="utf-8") as f:
    json.dump(records, f, ensure_ascii=False, indent=2)