In [20]:
!!pip install anthropic




In [21]:
!pip install openpyxl



In [None]:
from pathlib import Path


# define the file path
file_path = Path("papers 250") / tmp_pmid / f"{tmp_pmid}.checked.md"

# read the file contents into a string
with file_path.open("r", encoding="utf-8") as f:
    paper_content = f.read()

print(paper_content[:500])   # preview first 500 chars


# Trends of subtype variation of human immunodeficiency virus type 1 in Zhejiang Province, China

## Abstract

The epidemic characteristics of human immunodeficiency virus type 1
(HIV-1) in Zhejiang Province have not been systematically identified.
This study presented a dynamic analysis of HIV-1 subtype variation in
Zhejiang from 2004 to 2008, based on the surveillance of molecular
epidemiology or drug resistance. CRF01_AE was the major strain (43.5 %)
spreading across the province, second by B


In [3]:
import pandas as pd

# read the Excel file, first sheet
df = pd.read_excel("Fine-tuning instruction set, Aug 22.xlsx", sheet_name=0)

# filter rows where PMID = 19686436
filtered_df = df[df["PMID"] == int(tmp_pmid)]

print(len(filtered_df), filtered_df)


19           PMID  QID                                           Question  \
47    25410050    1  Does the paper report HIV sequences from patie...   
297   25410050    2  Does the paper report in vitro drug susceptibi...   
547   25410050    3  What were the GenBank accession numbers for se...   
797   25410050    4       Which HIV species were studied in the paper?   
1047  25410050    5  Which HIV genes were reported to have been seq...   
1297  25410050    6  From which countries were the sequenced sample...   
1547  25410050    7  From what years were the sequenced samples obt...   
1797  25410050    8               What method was used for sequencing?   
2047  25410050    9           Were samples cloned prior to sequencing?   
2297  25410050   10      Did samples undergo single genome sequencing?   
2547  25410050   11               What type of samples were sequenced?   
2797  25410050   12  Were any sequences obtained from individuals w...   
3047  25410050   13  Were any seque

In [6]:
with open("explain_multi_questions.txt", "r", encoding="utf-8") as f:
    template = f.read()
    
# --- Step 4: Create prompts per question ---
prompts = []
for _, row in filtered_df.iterrows():
    question = row["Question"]
    prompt = template.format(
        paper_content=paper_content,
        question=question
    )
    prompts.append({
        "QID": row["QID"],
        "PMID": row["PMID"],
        "prompt": prompt
    })

print(prompts[0])

{'QID': 1, 'PMID': 25410050, 'prompt': 'Read the paper in "Paper content" section, and answer a list of questions in "Questions" section below,\n\n## For each question:\n\nStep 1: get the question and question id, the number at the start of each question, store as "question id".\nStep 2: extract two or three sentences from the "paper content" that can be used to answer the question, separate them using \'.\', store as \'evidence\'.\nStep 3: provide the rationale about how you found the answer from the content in details, store as \'rationale\'.\nStep 4: answer the question, store as \'answer\'. If the question expects a boolean answer, \'answer\' should start with "Yes" or "No".\nStep 5: format your answer in the format:\n\n"""\nQuestion: <question id> \n\nEvidence: <evidence>\n\nRationale: <rationale>\n\nAnswer: <answer>\n"""\n\nMake sure you answer all the questions.\n\n\n## Paper content:\n\n```\n# Trends of subtype variation of human immunodeficiency virus type 1 in Zhejiang Provin

In [70]:
questions_block = ""
for _, row in filtered_df.iterrows():
    qid = row["QID"]
    qtext = row["Question"]
    questions_block += f"{qid}. {qtext}\n"

# --- Step 5: Fill the template ---
final_prompt = template.format(
    paper_content=paper_content,
    question=questions_block.strip()
)

# --- Step 6: Save final prompt ---
output_file = Path(f"{tmp_pmid}_18_questions_prompt.txt")
with output_file.open("w", encoding="utf-8") as f:
    f.write(final_prompt)
print(final_prompt)

Read the paper in "Paper content" section, and answer a list of questions in "Questions" section below,

## For each question:

Step 1: get the question, store as "question".
Step 2: extract two or three sentences from the "paper content" that can be used to answer the question, separate them using '.', store as 'evidence'.
Step 3: provide the rationale about how you found the answer from the content in details, store as 'rationale'.
Step 4: answer the question, store as 'answer'.
Step 5: format your answer in the format:

"""
Question: <question id> 

Evidence: <evidence>

Rationale: <rationale>

Answer: <answer>
"""

Make sure you answer all the questions.


## Paper content:

```
# Trends of subtype variation of human immunodeficiency virus type 1 in Zhejiang Province, China

## Abstract

The epidemic characteristics of human immunodeficiency virus type 1
(HIV-1) in Zhejiang Province have not been systematically identified.
This study presented a dynamic analysis of HIV-1 subtype va

In [None]:
from anthropic import Anthropic

## doc: https://docs.claude.com/en/docs/about-claude/models/overview

client = Anthropic(api_key=claude_api_key)

response = client.messages.create(
    model="claude-opus-4-1-20250805",
    max_tokens=3000,
    messages=[
        {"role": "user", "content": final_prompt}
    ]
)

answer = response.content[0].text

# --- Step 1: Token usage ---
input_tokens = response.usage.input_tokens
output_tokens = response.usage.output_tokens
total_tokens = input_tokens + output_tokens

print(f"Input tokens: {input_tokens}")
print(f"Output tokens: {output_tokens}")
print(f"Total tokens: {total_tokens}")



Input tokens: 4221
Output tokens: 2423
Total tokens: 6644


In [73]:
# --- Step 2: Pricing (Claude Opus 4.1 as of 2025-09) ---
# Input: $15.00 per 1M tokens = $0.000015 per token
# Output: $75.00 per 1M tokens = $0.000075 per token
input_cost = input_tokens * 0.000015
output_cost = output_tokens * 0.000075
total_cost = input_cost + output_cost

print(f"Estimated cost: ${total_cost:.4f}")

Estimated cost: $0.2450


In [None]:
import re
# --- Step 2: Split Claude’s output into blocks ---
# Matches the format:
# Question: ...
# Evidence: ...
# Rationale: ...
# Answer: ...
answer = re.sub(r"\*+", "", answer)

# Step 2: regex to match Question/Evidence/Rationale/Answer blocks
pattern = (
    r"Question:\s*(\d+).*?"
    r"Evidence:\s*(.*?)\n\s*Rationale:\s*(.*?)\n\s*Answer:\s*(.*?)(?=(?:\n\s*Question:|\"\"\"|\Z))"
)


matches = re.findall(pattern, answer, re.DOTALL)

print(matches)
# --- Step 3: Build dataframe ---
records = []

for i, (q, ev, rat, ans) in enumerate(matches):
    # strip extra whitespace
    q, ev, rat, ans = q.strip(), ev.strip(), rat.strip(), ans.strip()

    # match to QID + original Question
    if i < len(filtered_df):
        row = filtered_df.iloc[i]
        records.append({
            "PMID": row["PMID"],
            "QID": row["QID"],
            "Question": row["Question"],
            "Evidence": ev,
            "Rationale": rat,
            "Answer": ans
        })

output_df = pd.DataFrame(records)

# --- Step 4: Save to Excel ---
output_df.to_excel(f"claude_answers_{tmp_pmid}_18.xlsx", index=False)


[('1', 'Viral RNA was extracted from patient plasma (140 μl) using QIAamp Viral RNA Mini kit (Qiagen, USA). HIV-1 cDNA was obtained through RT-PCR using the Takara One Step RNA PCR kit (Takara, China) and then subjected to nested polymerase chain reaction (PCR) for the amplification of gag and pol gene fragments. Totally, 332 gag and 229 pol gene fragments were amplified and sequenced from the 451 individual samples, respectively.', 'The paper clearly describes extracting viral RNA from patient plasma samples and sequencing HIV-1 gene fragments (gag and pol) from 451 HIV-1-positive patients. The sequences were obtained directly from patient samples collected between 2004-2008.', 'Yes\n'), ('2', 'This study presented a dynamic analysis of HIV-1 subtype variation in Zhejiang from 2004 to 2008, based on the surveillance of molecular epidemiology or drug resistance. This study was based on the different surveillance of molecular epidemiology and drug resistance.', 'While the paper mentions

In [75]:
print(answer)
output_file = Path(f"{tmp_pmid}_18_questions_answer.txt")
with output_file.open("w", encoding="utf-8") as f:
    f.write(answer)

I'll analyze the paper and answer each question systematically.

"""
Question: 1

Evidence: Viral RNA was extracted from patient plasma (140 μl) using QIAamp Viral RNA Mini kit (Qiagen, USA). HIV-1 cDNA was obtained through RT-PCR using the Takara One Step RNA PCR kit (Takara, China) and then subjected to nested polymerase chain reaction (PCR) for the amplification of *gag* and *pol* gene fragments. Totally, 332 *gag* and 229 *pol* gene fragments were amplified and sequenced from the 451 individual samples, respectively.

Rationale: The paper clearly describes extracting viral RNA from patient plasma samples and sequencing HIV-1 gene fragments (gag and pol) from 451 HIV-1-positive patients. The sequences were obtained directly from patient samples collected between 2004-2008.

Answer: Yes
"""

"""
Question: 2

Evidence: This study presented a dynamic analysis of HIV-1 subtype variation in Zhejiang from 2004 to 2008, based on the surveillance of molecular epidemiology or drug resistance

In [None]:
# gpt-4o

In [4]:
# 60 questions
# read the Excel file, first sheet
df_60 = pd.read_excel("60questions.xlsx", sheet_name=0)

# get all id and question column questions in a list, id followed by question
# 
# make list into \n separated, str 

combined_list = df_60.apply(lambda row: f"{row['id']}: {row['question']}", axis=1).tolist()

# Join into one string with newline separators
output_str = "\n".join(combined_list)

print(output_str)


1101: Does the paper report previously unpublished data?
1102: Does the paper report HIV sequences?
1103: Does the paper report the results of in vitro passage experiments?
1104: Does the paper report in vitro antiretroviral susceptibility data?
2101: Does the paper report GenBank accession numbers for sequenced HIV isolates?
2102: Does the paper report GenBank accession numbers for sequenced HIV isolates other than those for laboratory HIV isolates?
2103: Which are the genbank accession numbers reported in the paper?
2202: Does the paper report lists of mutations for individual sequenced HIV isolates?
2301: Which HIV species were studied in the paper?
2302: What were the subtypes of the sequenced viruses reported in the paper?
2303: Which HIV genes were sequenced in the paper?
2304: Does the paper report the results of HIV pol sequences?
2401: Which geographic regions and/or countries were the sequences from in the paper?
2402: What years were the sequenced samples obtained in the pap

In [7]:
questions_block = output_str


prompt_60 = template.format(
    paper_content=paper_content,
    question=questions_block.strip()
)

# --- Step 6: Save final prompt ---
output_file = Path(f"{tmp_pmid}_60_questions_prompt.txt")
with output_file.open("w", encoding="utf-8") as f:
    f.write(prompt_60)
print(prompt_60)

Read the paper in "Paper content" section, and answer a list of questions in "Questions" section below,

## For each question:

Step 1: get the question and question id, the number at the start of each question, store as "question id".
Step 2: extract two or three sentences from the "paper content" that can be used to answer the question, separate them using '.', store as 'evidence'.
Step 3: provide the rationale about how you found the answer from the content in details, store as 'rationale'.
Step 4: answer the question, store as 'answer'. If the question expects a boolean answer, 'answer' should start with "Yes" or "No".
Step 5: format your answer in the format:

"""
Question: <question id> 

Evidence: <evidence>

Rationale: <rationale>

Answer: <answer>
"""

Make sure you answer all the questions.


## Paper content:

```
# Trends of subtype variation of human immunodeficiency virus type 1 in Zhejiang Province, China

## Abstract

The epidemic characteristics of human immunodeficien

In [18]:
from anthropic import Anthropic
client = Anthropic(api_key=claude_api_key)

response = client.messages.create(
    model="claude-3-haiku-20240307", # claude-3-haiku-20240307
    max_tokens=4096,
    messages=[
        {"role": "user", "content": prompt_60}
    ]
)

answer = response.content[0].text

# --- Step 1: Token usage ---
input_tokens = response.usage.input_tokens
output_tokens = response.usage.output_tokens
total_tokens = input_tokens + output_tokens

print(f"Input tokens: {input_tokens}")
print(f"Output tokens: {output_tokens}")
print(f"Total tokens: {total_tokens}")

# --- Step 2: Pricing (Claude Opus 4.1 as of 2025-09) ---
# Input: $15.00 per 1M tokens = $0.000015 per token
# Output: $75.00 per 1M tokens = $0.000075 per token
input_cost = input_tokens * 0.000015
output_cost = output_tokens * 0.000075
total_cost = input_cost + output_cost

print(f"Estimated cost: ${total_cost:.4f}")


Input tokens: 5262
Output tokens: 3866
Total tokens: 9128
Estimated cost: $0.3689


In [19]:
print(answer)
output_file = Path(f"{tmp_pmid}_60_questions_answer.txt")
with output_file.open("w", encoding="utf-8") as f:
    f.write(answer)

Here are the answers to the questions:

Question: 1101

Evidence: The paper states that "The epidemic characteristics of human immunodeficiency virus type 1 (HIV-1) in Zhejiang Province have not been systematically identified." This suggests that the data reported in the paper are previously unpublished.

Rationale: The paper presents new data on the epidemic characteristics and molecular epidemiology of HIV-1 in Zhejiang Province, China, which has not been systematically studied before.

Answer: Yes

Question: 1102

Evidence: The paper states that "Totally, 332 *gag* and 229 *pol* gene fragments were amplified and sequenced from the 451 individual samples, respectively."

Rationale: The paper reports the sequencing of HIV-1 gag and pol gene fragments from patient samples.

Answer: Yes

Question: 1103 

Evidence: There is no mention of in vitro passage experiments in the paper.

Rationale: The paper does not report any results from in vitro passage experiments.

Answer: No

Question: 1

In [None]:


import re
# --- Step 2: Split Claude’s output into blocks ---
# Matches the format:
# Question: ...
# Evidence: ...
# Rationale: ...
# Answer: ...
pattern = r"Question:\s*(.*?)\s*Evidence:\s*(.*?)\s*Rationale:\s*(.*?)\s*Answer:\s*(.*?)(?=\n---|\nQuestion:|$)"

matches = re.findall(pattern, answer, re.DOTALL)

# --- Step 3: Build dataframe ---
records = []

for i, (q, ev, rat, ans) in enumerate(matches):
    # strip extra whitespace
    q, ev, rat, ans = q.strip(), ev.strip(), rat.strip(), ans.strip()

    # match to QID + original Question
    if i < len(combined_list):
        id_str, question_str = combined_list[i].split(": ", 1)
        records.append({
            "PMID": tmp_pmid,
            "QID": id_str,
            "Question": question_str,
            "Evidence": ev,
            "Rationale": rat,
            "Answer": ans
        })

output_df = pd.DataFrame(records)

# --- Step 4: Save to Excel ---
output_df.to_excel(f"claude_answers_{tmp_pmid}_60.xlsx", index=False)


I'll analyze the paper and answer each question systematically.

**Question: 1101**

Evidence: The study was conducted in a cross-sectional and anonymous manner in all 11 cities, based on the surveillance of molecular epidemiology or drug resistance. A total of 451 HIV-1-positive patient samples were collected by local CDC staff from 2004 to 2008.

Rationale: The paper describes original research conducted by the authors, collecting and analyzing 451 HIV-1-positive patient samples from 2004 to 2008 in Zhejiang Province. This represents new data collection and analysis rather than a review of existing literature.

Answer: Yes

---

**Question: 1102**

Evidence: Totally, 332 *gag* and 229 *pol* gene fragments were amplified and sequenced from the 451 individual samples, respectively (Supplementary files 1 and 2).

Rationale: The paper explicitly states that HIV gene fragments (gag and pol) were amplified and sequenced from patient samples, confirming that HIV sequences were reported.

An