In [None]:
import os
import json
import time
import requests
import pandas as pd
from openai import OpenAI
import re


# ---------------------------------------------------------------------------
# Run configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------

# Dataset stem, without extension; it is spliced into both the input and the
# output paths below.
DATASET = 'data/abstracts-500'
# NOTE(review): not referenced in the cells visible here; presumably toggles the
# Crossref DOI lookup -- confirm.
GET_DOI = False

# Model identifier passed to the chat-completions endpoint.
MODEL_VERSION = 'deepseek-chat'  # DeepSeek-V3

# Stem of the text file that holds the system prompt.
PROMPT = 'v1_prompt'

# Paths derived from the settings above.
instruction_file = './' + PROMPT + '.txt'
dataset_file = '../' + DATASET + '.xlsx'
output_file = './results/deepseek_2025/{}__{}__output.xlsx'.format(DATASET, PROMPT)


# Per-token prices in US dollars (quoted per one million tokens, then scaled).
# Source noted by the original author: https://platform.openai.com/docs/pricing
# NOTE(review): that page lists OpenAI prices, while MODEL_VERSION is a DeepSeek
# model -- confirm these figures before trusting the reported cost.
input_tokens_unit_price = 15.00 / 1_000_000
output_tokens_unit_price = 60.00 / 1_000_000

In [None]:
# Build the API client. The key is read from the DEEPSEEK_API_KEY environment
# variable so the secret never has to be saved inside the notebook; the original
# placeholder is kept as the fallback when the variable is not set.
client = OpenAI(
    api_key=os.environ.get('DEEPSEEK_API_KEY', '<API-HERE>'),
    base_url="https://api.deepseek.com",
)

# Load the papers to screen.
df = pd.read_excel(dataset_file)

# Load the system prompt. The encoding is pinned so the text is decoded the same
# way on every platform instead of depending on the locale default.
with open(instruction_file, 'r', encoding='utf-8') as file:
    instructions = file.read()

# Print the instructions that will be sent to the model as the system message
print(instructions)

In [None]:
def clean_json_block(text):
    """Strip Markdown code fences from a model reply and return the bare payload.

    Handles three shapes of input:

    * a reply containing a complete fenced block (```` ```json ... ``` ````),
      optionally surrounded by extra prose -- the text between the first opening
      fence and the last closing fence is returned;
    * a reply with only a leading or only a trailing fence -- that fence is removed;
    * a reply without fences -- returned with surrounding whitespace stripped.

    Args:
        text: Raw text returned by the model.

    Returns:
        The payload with fences and surrounding whitespace removed.
    """
    stripped = text.strip()

    # A complete fenced block may be preceded or followed by commentary, so search
    # for it anywhere. The greedy group runs to the *last* closing fence, which
    # keeps the result identical to the anchored behaviour when the fences sit at
    # the very start and end of the reply.
    fenced = re.search(r"```[a-z]*\s*(.*)```", stripped, flags=re.IGNORECASE | re.DOTALL)
    if fenced:
        return fenced.group(1).strip()

    # Fallback: remove a lone leading ```lang or a lone trailing ```.
    return re.sub(r"^```[a-z]*\s*|\s*```$", "", stripped, flags=re.IGNORECASE)

In [None]:
def classify_paper(paper):
    """Ask the chat model to classify one paper and price the call.

    The module-level ``instructions`` text is sent as the system message and
    ``paper`` as the user message, using ``client`` and ``MODEL_VERSION`` with a
    temperature of 0.0 and streaming disabled.

    Args:
        paper: Text describing the paper, sent as the user message.

    Returns:
        A ``(reply, cost)`` tuple: the model's reply after ``clean_json_block``
        has removed any code fences, and the cost of the call computed from the
        reported token usage and the module-level per-token prices.
    """
    chat_messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": paper},
    ]
    completion = client.chat.completions.create(
        model=MODEL_VERSION,
        messages=chat_messages,
        temperature=0.0,
        stream=False,
    )

    raw_reply = completion.choices[0].message.content.strip()

    # Price the call from the token counts reported by the API.
    token_usage = completion.usage
    input_cost = token_usage.prompt_tokens * input_tokens_unit_price
    output_cost = token_usage.completion_tokens * output_tokens_unit_price

    return clean_json_block(raw_reply), input_cost + output_cost

In [None]:
def update_output_file(output_file, new_row):
    """Append ``new_row`` to the Excel workbook at ``output_file``, creating it if needed.

    When the workbook does not exist yet, its parent directory is created first
    (the configured output path contains nested folders) and ``new_row`` is written
    as the first row together with its column headers. Otherwise the existing
    workbook is read, ``new_row`` is appended, and the whole sheet is written back.

    Args:
        output_file: Path of the ``.xlsx`` file to create or extend.
        new_row: DataFrame holding the row(s) to append; its columns define the
            header of a newly created workbook.
    """
    if not os.path.exists(output_file):
        # Make sure the destination folder exists so the first write cannot fail
        # after the (paid) classification call has already been made.
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Writing the row directly yields the same sheet as concatenating it onto
        # an empty frame, without the needless concat on an empty DataFrame.
        new_row.to_excel(output_file, index=False)
        print(f"Created new file: {output_file} and added the first row.")
        return

    # Load the existing sheet, append the new row and save everything back.
    existing_rows = pd.read_excel(output_file)
    combined_rows = pd.concat([existing_rows, new_row], ignore_index=True)
    combined_rows.to_excel(output_file, index=False)

In [None]:
def get_doi_from_title(title, timeout=10):
    """Look up a DOI for ``title`` through the Crossref ``/works`` endpoint.

    Only the single best match is requested.

    Args:
        title: Paper title used as the ``query.title`` search parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        The DOI of the first match; the string ``"null"`` when there is no match
        or the match carries no DOI; or ``"Error: <status code>"`` when the
        server answers with a status other than 200.

    Raises:
        requests.exceptions.RequestException: On connection problems, including
            ``requests.exceptions.Timeout`` when ``timeout`` is exceeded.
    """
    url = "https://api.crossref.org/works"
    params = {
        "query.title": title,
        "rows": 1  # Return only the most relevant result
    }

    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        return f"Error: {response.status_code}"

    items = response.json().get("message", {}).get("items", [])
    if not items:
        return "null"

    # Return the DOI of the first matched paper
    return items[0].get("DOI", "null")

In [None]:
# Running totals for the whole screening run.
total_cost = 0

# Confusion-matrix counters; "include" is treated as the positive class.
true_positives = 0
true_negatives = 0
false_positives = 0
false_negatives = 0
# Papers whose reviewer/model verdict pair is not a combination of the exact
# strings "include" and "exclude" (e.g. different casing, a missing label, or an
# unexpected model answer). They used to be dropped from the tallies silently.
unclassified = 0

# Record the start time
start_time = time.time()
total_rows_number = len(df)


# Screen every paper in the dataset. `position` drives the progress message so it
# stays correct even if the DataFrame index is not 0..n-1.
for position, (index, row) in enumerate(df.iterrows(), start=1):

    r = row.to_dict()
    paper = f"Authors: {r['AUTHOR']}\n\nTitle: {r['TITLE']}\n\nAbstract: {r['ABSTRACT']}"
    print(f"Screening paper {position}/{total_rows_number}")

    # Classify the paper and parse the model's JSON answer.
    try:
        response_content, cost = classify_paper(paper)
        # The call has been billed whether or not its answer can be parsed, so
        # account for it before attempting to decode the reply.
        total_cost += cost
        print(f"Response content: {response_content}")

        answer_dict = json.loads(response_content)

    except json.JSONDecodeError as e:
        # Abort the run and identify the offending paper; `from e` keeps the
        # original decoding error attached as the explicit cause.
        raise RuntimeError(f"Failed to decode JSON response for paper ID {r['ID']}: {e}") from e

    reviewer_verdict = r['Include/exclude']
    model_verdict = answer_dict['verdict']

    new_row = pd.DataFrame([{
        "Paper ID": r['ID'],
        "Reviewers verdict": reviewer_verdict,
        "DeepSeek verdict": model_verdict,
        "Verdicts agreement": "Y" if reviewer_verdict == model_verdict else "N",
        "DeepSeek classification explanation": answer_dict['explanation'],
        "DeepSeek classification confidence": answer_dict['confidence'],
        "Title": r['TITLE'],
        "Authors": r['AUTHOR'],
        "Abstract": r['ABSTRACT']
    }])

    # Persist after every paper so partial results survive an interrupted run.
    update_output_file(output_file, new_row)

    # Verdicts are compared as exact strings.
    if reviewer_verdict == model_verdict == "include":
        true_positives += 1
    elif reviewer_verdict == model_verdict == "exclude":
        true_negatives += 1
    elif reviewer_verdict == "include" and model_verdict == "exclude":
        false_negatives += 1
    elif reviewer_verdict == "exclude" and model_verdict == "include":
        false_positives += 1
    else:
        unclassified += 1


# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

print(f"\n\nThis paper screening took {elapsed_time:.1f} seconds and had a cost of {total_cost:.3f} $")
print(f"The screening included {true_positives + true_negatives + false_positives + false_negatives + unclassified} papers, out of which {true_positives + true_negatives} were correctly classified")
print(f"True positives: {true_positives}")
print(f"True negatives: {true_negatives}")
print(f"False positives: {false_positives}")
print(f"False negatives: {false_negatives}")
print(f"Unclassified (unexpected verdict values): {unclassified}")
