In [None]:
!pip install openai

Test the API connection

In [None]:
from openai import OpenAI
# Placeholders: substitute the real endpoint and key before running.
client = OpenAI(base_url='xxxxxxx', api_key='xxxxxxx')

# Minimal two-message conversation used as a connectivity smoke test.
greeting_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
completion = client.chat.completions.create(
    model="gpt-4-turbo",
    messages=greeting_messages,
)

# Show the message object of the first returned choice.
first_choice = completion.choices[0]
print(first_choice.message)

Test the API speed

In [None]:
import time
from openai import OpenAI

# Placeholders: substitute the real endpoint and key before running.
client = OpenAI(
    base_url='xxxxxxx',
    api_key='xxxxxxx'
)

# Time a single request/response round trip.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if the system
# time is adjusted while the request is in flight.
start = time.perf_counter()
response = client.chat.completions.create(
    model="gpt-4-turbo",
    messages=[
        {"role": "user", "content": "Does this abstract mention AI methods: 'This paper uses machine learning...'"}
    ]
)
end = time.perf_counter()

print(response.choices[0].message.content)
print(f"Time used: {end - start:.2f} seconds")


Binary classification: determine whether a publication uses AI methods, based on its abstract.

In [None]:
import pandas as pd
import time
from io import StringIO
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError

# --- API client (placeholder endpoint/key; fill in before running) ---
client = OpenAI(base_url='xxxxxxx', api_key='xxxxxxx')

# --- Load the input table ---
# The file is read as text first so NUL characters can be stripped before
# pandas parses it; undecodable bytes are replaced instead of raising.
input_csv = "D:\\Resilience\\chatgpt test\\LR_clean.csv"
with open(input_csv, "r", encoding="utf-8", errors="replace") as src:
    content = src.read()
content = content.replace('\x00', '')

df = pd.read_csv(StringIO(content), low_memory=False)

# --- Run parameters ---
batch_size = 5       # rows submitted together; also the thread-pool size
timeout_sec = 20     # seconds budgeted per row while waiting on a batch
max_retries = 2      # extra attempts per row after the first one fails
chunk_size = 1000    # rows per output CSV
# NOTE(review): rows are selected with df.iloc, which is 0-based, so a start
# of 1 skips the first data row -- confirm this is intended.
start_row = 1        # first positional row to process
end_row = 10001      # positional row to stop before (exclusive)

# Classification function
def classify_ai_use(idx, abstract_text):
    """Ask the chat model whether an abstract mentions the use of AI methods.

    Args:
        idx: Row identifier; returned unchanged so the caller can match the
            result to its row.
        abstract_text: Abstract to classify. Falsy values, blank strings and
            the text "nan" (any case, surrounding whitespace ignored) are
            treated as a missing abstract.

    Returns:
        A tuple ``(idx, label)`` where ``label`` is ``1`` (AI methods
        mentioned), ``0`` (not mentioned, or abstract missing), or the string
        ``"ERROR"`` when every attempt failed or returned a reply that did
        not contain a recognisable ``AI_use: 0/1`` verdict.

    Relies on the module-level ``client``, ``max_retries`` and ``timeout_sec``.
    """
    import re  # local import: the notebook's import cell does not provide it

    # Missing abstracts are labelled 0 without spending an API call.
    if not abstract_text:
        return idx, 0
    if str(abstract_text).strip().lower() in ("", "nan"):
        return idx, 0

    prompt = f"""
You are a scientific abstract classifier.

Determine whether the following abstract **mentions the use of artificial intelligence (AI) methods**. AI methods include, but are not limited to:

- Traditional machine learning (e.g., decision trees, support vector machines, k-nearest neighbors, logistic regression, random forest, XGBoost)
- Deep learning (e.g., convolutional neural networks, recurrent neural networks, transformers, BERT, GPT, neural networks)
- Natural language processing (e.g., word embeddings, text classification, language models)
- AI tools/libraries (e.g., scikit-learn, TensorFlow, PyTorch, Keras)

### Examples:
1. "This study uses convolutional neural networks to classify satellite images..." → AI_use: 1  
2. "We employed logistic regression and random forest to analyze data..." → AI_use: 1  
3. "This work evaluates environmental impacts using descriptive statistics and linear regression..." → AI_use: 0  

### Now classify the following abstract.
Respond with **only** "AI_use: 1" or "AI_use: 0".

Abstract: {abstract_text} 
"""

    # Tolerates spacing/case variations such as "AI_use:1" or "ai_use: 0".
    label_pattern = re.compile(r"AI_use\s*:\s*([01])", re.IGNORECASE)
    total_attempts = max_retries + 1

    for attempt in range(total_attempts):
        try:
            response = client.chat.completions.create(
                model="gpt-4-turbo",
                messages=[
                    {"role": "system", "content": "You are an AI model classifying whether abstracts mention AI."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0,
                # Bound each request so a stalled call cannot hang its worker.
                timeout=timeout_sec
            )
            result = response.choices[0].message.content.strip()
            verdict = label_pattern.search(result)
            if verdict is None:
                # An unrecognised reply is a failure, not a silent "0".
                raise ValueError(f"Unparseable response: {result!r}")
            print(f"Row {idx}: Success on attempt {attempt+1}")
            return idx, int(verdict.group(1))
        except Exception as e:
            print(f"Row {idx}: Error on attempt {attempt+1}: {e}")
            # Back off only when another attempt will follow.
            if attempt < total_attempts - 1:
                time.sleep(1)
    return idx, "ERROR"

# Loop through data in chunks
# Classify rows [start_row, end_row) in chunks of chunk_size rows, writing one
# CSV per chunk. Within a chunk, rows are sent to the API batch_size at a time
# on a thread pool; each batch finishes before the next one starts.
for chunk_start in range(start_row, end_row, chunk_size):
    chunk_end = min(chunk_start + chunk_size, end_row)
    df_subset = df.iloc[chunk_start:chunk_end].copy().reset_index(drop=True)
    if df_subset.empty:
        # The data ends before end_row: do not write empty output files.
        print(f"\nNo rows at position {chunk_start} or beyond; stopping.")
        break
    ai_use_list = [None] * len(df_subset)

    print(f"\nProcessing chunk {chunk_start} to {chunk_end - 1}...")

    for i in range(0, len(df_subset), batch_size):
        batch = df_subset.iloc[i:i+batch_size]
        with ThreadPoolExecutor(max_workers=batch_size) as executor:
            # Map each future to its row position within df_subset
            # (the index was reset above, so idx is 0..len(df_subset)-1).
            futures = {
                executor.submit(classify_ai_use, idx, str(row["Abstract"]).strip()): idx
                for idx, row in batch.iterrows()
            }
            pending = set(futures)

            try:
                for future in as_completed(futures, timeout=batch_size * timeout_sec):
                    pending.discard(future)
                    idx = futures[future]
                    try:
                        # The future is already finished, so this never blocks.
                        idx_result, result = future.result()
                        ai_use_list[idx_result] = result
                    except Exception as e:
                        print(f"Row {idx}: Unknown error: {e}")
                        ai_use_list[idx] = "ERROR"
            except TimeoutError:
                # as_completed raises from the iterator itself when the batch
                # deadline passes; record the unfinished rows instead of
                # letting the exception abort the whole run.
                for future in pending:
                    idx = futures[future]
                    future.cancel()  # only has an effect if not yet started
                    print(f"Row {idx}: Timeout")
                    ai_use_list[idx] = "TIMEOUT"
            # Leaving the with-block still waits for running workers to
            # return; threads cannot be stopped from outside.

        print(f"Batch {i}-{i+len(batch)-1} completed.")

    # Add results column and save to file
    df_subset["AI_use"] = ai_use_list
    output_path = f"D:/Resilience/02Outputdataset/output_chatgpt_{chunk_start}-{chunk_end - 1}.csv"
    df_subset.to_csv(output_path, index=False)
    print(f"File saved: {output_path}")

print("All processing completed.")
