**Library Imports**

In [73]:
import json
import pandas as pd
from openai import OpenAI
import numpy as np

**Excel Empty Row Cleaning**

In [None]:
import pandas as pd

# Load the raw spreadsheet, keep only the rows that have an abstract, and
# save the filtered sheet for the later cells to read back.
df = pd.read_excel('All_Files.xlsx')
has_abstract = df['Abstract'].notna()
df_cleaned = df[has_abstract]
df_cleaned.to_excel('cleaned_file.xlsx', index=False)



**LLM Batch Request Helpers**

In [None]:
def send_request(filename):
  """Upload a JSONL request file and start a chat-completions batch job.

  Args:
    filename: Path of the ``.jsonl`` file holding one batch request per line.

  Returns:
    The batch object returned by ``client.batches.create``.
  """
  client = OpenAI()

  # The context manager closes the file handle once the upload call returns
  # (or raises); the original left the handle open.
  with open(filename, "rb") as input_file:
    batch_input_file = client.files.create(
      file=input_file,
      purpose="batch"
    )

  batch_input_file_id = batch_input_file.id

  batch = client.batches.create(
      input_file_id=batch_input_file_id,
      endpoint="/v1/chat/completions",
      completion_window="24h"
  )
  return batch

def check_batch_status(batch):
  """Look up ``batch`` again by its id and return the freshly retrieved batch object."""
  return OpenAI().batches.retrieve(batch.id)

def download_output(retrieve_batch, filename):
  """Fetch the output file referenced by ``retrieve_batch`` and write its bytes to ``filename``."""
  response = OpenAI().files.content(retrieve_batch.output_file_id)
  with open(filename, "wb") as out_file:
      out_file.write(response.content)
      
def get_answer_list(filepath):
    """Parse a batch output ``.jsonl`` file into a DataFrame of answers.

    Each non-blank line is decoded as JSON. For every choice whose message
    role is ``'assistant'`` the message content is collected together with
    the record's ``custom_id`` converted to ``int``.

    Records that lack the expected structure (missing keys, a ``null``
    response, a non-integer ``custom_id``) are skipped as a whole, so the ID
    and Answer columns always stay the same length.

    Args:
        filepath: Path to the downloaded batch output file.

    Returns:
        DataFrame with columns ``ID`` and ``Answer``, sorted by ``ID`` with a
        fresh 0..n-1 index.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    content_list, custom_id_digits = [], []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data = json.loads(line)
            try:
                # Parse the id BEFORE collecting any content: a bad id must
                # not leave an answer behind without a matching ID entry.
                custom_id = int(data['custom_id'])
                answers = [
                    choice['message']['content']
                    for choice in data['response']['body']['choices']
                    if choice['message']['role'] == 'assistant'
                ]
            except (KeyError, IndexError, TypeError, ValueError):
                # TypeError covers records whose 'response' (or id) is null.
                continue
            for content in answers:
                custom_id_digits.append(custom_id)
                content_list.append(content)
    df = pd.DataFrame({'ID': custom_id_digits,'Answer': content_list}).sort_values(by='ID').reset_index(drop=True)
    return df

**Prompt Engineering**

In [None]:
# Abstracts to classify, read from the cleaned workbook.
test_research = pd.read_excel('cleaned_file.xlsx', sheet_name='Sheet1')
text_list = test_research['Abstract']

# Base names (extension is appended later) of the three batch request files.
file_path1, file_path2, file_path3 = (
    'judge2_abstract1',
    'judge2_abstract2',
    'judge2_abstract3',
)
# System prompt 1: four yes/no checks (LCA / environmental impact, resins,
# polymers, plastics / packaging), answered as a JSON object of "1"/"0" strings.
# The output-format example is now valid JSON (straight quotes, no trailing
# comma); the question wording itself is intentionally left unchanged.
# NOTE(review): question 2 ends with an unmatched ")" and question 4 uses
# typographic quotes around "plastic" - left as-is to keep the prompt stable.
q1 = '''
Respond in JSON format to each of the following questions only based on the text:
1.Based on the information in the text,please determine whether it contains information on life cycle assessment(life-cycle assessment),LCA,environmental impact(problems),or calculation of carbon(greenhouse gas,GHG) emissions(footprint)?
(If it contains relevant information, answer"1";if it has words that have the same meaning as"LCA studies are expected" or "LCA studies will be carried out in the future",answer"0";if it does not contain relevant information or you are not sure, answer "0")
2.Based on the information in the text,whether it contains the following words:resins or resin)?
(If it contains resins or specific types of resins(such as PE,PP,PS,PVC,PET,ABS,PLA,etc.),answer "1";if it does not contain relevant information or you are not sure, answer "0")
3.Based on the information in the text,whether it contains polymers or polymer?
(Do a word-by-word comparison of the text,if the word "polymer" or"polymers" appear(whether or not it's relevant to the topic), answer "1";if it does not contain relevant information or you are not sure, answer "0")
4.Based on the information in the text,whether it contains plastics(plastic) or waste?
(Do a word-by-word comparison of the text,if the word “plastic”,"plastics","package","packaging" appear(whether or not it's relevant to the topic), answer "1";if it does not contain relevant information or you are not sure, answer "0")
Here is an example of the output format:
{
"article_check1": "1",
"article_check2": "1",
"article_check3": "0",
"article_check4": "0"
}
'''
# System prompt 2: two exclusion checks (fibre products; lignin / cellulose /
# metal polymers), answered as a JSON object of "1"/"0" strings.
# The output-format example is now valid JSON (no trailing comma); the
# question wording itself is intentionally left unchanged.
q2='''
Respond in JSON format to each of the following questions only based on the text:
1.Based on the information in the text,please determine whether it contains information on polymer resins made into fiber products?
(If it contains resin synthetic fibers(such as polypropylene synthetic fibers,etc.), answer"1";if it contains information about fiber-reinforced polymers (such as glass fiber reinforced polymers(GFRP),carbon fiber reinforced polymer(CFRP), fiber reinforced polymer(FRP), etc.)or resin matrix,answer"0";if it contains carbon fiber(fibers) or glass fiber(fibers) and resin(resins) in one text,answer"0";if it contains plastic waste,recycling(circular) and fibers at the same time,answers"0";if it does not contain relevant information or you are not sure, answer "0")
2.Based on the information in the text,please determine whether it contains information on lignin,cellulose or metal polymers(metal-polymer)?
(If it contains lignin,cellulose or metal polymers,and contains terms ('resins'(such as PE,PP,PS,PVC,PET,ABS,PLA,etc.) ,or 'plastics'), answer"0";If it contains lignin,cellulose or metal polymers and does not mention resins or plastics, answer"1";if it does not contain lignin,cellulose,metal polymers,or you are not sure, answer "0")
Here is an example of the output format:
{
"article_check1":"0",
"article_check2":"0"
}
'''
# System prompt 3: one exclusion check (resins obtained from trees), answered
# as a JSON object with a single "1"/"0" string.
# The output-format example is now valid JSON (the key's closing quote was
# missing and there was a trailing comma); the question wording is unchanged.
q3='''
Respond in JSON format to each of the following questions only based on the text:
1.Based on the information in the text,please determine whether it contains information on resins from trees (such as pine resin or pine chemical products)?
(If it contains resins from trees (such as pine resin or pine chemical products), answer"1";if it contains electrode resin(resins),answer"0";if it does not contain relevant information or you are not sure, answer "0")
Here is an example of the output format:
{
"article_check1":"0"
}
'''

**API Invocation**

In [None]:
def build_requests(system_prompt, texts):
    """Build one batch request per text for the given system prompt.

    Args:
        system_prompt: Prompt sent as the ``system`` message of every request.
        texts: Iterable of texts; each becomes the ``user`` message of one
            request, and its position in the iterable becomes the
            ``custom_id`` (as a string).

    Returns:
        List of request dicts, in the same order as ``texts``.
    """
    return [
        {
            "custom_id": str(i),
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4-turbo",
                "temperature": 0,
                "response_format": {"type": "json_object"},
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": text},
                ],
            },
        }
        for i, text in enumerate(texts)
    ]


def write_jsonl(path, entries):
    """Write ``entries`` to ``path`` as JSON Lines (one JSON document per line)."""
    with open(path, 'w') as file:
        for entry in entries:
            file.write(json.dumps(entry) + '\n')


# One request list per prompt; all three share the same custom_id numbering,
# which is the position of the abstract in text_list.
request_list1 = build_requests(q1, text_list)
request_list2 = build_requests(q2, text_list)
request_list3 = build_requests(q3, text_list)

# Write each request file, submit it as a batch, and show the created batch.
write_jsonl(file_path1 + '.jsonl', request_list1)
batch1 = send_request(file_path1 + '.jsonl')
print(batch1)

write_jsonl(file_path2 + '.jsonl', request_list2)
batch2 = send_request(file_path2 + '.jsonl')
print(batch2)

write_jsonl(file_path3 + '.jsonl', request_list3)
batch3 = send_request(file_path3 + '.jsonl')
print(batch3)

Batch(id='batch_6714b8a4c85881908825d66cd0d284bc', completion_window='24h', created_at=1729411236, endpoint='/v1/chat/completions', input_file_id='file-IikAsKlN9swTMMZ2mgCaJ1NX', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1729497636, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
Batch(id='batch_6714b8d247348190a983da2310bec21d', completion_window='24h', created_at=1729411282, endpoint='/v1/chat/completions', input_file_id='file-IiqNkN2K82f5V5vFhlwLj4Cy', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1729497682, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed

**Check Batch Task Status**

In [94]:
# Retrieve batch 1 again and display its request counts.
# NOTE(review): the saved output below shows completed < total, so this batch
# had not finished when the cell was last run - re-run this cell until the
# counts match before relying on the downloaded results.
retrieve_batch1 = check_batch_status(batch1)
retrieve_batch1.request_counts

BatchRequestCounts(completed=2396, failed=0, total=2410)

In [89]:
# Retrieve batch 2 again and display its request counts.
retrieve_batch2 = check_batch_status(batch2)
retrieve_batch2.request_counts

BatchRequestCounts(completed=2410, failed=0, total=2410)

In [95]:
# Retrieve batch 3 again and display its request counts.
# NOTE(review): the saved output below shows completed < total, so this batch
# had not finished when the cell was last run - re-run this cell until the
# counts match before relying on the downloaded results.
retrieve_batch3 = check_batch_status(batch3)
retrieve_batch3.request_counts

BatchRequestCounts(completed=2400, failed=0, total=2410)

**Download Batch Results and Save as JSONL**

In [70]:
# For each batch: save its output file locally, then parse that file into an
# ID/Answer DataFrame.
output_path1 = file_path1 + "_output.jsonl"
output_path2 = file_path2 + "_output.jsonl"
output_path3 = file_path3 + "_output.jsonl"

download_output(retrieve_batch1, output_path1)
answer_df1 = get_answer_list(output_path1)

download_output(retrieve_batch2, output_path2)
answer_df2 = get_answer_list(output_path2)

download_output(retrieve_batch3, output_path3)
answer_df3 = get_answer_list(output_path3)

**Result Processing and Storage**

In [71]:
# (Re)create the result columns, all initially empty. Plain assignment is
# used instead of DataFrame.insert so this cell can be re-run: insert raises
# ValueError when the column already exists, whereas assignment appends a new
# column at the end on the first run and simply resets it on later runs.
for result_column in ('reserve', 'answer1', 'answer2', 'answer3'):
    test_research[result_column] = None

In [None]:
def _answer_values(raw_answer):
    """Decode one model answer (a JSON object string) into its values, in key order, as strings."""
    return [str(value) for value in json.loads(raw_answer).values()]


def _is_reserved(a1, a2, a3):
    """Return True when an abstract passes the screening rule.

    Rule: check 1 of prompt 1 contains '1', at least one of checks 2-4 of
    prompt 1 contains '1', and both checks of prompt 2 and the single check
    of prompt 3 contain '0'. Answers with fewer values than expected never
    pass (their raw values are still stored for inspection).
    """
    if len(a1) < 4 or len(a2) < 2 or len(a3) < 1:
        return False
    on_topic = '1' in a1[0]
    mentions_material = any('1' in flag for flag in a1[1:4])
    not_excluded = all('0' in flag for flag in (a2[0], a2[1], a3[0]))
    return on_topic and mentions_material and not_excluded


# Align the three answer sets on the request ID rather than on row position:
# the batches may contain different numbers of completed requests, in which
# case positional zipping would combine answers that belong to different
# abstracts. Only IDs answered in all three batches are processed.
merged_answers = (
    answer_df1.rename(columns={'Answer': 'Answer1'})
    .merge(answer_df2.rename(columns={'Answer': 'Answer2'}), on='ID')
    .merge(answer_df3.rename(columns={'Answer': 'Answer3'}), on='ID')
)

for answer_row in merged_answers.itertuples(index=False):
    row_id = int(answer_row.ID)
    a1 = _answer_values(answer_row.Answer1)
    a2 = _answer_values(answer_row.Answer2)
    a3 = _answer_values(answer_row.Answer3)

    test_research.loc[row_id, 'answer1'] = str(a1)
    test_research.loc[row_id, 'answer2'] = str(a2)
    test_research.loc[row_id, 'answer3'] = str(a3)

    if _is_reserved(a1, a2, a3):
        # Flag the row identified by the request ID (the same row the answers
        # were written to), not the loop position.
        test_research.loc[row_id, 'reserve'] = 'Y'

test_research.to_excel('result2(all).xlsx', index=False)