In [1]:
# Recursively have the model generate reasons from a structure-property relationship dataset
# Batch processing

# Automatically reload imported libraries
%reload_ext autoreload
%autoreload 2

In [2]:
import openai
from tqdm import tqdm
import pandas as pd
import os
import glob
import json
import copy

In [3]:

# Take the OpenAI API key from the environment; a missing variable raises KeyError
openai.api_key = os.environ["OPENAI_API_KEY"]

In [4]:
# Load the dataset and turn it into a list of dicts, one dict per CSV row
csv_path = "dataset/BradleyMeltingPointDataset_clean.csv"
df = pd.read_csv(csv_path)
chemical_records = df.to_dict(orient="records")
# Show the first record as a quick sanity check
chemical_records[:1]

[{'mpC': 87.0,
  'name': '((5-((Diphenylphosphino)methyl)-2,2-dimethyl-1,3-dioxolan-4-yl)methyl)(diphenyl)phosphine',
  'smiles': 'P(CC1OC(OC1CP(c1ccccc1)c1ccccc1)(C)C)(c1ccccc1)c1ccccc1',
  'csid': 109291,
  'link': 'http://dx.doi.org/10.1021/ci0500132',
  'source': 'Karthikeyan M.; Glen R.C.; Bender A. General melting point prediction based on a diverse compound dataset and artificial neural networks. J. Chem. Inf. Model.; 2005; 45(3); 581-4277'}]

In [5]:
# Chat model passed to json_generate by the batch loop
model = "gpt-4-1106-preview"

In [6]:
# System message sent by json_generate with every reason-generation request.
# NOTE(review): the task statement and commands talk about the melting point,
# while the worked example below speaks of a boiling point (benzene 80 ->
# toluene 110). Confirm whether the example should be rewritten with
# melting-point values; the text is left unchanged here because it is runtime
# prompt content.
system_prompt="""
Provide the quantitative Reason and Prediction so that a scientist, who does not know the melting point, can predict the value.

#Commands
- You must quantitatively consider how the melting point shifts, focusing on each functional groups.
- Actual value and Prediction must match each other.
- If Actual value and Prediction differ each other, rethink Reason.
- If Prediction does not contain numbers for each functional group effect, rethink Reason

#Example reason
- Target compound: Toluene
- Basic unit, benzene has a boiling point of 80.
- Methyl group: +30 (due to larger molecular weight)
- Prediction: 110

"""

In [7]:
def gen_prompt(chemical_record, reason="", prediction=""):
    """Build the user prompt describing one compound.

    Args:
        chemical_record: Mapping that provides the "name", "smiles" and "mpC"
            entries of the compound.
        reason: Reason text to embed in the prompt (empty by default).
        prediction: Prediction text to embed in the prompt (empty by default).

    Returns:
        The prompt string: a "#Data" section listing the compound fields, the
        reason and the prediction, followed by the requested JSON output keys.

    Raises:
        KeyError: If "name", "smiles" or "mpC" is missing from the record.
    """
    # The prompt starts and ends with a newline, hence the empty first and last
    # items; the SMILES line intentionally keeps its trailing space.
    prompt_lines = [
        "",
        "#Data",
        f"-Name: {chemical_record['name']}",
        f"-SMILES: {chemical_record['smiles']} ",
        f"-Actual value: {chemical_record['mpC']}",
        f"-Reason: {reason}",
        f"-Prediction: {prediction}",
        "",
        "#Output (JSON keys)",
        "- Reason, Prediction",
        "",
    ]
    return "\n".join(prompt_lines)


In [8]:
import json

#ask gpt
def json_generate(prompt, model="gpt-3.5-turbo-1106"):
    """Send `prompt` to the chat model and return its reply parsed as JSON.

    The module-level `system_prompt` is sent as the system message and the
    request asks for a JSON-object response.

    Args:
        prompt: Content of the user message; it is formatted into a string.
        model: Name of the chat model to query.

    Returns:
        The result of `json.loads` applied to the content of the first choice.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"""{prompt}"""},
    ]
    completion = openai.chat.completions.create(
        model=model,
        messages=conversation,
        response_format={"type": "json_object"},
    )
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)


#parse prediction
def prediction_string_to_number(prompt,model="gpt-3.5-turbo-1106"):
    """Ask the chat model to reduce a free-text prediction to one integer.

    The system message carries few-shot examples of the extraction (a range is
    averaged, a single value is rounded); the user message is the prediction
    text followed by the requested JSON output key.

    Args:
        prompt: Prediction text to extract the number from; it is formatted
            into a string.
        model: Name of the chat model to query.

    Returns:
        The result of `json.loads` applied to the content of the first choice.
        The request asks for a JSON object with a "Prediction" key.
    """
    response = openai.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                # Each "Out" must be the rounded value (or rounded average) of
                # its "In", otherwise the examples teach a wrong extraction.
                "content": """Extract integer from prediction. Use average if multiple numbers are included.
            Examples:
            In: 70.2 - 75.2 degrees Celsius
            Out: 73
            In: 75.2 degrees Celsius
            Out: 75
            In: For 1-naphthalenecarboxaldehyde, starting with the base value for naphthalene with a melting point of 80\u00b0C and subtracting the estimated aldehyde effect of approximately -47 to -50\u00b0C, the predicted melting point would be in the range of 30-33\u00b0C.
            Out: 32
            """,
            },
            {
                "role": "user",
                "content": f"""{prompt}
#Output (JSON keys)
- Prediction"""
            }
        ],
        response_format={ "type": "json_object" }
    )

    return (json.loads(response.choices[0].message.content))

In [9]:
#t=prediction_string_to_number("Considering a starting point of 80\u00b0C for naphthalene and accounting for the influence of the aldehyde functional group, which can reduce the melting point by 47 to 50\u00b0C, the estimated melting point for 1-naphthalenecarboxaldehyde is around 30 to 33\u00b0C, closely aligning with the actual value of 33.5\u00b0C.")
#t

In [10]:

save_base_path="dataset/231225AutoReasoning/"

# Create the output directory up front: without it the batch loop would only
# fail when writing its first result, after the API calls were already made.
os.makedirs(save_base_path, exist_ok=True)

# Load finished records. Each JSON file holds the history list of one compound;
# the "name" of its first entry is used as the lookup key.
gen_records={}
gen_json_path_list=glob.glob(save_base_path+"*.json")
for gen_json_path in tqdm(gen_json_path_list):
    with open(gen_json_path) as f:
        gen_hist=json.load(f)
    # An empty history has no first entry to read the name from; skip it so the
    # compound is simply treated as not generated yet.
    if not gen_hist:
        continue
    gen_records[gen_hist[0]["name"]]=gen_hist

100%|██████████| 9/9 [00:00<00:00, 29817.33it/s]


In [11]:
import re

def remove_non_alphabet_characters(s):
    """Return `s` with every character other than a-z / A-Z removed.

    Args:
        s: Input string.

    Returns:
        The ASCII letters of `s` in their original order; an empty string when
        `s` contains none.
    """
    letters_only = re.sub('[^a-zA-Z]', '', s)
    return letters_only

In [12]:
# Number of refinement rounds within one attempt (inner loop of the batch run)
n_recursion = 2
# Number of attempts per compound; each one restarts from an empty Reason/Prediction
n_random_repeat = 3
# Largest |predicted - actual| difference that still counts as a good result
error_threshold = 10

In [13]:
# Batch run: build a reasoning history for every compound and save it as JSON
import hashlib  # only needed here, to disambiguate output file names

for chemical_record in tqdm(chemical_records):

    # Work on a copy so the source record itself is never modified
    gen_record=copy.deepcopy(chemical_record)

    # Skip if already generated
    if gen_record["name"] in gen_records:
        print(f"Skip because already generated: {gen_record['name']}")
        continue

    record_history=[]

    fin_flag=False
    # Make up to n_random_repeat independent attempts
    for j in range(n_random_repeat):
        if fin_flag:
            break

        # Every attempt restarts from an empty reason and prediction
        gen_record["Reason"]=""
        gen_record["Prediction"]=""
        if j==0:
            # First history entry: the bare record before any generation
            record_history.append(copy.deepcopy(gen_record))

        # Improve the reasoning by feeding the previous answer back in
        for i in range(n_recursion):
            r=json_generate(
                gen_prompt(gen_record,
                        reason=gen_record["Reason"],
                        prediction=gen_record["Prediction"]
                ),
                model=model,
            )
            gen_record.update(r)
            # Parse the prediction string to a number (the helper is called
            # with its own default model)
            try:
                gen_record["Prediction(integer)"]=float(prediction_string_to_number(gen_record["Prediction"])["Prediction"])
            except Exception as parse_error:
                # Best effort: fall back to a sentinel far from any real value
                # so the round counts as "not close". Only Exception is caught,
                # so Ctrl-C / kernel interrupt still stops the batch.
                print(f"Could not parse prediction for {gen_record['name']}: {parse_error!r}")
                gen_record["Prediction(integer)"]=99999
            record_history.append(copy.deepcopy(gen_record))

            # Finish reasoning if the prediction is close to the actual value
            if abs(gen_record["Prediction(integer)"]-gen_record["mpC"])<=error_threshold:
                fin_flag=True
                print(f"Finished because good reasoning was achieved: {gen_record['name']}")
                break

    # Save
    save_name=remove_non_alphabet_characters(gen_record["name"])
    save_path=save_base_path+f"{save_name}.json"
    # The file name keeps only letters, so different compounds (for example
    # names differing only in signs or digits) can map to the same file, and a
    # name without letters maps to ".json", which the "*.json" glob used for
    # reloading does not match. In both cases add a short hash of the full name
    # instead of overwriting or hiding a result.
    if not save_name or os.path.exists(save_path):
        name_digest=hashlib.sha256(gen_record["name"].encode("utf-8")).hexdigest()[:8]
        save_path=save_base_path+f"{save_name}_{name_digest}.json"
    with open(save_path, 'w') as f:
        json.dump(record_history, f, indent=4)

    gen_records[gen_record["name"]]=record_history

  0%|          | 1/24889 [01:39<685:12:18, 99.11s/it]

Finished because good reasoning was achieved: ((5-((Diphenylphosphino)methyl)-2,2-dimethyl-1,3-dioxolan-4-yl)methyl)(diphenyl)phosphine


  0%|          | 2/24889 [02:21<455:02:07, 65.82s/it]

Finished because good reasoning was achieved: (+)-alpha-pinene


  0%|          | 3/24889 [02:45<321:37:31, 46.53s/it]

Finished because good reasoning was achieved: (+)-camphene


  0%|          | 4/24889 [02:57<229:27:30, 33.19s/it]

Finished because good reasoning was achieved: (+)-fenchol


  0%|          | 5/24889 [04:18<348:23:36, 50.40s/it]

Finished because good reasoning was achieved: (+)-isomenthol


  0%|          | 7/24889 [08:40<595:14:40, 86.12s/it] 

Finished because good reasoning was achieved: (+-)camphor


  0%|          | 8/24889 [09:03<455:36:24, 65.92s/it]

Finished because good reasoning was achieved: (+/+)-ibuprofen


  0%|          | 9/24889 [12:54<812:20:17, 117.54s/it]

Finished because good reasoning was achieved: (+/+)-indoprofen


  0%|          | 11/24889 [16:20<741:42:39, 107.33s/it]

Finished because good reasoning was achieved: (-)-2-amino-3-phenyl-1-propanol


  0%|          | 12/24889 [16:49<575:36:01, 83.30s/it] 

Finished because good reasoning was achieved: (-)-beta-caryophyllene epoxide


  0%|          | 13/24889 [17:32<490:53:33, 71.04s/it]

Finished because good reasoning was achieved: (-)-levonorgestrel


  0%|          | 14/24889 [17:47<374:42:26, 54.23s/it]

Finished because good reasoning was achieved: (-)-menthone


  0%|          | 15/24889 [18:14<317:01:08, 45.88s/it]

Finished because good reasoning was achieved: (1'S,3aR,4R,6'S,7aR,9'R,10'S)-7a,10'-Dimethyltetrahydro-1H,11'H-spiro[2-benzofuran-4,5'-[3]oxatricyclo[7.2.1.0~1,6~]dodecane]-2',3,11'(3aH)-trione


  0%|          | 16/24889 [21:06<580:45:55, 84.06s/it]

Finished because good reasoning was achieved: (1,1'-binaphthalene)-2,2'-diol


  0%|          | 17/24889 [21:57<511:59:32, 74.11s/it]

Finished because good reasoning was achieved: (1,1'-biphenyl)-2,5-diol, diacetate


  0%|          | 18/24889 [22:04<372:46:03, 53.96s/it]

Finished because good reasoning was achieved: (1,1'-biphenyl)-4,4'-dicarbonitrile


  0%|          | 19/24889 [22:40<333:58:35, 48.34s/it]

Finished because good reasoning was achieved: (1,1'-biphenyl)-4-ol, 3-amino-


  0%|          | 20/24889 [24:02<404:11:20, 58.51s/it]

Finished because good reasoning was achieved: (1,2,2,3-tetramethylcyclopentyl)methanol


  0%|          | 21/24889 [26:00<527:01:49, 76.30s/it]

Finished because good reasoning was achieved: (1,2,2,3-tetramethylcyclopentyl)methyl 4,7,7-trimethyl-3-oxobicyclo[2.2.1]heptane-2-carboxylate


In [None]:
# Rough cost estimate, using the record left in `gen_record` by the batch loop.
# Space-separated words stand in for tokens, so the figures are approximate.
t = gen_prompt(
    gen_record,
    reason=gen_record["Reason"],
    prediction=gen_record["Prediction"],
)
user_len = len(t.split(" "))
system_len = len(system_prompt.split(" "))
input_len = user_len + system_len

# Prices are given per 1000 units of input / output text
input_cost = 0.01 / 1000 * input_len

gen_len = sum(len(gen_record[key].split(" ")) for key in ("Reason", "Prediction"))
output_cost = 0.03 / 1000 * gen_len

n_trials = 2
cost = n_trials * (input_cost + output_cost)
print(f"Input tokens: {input_len}")
print(f"Output tokens: {gen_len}")
print(f"Cost: {cost} USD")
print(f"Cost: {cost*150} JP")

Input tokens: 298
Output tokens: 205
Cost: 0.01826 USD
Cost: 2.739 JP
