In [9]:
# Run the whole pipeline once, end to end

# Automatically reload imported modules so edits to project code are picked up
%reload_ext autoreload
%autoreload 2

In [10]:
import pandas as pd
import openai
from tqdm import tqdm
from src.utils.GPT import json_generate
from src.qra_gen import generate_reason_and_predict
import os

In [11]:
# Take the OpenAI API key from the environment (raises KeyError if it is unset)
openai.api_key = os.environ["OPENAI_API_KEY"]

In [12]:
# Load the practice dataset and turn each CSV row into a dict
csv_path = "dataset/BradleyMeltingPointDataset_practice.csv"
df = pd.read_csv(csv_path)
chemical_records = df.to_dict("records")
# Show only the first record as a sanity check
chemical_records[:1]

[{'mpC': 152.0,
  'name': '3-phthalimidopropionic acid',
  'smiles': 'c1ccc2c(c1)C(=O)N(C2=O)CCC(=O)O',
  'csid': 69310,
  'link': 'http://www.alfa.com/en/GP100W.pgm?DSSTK=L13535',
  'source': 'Alfa Aesar',
  'Value': 425.0}]

In [17]:
# Change the prompts in this cell to alter the experiment

# Instruction for generating the question text (here: the quantitative reasoning)
qa_gen_command="""
Provide the quantitative reasons within 300 words so that a scientist, who does not know the melting point, can predict the value.
We must quantitatively consider how the melting point shifts.
I absolutely forbid you to make qualitative generalizations.

#Bad example reasons
## Its molecular weight compared to simpler aromatic compounds, contributing to a higher melting point. (qualitative discussion is practically meaningless!!!)
## Therefore, the compound has a melting point of 110°C (Never include the answer in the reason!!).

#Good example reasons
## Benzene has a boiling point of 80 degrees. Methyl group in toluene improves the value by about +30 degrees due to its larger molecular weight.
## Butane has a boiling point of -1°C, Hydroxy group in butanol will increase the value about +115°C due to the hydrogen bonding.

#Output: Reason key
"""

# Instruction for predicting the physical property from the molecular structure
predict_command="""
Predict the melting point [°C] of the following compound.
In any case, only output some integer value.

#Good examples
## 104
## -29

#Output: Value key
"""

# Use GPT-3.5 completion (per the original note; json_generate's default model
# is not visible in this notebook -- TODO confirm)
llm_ask_func=json_generate

# GPT-4 alternative: uncomment to route requests to a specific model
#def llm_ask_func(prompt,model="gpt-4-1106-preview"):
#    return json_generate(prompt,model)

# If gen_reason=True, generate a reason and then predict
# If gen_reason=False, only predict without a reason (control condition)
gen_reason=True
#gen_reason=False

# Number of predictions to run (passed on as n_trials)
n_trials=1

In [20]:

# Run reason generation / prediction for every compound in the dataset and
# collect one result record per compound.
completed_records = []
for record in tqdm(chemical_records):
    try:
        completed_record = generate_reason_and_predict(
            record["name"],
            record["mpC"],
            qa_gen_command,
            predict_command,
            # Use the backend selected in the configuration cell. The original
            # hard-coded json_generate here, so switching llm_ask_func (e.g. to
            # the GPT-4 wrapper) silently had no effect.
            llm_ask_func=llm_ask_func,
            smiles=record["smiles"],
            gen_reason=gen_reason,
            n_trials=n_trials,
        )
    except Exception as e:
        # Best effort: report the failure and keep the raw input record so the
        # output list still has one entry per compound.
        print(e)
        completed_record = record

    completed_records.append(completed_record)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:09<?, ?it/s]


In [21]:
# Display the record left in `completed_record` by the most recent loop iteration
completed_record

{'name': '3-phthalimidopropionic acid',
 'value': 152.0,
 'smiles': 'c1ccc2c(c1)C(=O)N(C2=O)CCC(=O)O',
 'generated_reason': 'Phthalimide has a melting point of 131-134 °C. The addition of a propionic acid group is expected to increase the melting point due to the larger molecular weight and potential for intermolecular interactions, shifting the value by approximately +18-21 °C.',
 'predicted_values': [152]}

In [16]:
# Save the collected records to a timestamped JSON file
import datetime
import json
import os

study_name = "1216_practice10"
now = datetime.datetime.now()
now_str = now.strftime("%Y%m%d_%H%M%S")
save_path = f"results/{study_name}_{now_str}.json"

# Create the output directory on first use; open() does not create parents.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# ensure_ascii=False writes non-ASCII characters (e.g. "°C") verbatim, so the
# file encoding must be fixed to UTF-8 instead of the platform default.
with open(save_path, "w", encoding="utf-8") as f:
    json.dump(completed_records, f, indent=4, ensure_ascii=False)