In [19]:
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from numpy.linalg import norm
import random
import os
import openai
from openai import OpenAI
import csv
from neo4j import GraphDatabase
from dotenv import load_dotenv 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [9]:
train_set_path = "train_set.csv"
premodel_path = "premodel_S.pth"
custom_env_path = "KEY.env"

In [10]:
# Retrieve the API key and base URL from environment variables
if load_dotenv(dotenv_path=custom_env_path):
    print(f" has successfully loaded the environment variable from '{custom_env_path}'." )
else:
    print(f" Warning: The environment variable failed to be loaded from '{custom_env_path}'. Please check whether the file exists and the path is correct.")

api_key = os.getenv("OPENAI_API_KEY")
base_url = os.getenv("OPENAI_BASE_URL")
neo4j_url = os.getenv("neo4j_url")
neo4j_username = os.getenv("neo4j_username")
neo4j_password = os.getenv("neo4j_password")

# Check if the environment variables were loaded correctly (optional but good practice)
if not api_key:
    raise ValueError("OPENAI_API_KEY not found in '{custom_env_path}' file or environment variables.")
if not base_url:
    raise ValueError("OPENAI_BASE_URL not found in '{custom_env_path}' file or environment variables.")

# Initialize the OpenAI client
client = OpenAI(
    api_key=api_key,
    base_url=base_url
)
print("OpenAIkey:", api_key)
print("OpenAIbase_url:", base_url)
print("neo4j_url:", neo4j_url)
print("neo4j_username:", neo4j_username)
print("neo4j_password:", neo4j_password)

 has successfully loaded the environment variable from 'KEY.env'.
OpenAIkey: sk-zr7mcxQGC7slqAQiAe2Hj8KSuQsY7JMrS2a0KbuNFJdoCMY3
OpenAIbase_url: https://api.chatanywhere.tech/v1
neo4j_url: bolt://localhost:7687
neo4j_username: neo4j
neo4j_password: abc6716518


In [11]:
def seed_everything(seed: int):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed) 
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed) 
    print(f"Set global seed to {seed}")


SEED = 42
seed_everything(SEED)

Set global seed to 42


In [12]:
train_set = pd.read_csv(train_set_path)
print(train_set.head())

                                   catalyst_material  \
0  [0.005456375889480114, 0.00605750223621726, -0...   
1  [-0.018293503671884537, -0.01903114654123783, ...   
2  [0.012553650885820389, 0.0008302437490783632, ...   
3  [0.0012598189059644938, -0.0017349310219287872...   
4  [0.001034828252159059, 0.016570936888456345, -...   

                                             oxidant  reaction_pressure  \
0  [0.008895903825759888, -0.01483992487192154, -...            1.00000   
1  [0.008895903825759888, -0.01483992487192154, -...            1.00000   
2  [0.008895903825759888, -0.01483992487192154, -...            1.01325   
3  [0.008895903825759888, -0.01483992487192154, -...            1.00000   
4  [0.0013591507449746132, -0.007010529283434153,...            1.00000   

                                      target_product  publication_year  \
0  [-0.003155792597681284, 0.007325398735702038, ...              2021   
1  [-0.006440490949898958, 0.005384290125221014, ...            

In [13]:
def get_embedding(text):
    response = client.embeddings.create(
        input=text,
        model="text-embedding-ada-002" 
    ).data[0].embedding
    embedding = np.array(response)
    return embedding

In [14]:
def convert_str_to_array(data):
    
    if isinstance(data, str):
        return np.array(eval(data))  
    else:
        return np.array(data)  

In [15]:
# Neo4j connection
uri = neo4j_url  # default bolt URI or change to your URI
username = neo4j_username
password = neo4j_password

driver = GraphDatabase.driver(uri, auth=(username, password))

def create_node_and_relationship(tx, catalyst_material, oxidant, target_product, reaction_pressure, reaction_temperature, reaction_time, publication_year, target_product_yield, target_product_selectivity, target_product_conversion, liquid_composition_and_conditions):
    # Cypher query to merge nodes and relationships
    query = (
        "MERGE (a:Catalytic_Material {name: $catalyst_material}) "
        "MERGE (b:target_product_Conversion_Rate {conversion: $target_product_conversion}) "
        "MERGE (c:Oxidizer {name: $oxidant}) "
        "MERGE (d:Reaction_Pressure {pressure: $reaction_pressure}) "
        "MERGE (e:Reaction_Temperature {temperature: $reaction_temperature}) "
        "MERGE (f:Reaction_Time {time: $reaction_time}) "
        "MERGE (g:Liquid_Composition_and_Conditions {condition: $liquid_composition_and_conditions}) "
        "MERGE (h:Target_Product {name: $target_product}) "
        "MERGE (i:Target_Product_Yield {Yield: $target_product_yield}) "
        "MERGE (j:Target_Product_Selectivity {selectivity: $target_product_selectivity}) "
        "MERGE (k:publication_year {name: $publication_year}) "
        "MERGE (a)-[:conversion_to]->(b) "
        "MERGE (a)-[:appear_in]->(k) "
        "MERGE (a)-[:methane_to]->(h) "
        "MERGE (b)-[:conversion_rel_oxidant]->(c) "
        "MERGE (b)-[:conversion_rel_pressure]->(d) "
        "MERGE (b)-[:conversion_rel_temperature]->(e) "
        "MERGE (b)-[:conversion_rel_time]->(f) "
        "MERGE (b)-[:conversion_rel_liquid]->(g) "
        "MERGE (c)-[:add_oxidant]->(a) "
        "MERGE (d)-[:reaction_pressure]->(a) "
        "MERGE (e)-[:reaction_temperature]->(a) "
        "MERGE (f)-[:reaction_time]->(a) "
        "MERGE (g)-[:reaction_liquid]->(a) "
        "MERGE (h)-[:product_rel_oxidant]->(c) "
        "MERGE (h)-[:product_rel_pressure]->(d) "
        "MERGE (h)-[:product_rel_temperature]->(e) "
        "MERGE (h)-[:product_rel_time]->(f) "
        "MERGE (h)-[:product_rel_liquid]->(g) "
        "MERGE (h)-[:yield_to]->(i) "
        "MERGE (h)-[:selectivity_to]->(j) "
        "MERGE (i)-[:yield_rel_oxidant]->(c) "
        "MERGE (i)-[:yield_rel_pressure]->(d) "
        "MERGE (i)-[:yield_rel_temperature]->(e) "
        "MERGE (i)-[:yield_rel_time]->(f) "
        "MERGE (i)-[:yield_rel_liquid]->(g) "
        "MERGE (i)-[:yield_rel_material]->(a) "
        "MERGE (j)-[:selectivity_rel_oxidant]->(c) "
        "MERGE (j)-[:selectivity_rel_pressure]->(d) "
        "MERGE (j)-[:selectivity_rel_temperature]->(e) "
        "MERGE (j)-[:selectivity_rel_time]->(f) "
        "MERGE (j)-[:selectivity_rel_liquid]->(g) "
        "MERGE (j)-[:selectivity_rel_material]->(a)"
    )

    # Execute the query with all required parameters
    tx.run(query, 
           catalyst_material=catalyst_material, 
           oxidant=oxidant, 
           target_product=target_product, 
           reaction_pressure=reaction_pressure, 
           reaction_temperature=reaction_temperature, 
           reaction_time=reaction_time, 
           publication_year=publication_year, 
           target_product_yield=target_product_yield, 
           target_product_selectivity=target_product_selectivity, 
           target_product_conversion=target_product_conversion, 
           liquid_composition_and_conditions=liquid_composition_and_conditions
    )

def _delete_all_data_tx(tx):
    """一个事务函数，用于删除所有节点和关系。"""
    print("Executing Cypher to delete all existing nodes and relationships: MATCH (n) DETACH DELETE n")
    tx.run("MATCH (n) DETACH DELETE n")
    print("All existing nodes and relationships have been deleted.")

def load_to_neo4j(csv_file_path):
    with open(csv_file_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)

        with driver.session(database="neo4j") as session: 
            print("Attempting to delete all existing nodes and relationships...")
            try:
                session.execute_write(_delete_all_data_tx)
            except Exception as e:
                print(f"Error during deletion of existing data: {e}")
                print("Aborting data loading process due to error in deletion.")
                return
        
        with driver.session() as session:
            for row in reader:
                try:
                    
                    catalyst_material = row['catalyst_material']
                    oxidant = row['oxidant']
                    target_product = row['target_product']
                    reaction_pressure = float(row['reaction_pressure'])  
                    reaction_temperature = float(row['reaction_temperature'])  
                    reaction_time = float(row['reaction_time'])  
                    publication_year = int(row['publication_year'])  
                    target_product_yield = float(row['target_product_yield'])  
                    target_product_selectivity = float(row['target_product_selectivity'])  
                    target_product_conversion = float(row['target_product_conversion'])  
                    liquid_composition_and_conditions = row['liquid_composition_and_conditions']
                    
                    
                    session.write_transaction(
                        create_node_and_relationship, 
                        catalyst_material, 
                        oxidant, 
                        target_product, 
                        reaction_pressure, 
                        reaction_temperature, 
                        reaction_time, 
                        publication_year, 
                        target_product_yield, 
                        target_product_selectivity, 
                        target_product_conversion, 
                        liquid_composition_and_conditions
                    )
                except Exception as e:
                    print(f"Error processing row: {row}. Error: {e}")

In [16]:
class MultiInputModel(nn.Module):
    def __init__(self, input_dim1=1536, input_dim2=1536, input_dim3=1, input_dim4=1536, input_dim5=1, input_dim6=1, output_dim=1):
        super(MultiInputModel, self).__init__()
        # 1536 → 1536 → 1
        self.fc1_1 = nn.Linear(input_dim1, 1536)
        self.fc1_1_tanh = nn.Tanh()
        self.fc1_2 = nn.Linear(1536, 1)
        self.fc1_2_tanh = nn.Tanh()

        self.fc2_1 = nn.Linear(input_dim2, 1536)
        self.fc2_1_tanh = nn.Tanh()
        self.fc2_2 = nn.Linear(1536, 1)
        self.fc2_2_tanh = nn.Tanh()

        self.fc4_1 = nn.Linear(input_dim4, 1536)
        self.fc4_1_tanh = nn.Tanh()
        self.fc4_2 = nn.Linear(1536, 1)
        self.fc4_2_tanh = nn.Tanh()

        
        
        # 6 → 30 → 30 → 30 → 10 → 1
        self.fc01 = nn.Linear(6, 30)
        self.fc02 = nn.Linear(30, 30)
        self.fc03 = nn.Linear(30, 30)
        self.fc04 = nn.Linear(30, 30)
        self.fc05 = nn.Linear(30, 10)
        self.fc06 = nn.Linear(10, output_dim)

    
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input1, input2, input3, input4, input5, input6):
        # 1536 to 1 tanh
        x1 = self.fc1_1(input1)
        x1 = self.fc1_1_tanh(x1)
        x1 = self.fc1_2(x1)
        x1 = self.fc1_2_tanh(x1)

        x2 = self.fc2_1(input2)
        x2 = self.fc2_1_tanh(x2)
        x2 = self.fc2_2(x2)
        x2 = self.fc2_2_tanh(x2)

        x4 = self.fc4_1(input4)
        x4 = self.fc4_1_tanh(x4)
        x4 = self.fc4_2(x4)
        x4 = self.fc4_2_tanh(x4)
        
        x3 = input3  #  (batch_size, 1)
        x5 = input5  #  (batch_size, 1)
        x6 = input6  #  (batch_size, 1)

        
        x = torch.cat((x1, x2, x3, x4, x5, x6), dim=1)  # (batch_size, 6)

        # DNN架构：6 → 30 → 30 → 10 → 1
        x = self.relu(self.fc01(x))  # 6 → 30
        #x = self.dropout(x)
        x = self.relu(self.fc02(x))  # 30 → 30
        #x = self.dropout(x)
        x = self.relu(self.fc03(x))  # 30 → 30
        #x = self.dropout(x)
        x = self.relu(self.fc04(x))  # 30 → 30
        #x = self.dropout(x)
        x = self.relu(self.fc05(x))  # 30 → 10
        x = self.fc06(x)  # 10 → 1

        x = self.sigmoid(x) * 100

        return x

In [17]:
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

def searching(catalyst, oxidant, pressure, target, temperature, time, trainsearch):
    catalyst_embedding = get_embedding(catalyst)
    
    def convert_to_array(embedding_str):
        if isinstance(embedding_str, str):
            return np.array(eval(embedding_str))  
        else:
            return np.array(embedding_str)  

    A = trainsearch['catalyst_material'].apply(convert_to_array)
    
    cosine_similarities = np.array([cosine_similarity(x, catalyst_embedding) for x in A])
    
    top_10_indices = np.argsort(cosine_similarities)[-10:][::-1]
    
    top_10_rows = trainsearch.iloc[top_10_indices]
    
    return top_10_rows

# retrain and predict   
def train_and_predict(catalyst0, oxidant0, pressure0, target_product0, temperature0, time0, train_set, num_epochs=500, batch_size=128):
    
    retrain = searching(catalyst0, oxidant0, pressure0, target_product0, temperature0, time0, train_set)
    retrain.to_csv('process.csv', index=False)
    load_to_neo4j('process.csv')
    retrained = retrain.iloc[:, :-3] 
    catalyst = np.array(retrained.iloc[:, 0].tolist())  
    oxide = np.array(retrained.iloc[:, 1].tolist())  
    pressure = np.array(retrained.iloc[:, 2].tolist())  
    product = np.array(retrained.iloc[:, 3].tolist())  
    temperature = np.array(retrained.iloc[:, 5].tolist())  
    time = np.array(retrained.iloc[:, 6].tolist())  
    selectivity = np.array(retrained.iloc[:, 9].tolist())  

    reinput1 = np.array([convert_str_to_array(x) for x in catalyst])
    reinput2 = np.array([convert_str_to_array(x) for x in oxide])
    reinput4 = np.array([convert_str_to_array(x) for x in product])

    reinput3 = pressure.reshape(-1, 1)
    reinput5 = temperature.reshape(-1, 1)
    reinput6 = time.reshape(-1, 1)
    relabels = selectivity.reshape(-1, 1)

   
    input_1_train_tensor = torch.tensor(reinput1, dtype=torch.float32).to(device)
    input_2_train_tensor = torch.tensor(reinput2, dtype=torch.float32).to(device)
    input_3_train_tensor = torch.tensor(reinput3, dtype=torch.float32).to(device)
    input_4_train_tensor = torch.tensor(reinput4, dtype=torch.float32).to(device)
    input_5_train_tensor = torch.tensor(reinput5, dtype=torch.float32).to(device)
    input_6_train_tensor = torch.tensor(reinput6, dtype=torch.float32).to(device)
    labels_train_tensor = torch.tensor(relabels, dtype=torch.float32).to(device)

    
    model = MultiInputModel().to(device)
    model.load_state_dict(torch.load(premodel_path))
    
    
    def freeze_model_except_last_layers(model):
        for name, param in model.named_parameters():
            if 'fc05' not in name and 'fc06' not in name:
                param.requires_grad = False
    
    freeze_model_except_last_layers(model)

    
    criterion = nn.HuberLoss(delta=10)  
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)

    
    train_dataset = torch.utils.data.TensorDataset(input_1_train_tensor, input_2_train_tensor, input_3_train_tensor, input_4_train_tensor, input_5_train_tensor, input_6_train_tensor, labels_train_tensor)
    dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

   
    for epoch in range(num_epochs):
        model.train()  
        for batch in dataloader:
            inputs1, inputs2, inputs3, inputs4, inputs5, inputs6, labels = batch
            outputs = model(inputs1, inputs2, inputs3, inputs4, inputs5, inputs6)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    
    model.eval()  
    with torch.no_grad():
        print (catalyst0)
        catalyst_embedding = torch.tensor(get_embedding(catalyst0), dtype=torch.float32).to(device)
        oxidant_embedding = torch.tensor(get_embedding(oxidant0), dtype=torch.float32).to(device)
        target_product_embedding = torch.tensor(get_embedding(target_product0), dtype=torch.float32).to(device)
        print(catalyst_embedding.shape)
        
        pressure_tensor = torch.tensor([pressure0], dtype=torch.float32).unsqueeze(0).to(device)
        temperature_tensor = torch.tensor([temperature0], dtype=torch.float32).unsqueeze(0).to(device)
        time_tensor = torch.tensor([time0], dtype=torch.float32).unsqueeze(0).to(device)
        print(pressure_tensor.shape)
        
        model_input = (catalyst_embedding.unsqueeze(0), 
                       oxidant_embedding.unsqueeze(0),
                       pressure_tensor,
                       target_product_embedding.unsqueeze(0),
                       temperature_tensor,
                       time_tensor)
        
        prediction = model(*model_input)
        
    #cleaned_catalyst0 = catalyst0.replace('/', '_').replace('"', '').replace("'", '')
    #filename = f"model/C/{cleaned_catalyst0}.pth"
    #torch.save(model.state_dict(), filename)
    return prediction.item()

In [21]:
# example
prediction = train_and_predict('quartz chips', 'O2', 1, 'CH3OH', 300, 0.5, train_set)
print(f'preduction_result: {prediction}')

Attempting to delete all existing nodes and relationships...
Executing Cypher to delete all existing nodes and relationships: MATCH (n) DETACH DELETE n
All existing nodes and relationships have been deleted.


  session.write_transaction(
  model.load_state_dict(torch.load(premodel_path))


Epoch [1/500], Loss: 11.9722
Epoch [2/500], Loss: 6.4114
Epoch [3/500], Loss: 7.5571
Epoch [4/500], Loss: 8.3196
Epoch [5/500], Loss: 6.6039
Epoch [6/500], Loss: 5.2542
Epoch [7/500], Loss: 5.6259
Epoch [8/500], Loss: 6.4629
Epoch [9/500], Loss: 6.4743
Epoch [10/500], Loss: 5.7461
Epoch [11/500], Loss: 5.2652
Epoch [12/500], Loss: 5.2874
Epoch [13/500], Loss: 5.7062
Epoch [14/500], Loss: 5.9303
Epoch [15/500], Loss: 5.7022
Epoch [16/500], Loss: 5.2999
Epoch [17/500], Loss: 5.1236
Epoch [18/500], Loss: 5.2666
Epoch [19/500], Loss: 5.4358
Epoch [20/500], Loss: 5.4278
Epoch [21/500], Loss: 5.2552
Epoch [22/500], Loss: 5.0806
Epoch [23/500], Loss: 5.0385
Epoch [24/500], Loss: 5.1292
Epoch [25/500], Loss: 5.2202
Epoch [26/500], Loss: 5.1955
Epoch [27/500], Loss: 5.0771
Epoch [28/500], Loss: 4.9734
Epoch [29/500], Loss: 4.9598
Epoch [30/500], Loss: 5.1045
Epoch [31/500], Loss: 5.0896
Epoch [32/500], Loss: 4.9398
Epoch [33/500], Loss: 4.9126
Epoch [34/500], Loss: 4.9388
Epoch [35/500], Loss: 