In [2]:
!pip install pandas requests rdkit ipywidgets tqdm openai faiss-cpu

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting openai
  Downloading openai-1.30.5-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 

In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import ipywidgets as widgets
from IPython.display import display, HTML
import io
from tqdm.notebook import tqdm

# Function to generate Morgan fingerprint
def generate_morgan_fingerprint(smiles, radius=2, n_bits=2048):
    """Turn a SMILES string into a Morgan fingerprint expressed as a list of bits.

    Args:
        smiles: Candidate SMILES text; anything that is not a ``str`` is rejected.
        radius: Radius forwarded to the RDKit Morgan fingerprint call.
        n_bits: Requested bit-vector size (forwarded as ``nBits``).

    Returns:
        ``list`` built from the RDKit bit vector, or ``None`` when the input
        is not a string or RDKit returns no molecule for it.
    """
    # Guard clause: non-string cells never reach RDKit.
    if not isinstance(smiles, str):
        return None

    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=n_bits)
    return list(bit_vector)

# Function to process the uploaded file
def process_file(uploaded_file, preview_rows=10):
    """Parse an uploaded CSV, add Morgan fingerprints, preview it and save it.

    Args:
        uploaded_file: Mapping whose 'content' entry holds the raw CSV payload
            as bytes (any bytes-like object, e.g. a memoryview, is accepted).
        preview_rows: Number of leading rows rendered as an HTML table. Pass
            ``None`` to render the whole frame.

    Raises:
        KeyError: If the CSV has no 'SMILES' column.

    Side effects:
        Prints progress/debug information, displays an HTML preview and writes
        'updated_chemical_data.csv' to the current working directory.
    """
    # bytes() normalises bytes-like payloads so .decode() is always available.
    content = bytes(uploaded_file['content'])
    df = pd.read_csv(io.StringIO(content.decode('utf-8')))

    if 'SMILES' not in df.columns:
        raise KeyError("Uploaded CSV must contain a 'SMILES' column")

    # Debugging: Print the first few rows of the dataframe to check the SMILES column
    print("Initial DataFrame:")
    print(df.head())

    # Generate Morgan fingerprints.
    # Missing / non-string cells are mapped to None directly instead of being
    # coerced with astype(str): that coercion turned NaN into the literal text
    # 'nan', which was then parsed as a SMILES and written back to the CSV.
    print("Generating Morgan fingerprints...")
    df['MorganFingerprint'] = df['SMILES'].apply(
        lambda value: generate_morgan_fingerprint(value) if isinstance(value, str) else None
    )

    # Debugging: Check the fingerprints
    print("Morgan Fingerprints:")
    print(df['MorganFingerprint'].head())

    # Display a preview of the updated DataFrame as a table; rendering every
    # row of a large upload would bloat the notebook output.
    preview = df if preview_rows is None else df.head(preview_rows)
    display(HTML(preview.to_html()))

    # Save the updated DataFrame to a new CSV file
    output_file = 'updated_chemical_data.csv'
    df.to_csv(output_file, index=False)
    print(f"File processed and saved as {output_file}")

# Create the file upload widget: the picker is restricted to '.csv' files and
# to a single file per upload (multiple=False).
upload_widget = widgets.FileUpload(accept='.csv', multiple=False)

# Define the function to handle the file upload
def handle_upload(change):
    """Process every file carried by a FileUpload 'value' change event.

    The files are taken from ``change['new']`` rather than from the
    module-level ``upload_widget`` name, so the callback keeps working for the
    widget it was registered on even if that name is rebound later.

    Args:
        change: Change notification passed by ``observe``; ``change['new']``
            is the widget's new value. Both a mapping of filename -> file info
            (the shape this notebook was written against) and a plain
            sequence of file-info dicts are accepted (the latter is presumably
            what newer ipywidgets releases deliver - verify against the
            installed version).
    """
    uploaded = change['new']
    file_infos = uploaded.values() if isinstance(uploaded, dict) else uploaded
    for file_info in file_infos:
        process_file(file_info)

# Invoke handle_upload whenever the widget's 'value' trait changes.
upload_widget.observe(handle_upload, names='value')

# Display the upload widget
display(upload_widget)

FileUpload(value={}, accept='.csv', description='Upload')

Initial DataFrame:
                                       Compound_name       CASRN  \
0                       1-chloro-4-isocyanatobenzene    104-12-1   
1                   2-methylnonyl diphenyl phosphite  26544-23-0   
2                                cyclohexanone oxime    100-64-1   
3  2-(1,3-benzothiazol-2-yldisulfanyl)-1,3-benzot...    120-78-5   
4     N-[3-(morpholin-4-yl)propyl]naphthalen-1-amine   5235-82-5   

                               SMILES                     InChIKey  LD50  \
0                   O=C=Nc1ccc(Cl)cc1  ADAKRBAJFHTIEW-UHFFFAOYSA-N  3505   
1  CC(C)CCCCCCCOP(Oc1ccccc1)Oc1ccccc1  ADRNSOYXKABLGT-UHFFFAOYSA-N  5000   
2                         O=NC1CCCCC1  AFLQDEOAJRGCOW-UHFFFAOYSA-N  5000   
3      c1ccc2sc(SSc3nc4ccccc4s3)nc2c1  AFZSMODLJJCVPP-UHFFFAOYSA-N  7940   
4        c1ccc2c(NCCCN3CCOCC3)cccc2c1  AFZZFBZPHUDOLV-UHFFFAOYSA-N  2000   

   GHS class  Call  
0          5     0  
1          5     0  
2          5     0  
3          5     0  
4         

Unnamed: 0,Compound_name,CASRN,SMILES,InChIKey,LD50,GHS class,Call,MorganFingerprint
0,1-chloro-4-isocyanatobenzene,104-12-1,O=C=Nc1ccc(Cl)cc1,ADAKRBAJFHTIEW-UHFFFAOYSA-N,3505,5,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
1,2-methylnonyl diphenyl phosphite,26544-23-0,CC(C)CCCCCCCOP(Oc1ccccc1)Oc1ccccc1,ADRNSOYXKABLGT-UHFFFAOYSA-N,5000,5,0,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
2,cyclohexanone oxime,100-64-1,O=NC1CCCCC1,AFLQDEOAJRGCOW-UHFFFAOYSA-N,5000,5,0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
3,"2-(1,3-benzothiazol-2-yldisulfanyl)-1,3-benzothiazole",120-78-5,c1ccc2sc(SSc3nc4ccccc4s3)nc2c1,AFZSMODLJJCVPP-UHFFFAOYSA-N,7940,5,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
4,N-[3-(morpholin-4-yl)propyl]naphthalen-1-amine,5235-82-5,c1ccc2c(NCCCN3CCOCC3)cccc2c1,AFZZFBZPHUDOLV-UHFFFAOYSA-N,2000,5,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
5,4-phenylbutan-2-one,2550-26-7,CC(=O)CCc1ccccc1,AKGGYBADQZYZPD-UHFFFAOYSA-N,5000,5,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
6,"pentane-1,5-diol",111-29-5,OCCCCCO,ALQSHHUCVQOPAS-UHFFFAOYSA-N,19800,5,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
7,2-methylpropanal,78-84-2,CC(C)C=O,AMIMRNSIRUDHCM-UHFFFAOYSA-N,2000,5,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
8,pentan-1-ol,71-41-0,CCCCCO,AMQJEAYHLZJPGS-UHFFFAOYSA-N,2860,5,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"


File processed and saved as updated_chemical_data.csv


In [3]:
import io
import os
import time

import faiss
import ipywidgets as widgets
import numpy as np
import openai
import pandas as pd
from IPython.display import display, HTML
from rdkit import Chem
from rdkit.Chem import AllChem

# Install necessary libraries
!pip install openai==0.28

# Set your OpenAI API key.
# SECURITY: a literal secret key used to be hard-coded on this line. Any key
# that has been stored in the notebook must be treated as leaked and revoked.
# The key is now read from the OPENAI_API_KEY environment variable instead.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before running this cell."
    )

# Function to generate embeddings using OpenAI API
def generate_embedding(text, model="text-embedding-3-large"):
    """Request an embedding for ``text`` and return the embedding payload.

    Newline characters are replaced by single spaces before the request is
    made. The text is sent as a one-element input list, and the 'embedding'
    entry of the first item in the response's 'data' list is returned.

    Args:
        text: String to embed.
        model: Name of the embedding model passed to the API.
    """
    single_line = " ".join(text.split("\n"))
    api_result = openai.Embedding.create(input=[single_line], model=model)
    first_item = api_result['data'][0]
    return first_item['embedding']

# Function to generate Morgan fingerprint
def generate_morgan_fingerprint(smiles, radius=2, n_bits=2048):
    """Convert a SMILES string into a Morgan fingerprint given as a list of bits.

    Args:
        smiles: SMILES text. Non-string values (e.g. NaN from an empty CSV
            cell) and blank strings are treated as invalid.
        radius: Radius forwarded to the RDKit Morgan fingerprint call.
        n_bits: Requested bit-vector size (forwarded as ``nBits``).

    Returns:
        ``list`` built from the RDKit bit vector, or ``None`` when the input
        is invalid or RDKit returns no molecule for it.
    """
    # Same type guard as the earlier definition of this function in the
    # notebook; without it a NaN cell is handed straight to RDKit. Blank text
    # is rejected too so an empty search box is reported as invalid
    # (NOTE(review): RDKit appears to accept '' as an atom-less molecule
    # rather than returning None - confirm).
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return list(fingerprint)

# Function to process the uploaded file and create sentences
def process_file_and_create_sentences(uploaded_file):
    """Read an uploaded CSV and add fingerprint, fingerprint-string and sentence columns.

    Args:
        uploaded_file: Mapping whose 'content' entry holds the raw CSV payload
            as bytes (any bytes-like object is accepted).

    Returns:
        The parsed DataFrame with three extra columns: 'MorganFingerprint'
        (list of bits or None), 'FingerprintString' (the bits joined into one
        string, or the text 'None') and 'Sentence' (one descriptive sentence
        per row).

    Raises:
        KeyError: If any column used below is missing from the CSV.

    Side effects:
        Updates the module-level ``progress_label`` and ``progress_bar`` widgets.
    """
    progress_label.value = "Reading and processing the CSV file..."
    # bytes() normalises bytes-like payloads so .decode() is always available.
    content = bytes(uploaded_file['content'])
    df = pd.read_csv(io.StringIO(content.decode('utf-8')))

    # Fail early, with one clear message, instead of part-way through the work.
    required_columns = ['SMILES', 'Compound_name', 'CASRN', 'LD50', 'GHS class', 'InChIKey']
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        progress_label.value = "CSV file is missing required columns."
        raise KeyError(f"Uploaded CSV is missing required columns: {missing_columns}")

    # Generate Morgan fingerprints.
    # The results are collected in a list and assigned once; the former
    # per-row df.at[i, ...] writes relied on the index being 0..n-1, and the
    # artificial time.sleep() per row only slowed real uploads down.
    progress_label.value = "Generating Morgan fingerprints..."
    progress_bar.max = len(df)
    fingerprints = []
    for done, smiles in enumerate(df['SMILES'], start=1):
        # Non-string cells (e.g. NaN for a blank SMILES) get no fingerprint.
        fingerprints.append(
            generate_morgan_fingerprint(smiles) if isinstance(smiles, str) else None
        )
        progress_bar.value = done
    df['MorganFingerprint'] = pd.Series(fingerprints, index=df.index, dtype=object)

    # Convert Morgan fingerprints to binary strings for inclusion in sentences
    progress_label.value = "Converting Morgan fingerprints to binary strings..."
    df['FingerprintString'] = [
        ''.join(map(str, bits)) if bits is not None else 'None' for bits in fingerprints
    ]

    # Define the sentence template
    sentence_template = "{Compound_name} with CASRN {CASRN} has an LD50 of {LD50} mg/kg, is classified as GHS class {GHS_class}, has InChIKey {InChIKey}, and Morgan fingerprint {FingerprintString}."

    # Generate sentences from the DataFrame (a plain list also works for an
    # empty frame, where a row-wise apply would not yield a usable column).
    progress_label.value = "Generating sentences..."
    df['Sentence'] = [
        sentence_template.format(
            Compound_name=name,
            CASRN=casrn,
            LD50=ld50,
            GHS_class=ghs_class,
            InChIKey=inchikey,
            FingerprintString=fingerprint_string,
        )
        for name, casrn, ld50, ghs_class, inchikey, fingerprint_string in zip(
            df['Compound_name'],
            df['CASRN'],
            df['LD50'],
            df['GHS class'],
            df['InChIKey'],
            df['FingerprintString'],
        )
    ]

    # Return the DataFrame with sentences and Morgan fingerprints for further processing
    progress_label.value = "CSV file processed."
    return df

# Embed sentences and store them in a vector database along with InChIKey and MorganFingerprints
def embed_and_store(df):
    """Embed every sentence, append its fingerprint and persist a FAISS index.

    Each stored vector is the sentence embedding followed by the row's Morgan
    fingerprint (all zeros when the row has no fingerprint).

    Args:
        df: DataFrame providing the 'Sentence', 'InChIKey' and
            'MorganFingerprint' columns.

    Side effects:
        Calls ``generate_embedding`` once per row, updates the module-level
        progress widgets and writes 'vector_index.index', 'inchikeys.npy' and
        'compounds_data.csv' to the current working directory. Nothing is
        written when ``df`` has no rows.
    """
    progress_label.value = "Generating embeddings and storing them in the vector database..."
    # Extract sentences and other relevant columns from the DataFrame
    sentences = df['Sentence'].tolist()
    inchikeys = df['InChIKey'].tolist()
    fingerprints = df['MorganFingerprint'].tolist()

    # An empty frame would otherwise fail below when the embedding width is read.
    if not sentences:
        progress_label.value = "No rows to embed; nothing was stored."
        return

    # Generate embeddings for the sentences (no artificial delay per row).
    progress_bar.max = len(sentences)
    embeddings = []
    for done, sentence in enumerate(sentences, start=1):
        embeddings.append(generate_embedding(sentence))
        progress_bar.value = done
    embeddings = np.asarray(embeddings, dtype=np.float32)

    # Build the fingerprint matrix directly as float32. The zero rows for
    # missing fingerprints take their width from the real fingerprints rather
    # than a hard-coded 2048 (which is kept only as the fallback).
    n_bits = next((len(bits) for bits in fingerprints if bits is not None), 2048)
    fingerprint_matrix = np.zeros((len(fingerprints), n_bits), dtype=np.float32)
    for row, bits in enumerate(fingerprints):
        if bits is not None:
            fingerprint_matrix[row] = bits

    # Combine embeddings and fingerprints
    combined_data = np.hstack((embeddings, fingerprint_matrix))

    # Initialize the FAISS index and add the combined data
    index = faiss.IndexFlatL2(combined_data.shape[1])
    index.add(combined_data)

    # Save the index to disk
    faiss.write_index(index, "vector_index.index")
    progress_label.value = "Embeddings and fingerprints stored in the vector database."

    # Save the InChIKeys for reference
    np.save("inchikeys.npy", np.array(inchikeys))
    progress_label.value = "InChIKeys saved."

    # Save the original DataFrame for reference
    df.to_csv("compounds_data.csv", index=False)
    progress_label.value = "Compounds data saved."

# Function to handle the file upload and process the data
def handle_upload_and_process(change):
    """Build sentences and the vector store for every file in an upload event.

    The files are taken from ``change['new']`` rather than from the
    module-level ``upload_widget`` name, so the callback does not depend on
    which widget that name currently refers to.

    Args:
        change: Change notification passed by ``observe``; ``change['new']``
            is the widget's new value. Both a mapping of filename -> file info
            (the shape this notebook was written against) and a plain
            sequence of file-info dicts are accepted (the latter is presumably
            what newer ipywidgets releases deliver - verify against the
            installed version).
    """
    uploaded = change['new']
    file_infos = uploaded.values() if isinstance(uploaded, dict) else uploaded
    for file_info in file_infos:
        df_with_sentences = process_file_and_create_sentences(file_info)
        embed_and_store(df_with_sentences)

# Create the file upload widget ('.csv' only, one file per upload).
# NOTE: this rebinds the module-level name `upload_widget`, which an earlier
# cell of this notebook also assigns.
upload_widget = widgets.FileUpload(accept='.csv', multiple=False)

# Register handle_upload_and_process to run whenever the widget's 'value'
# trait changes.
upload_widget.observe(handle_upload_and_process, names='value')

# Display the upload widget
display(upload_widget)

# Function to search for the nearest compounds and send output to GPT-4
def search_nearest_compounds(smiles, k=5):
    """Find the stored compounds nearest to ``smiles`` and ask GPT-4 for a toxicity call.

    The query vector is built the same way as the stored vectors: an
    embedding of a descriptive sentence followed by the Morgan fingerprint.

    Args:
        smiles: SMILES string of the query compound.
        k: Maximum number of neighbours to retrieve; clamped to the number of
            vectors held by the index.

    Returns:
        None. The model's answer is printed and the outcome is reported
        through the module-level ``progress_label`` widget.

    Side effects:
        Reads 'vector_index.index' and 'compounds_data.csv' from the current
        working directory and calls the OpenAI embedding and chat APIs.
    """
    progress_label.value = "Searching for nearest compounds..."

    # Validate the input before touching the disk or the network. Blank text
    # is rejected here explicitly so an empty search box is reported as invalid.
    fingerprint = None
    if isinstance(smiles, str) and smiles.strip():
        fingerprint = generate_morgan_fingerprint(smiles)
    if fingerprint is None:
        progress_label.value = "Invalid SMILES string."
        return

    # Load the saved FAISS index and the original DataFrame.
    # 'inchikeys.npy' is intentionally not loaded: it was never used by the
    # lookup, and loading it required allow_pickle=True.
    index = faiss.read_index("vector_index.index")
    df = pd.read_csv("compounds_data.csv")

    # Never ask for more neighbours than the index holds: surplus slots come
    # back as -1, which df.iloc would silently read as "the last row".
    k = min(k, index.ntotal)
    if k <= 0:
        progress_label.value = "The vector index is empty."
        return

    # Convert fingerprint to numpy array
    fingerprint_array = np.array(fingerprint, dtype=np.float32)

    # Generate the embedding for the input SMILES
    query_sentence = "Compound with SMILES {} and Morgan fingerprint {}.".format(smiles, ''.join(map(str, fingerprint)))
    embedding_array = np.array(generate_embedding(query_sentence), dtype=np.float32)

    # Combine the embedding and fingerprint
    query_vector = np.hstack((embedding_array, fingerprint_array)).reshape(1, -1)

    # Search for the nearest compounds
    distances, indices = index.search(query_vector, k)

    # Extract the nearest compounds (defensively dropping any -1 placeholders)
    hit_positions = [int(position) for position in indices[0] if position >= 0]
    nearest_compounds = df.iloc[hit_positions]
    compound_info = nearest_compounds[['Compound_name', 'LD50', 'GHS class']]
    n_found = len(compound_info)

    # Create the prompt for GPT-4; the compound count follows the actual
    # number of hits instead of a hard-coded 5.
    prompt = f"You are an expert in chemical toxicology and your job is to take the {n_found} compounds that are the most similar to an unknown compound and provide the following:\n\n"
    prompt += compound_info.to_string(index=False, header=False)
    prompt += "\n\nThen, using that information, predict if the unknown compound is highly toxic, toxic, moderately toxic, slightly toxic, or non-toxic."

    # Send the prompt to GPT-4
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": f"You are an expert in chemical toxicology and your job is to take the {n_found} compounds that are the most similar to a unknown compound and provide the following:\n\nCompound name\nLD50\nGHS Class\n\nThen, using that information, predict if the unknown compound is highly toxic, toxic, moderately toxic, slightly toxic, or non-toxic."},
            {"role": "user", "content": prompt}
        ]
    )

    # Print the response from GPT-4
    print(response['choices'][0]['message']['content'])
    progress_label.value = "Nearest compounds found and prediction made."

# Create a text input widget for SMILES.
# It starts empty; its current text is read by on_search_button_click.
smiles_input = widgets.Text(
    value='',
    placeholder='Enter SMILES string',
    description='SMILES:',
    disabled=False
)

# Create a button to perform the search.
# The click handler is attached separately via search_button.on_click(...).
search_button = widgets.Button(
    description='Search',
    disabled=False,
    button_style='',
    tooltip='Click to search for nearest compounds',
    icon='search'
)

# Define the function to handle the search button click
def on_search_button_click(b):
    """Run the nearest-compound search on the SMILES box's current text.

    Args:
        b: The clicked button as passed by the click callback (unused).
    """
    search_nearest_compounds(smiles_input.value)

# Run on_search_button_click each time the Search button is clicked.
search_button.on_click(on_search_button_click)

# Display the SMILES input and search button
display(smiles_input, search_button)

# Create progress bar and label.
# These module-level widgets are the ones the processing and search functions
# above write to (progress_bar.max / .value, progress_label.value), so they
# must exist before any upload or search callback fires.
progress_bar = widgets.IntProgress(
    value=0,
    min=0,
    max=100,
    step=1,
    description='Progress:',
    bar_style='info',
    orientation='horizontal'
)
progress_label = widgets.Label(value="")

# Display progress bar and label
display(progress_bar, progress_label)




FileUpload(value={}, accept='.csv', description='Upload')

Text(value='', description='SMILES:', placeholder='Enter SMILES string')

Button(description='Search', icon='search', style=ButtonStyle(), tooltip='Click to search for nearest compound…

IntProgress(value=0, bar_style='info', description='Progress:')

Label(value='')

Based on the LD50 values of the closest compounds, and their GHS classes, we have a range from 275 mg/kg (quite toxic) to 2000 mg/kg (moderately toxic).

Considering the similarities and range of LD50 values in these similar compounds, it is predicted that the unknown compound falls within the "Moderately Toxic" category.
