In [1]:
import pandas as pd

In [2]:
prior = pd.read_csv("sampling_prior.csv")
finetuned = pd.read_csv("sampling_vegfr.csv")

In [3]:
prior['label'] = "Prior"

In [4]:
finetuned['label'] = 'Finetuned'

In [5]:
prior

Unnamed: 0,SMILES,NLL,label
0,CCCCCCCCCC(=O)OCOC(=O)NC(CCC(=O)NC(CS)C(=O)OC)...,42.01,Prior
1,Cc1ccccc1S(=O)(=O)NC(=O)C1(C)CCN1C(=O)CC1CCCCC1,23.50,Prior
2,CC1CCCC(NC(=O)c2ccc3c(c2)N(Cc2ccc(F)cc2)C(=O)c...,24.80,Prior
3,O=C(c1ccc2c(c1)OCO2)N1CCCC(N=c2cnc(C(F)(F)F)c[...,22.92,Prior
4,O=C(Nc1ccccc1COc1ccccn1)c1cnn2cccnc12,27.31,Prior
...,...,...,...
4911,CC(C)NC(=O)C(C)OC(=O)CCCOc1ccccc1,26.65,Prior
4912,CC(C)c1ccc(NC(=O)CN(c2ccc(N=c3cc[nH]c4cc(Cl)cc...,30.94,Prior
4913,CC(C)(Cn1cncn1)NCc1ccc2c(c1)nc(F)n2-c1ccc(Cl)c...,42.80,Prior
4914,COC(=O)NCc1cccc(CC(=O)N=c2ccc(CCCCc3n[nH]c(=NC...,31.00,Prior


In [6]:
# Combine the DataFrames vertically (row-wise)
combined_df = pd.concat([prior, finetuned])


In [7]:
combined_df

Unnamed: 0,SMILES,NLL,label
0,CCCCCCCCCC(=O)OCOC(=O)NC(CCC(=O)NC(CS)C(=O)OC)...,42.01,Prior
1,Cc1ccccc1S(=O)(=O)NC(=O)C1(C)CCN1C(=O)CC1CCCCC1,23.50,Prior
2,CC1CCCC(NC(=O)c2ccc3c(c2)N(Cc2ccc(F)cc2)C(=O)c...,24.80,Prior
3,O=C(c1ccc2c(c1)OCO2)N1CCCC(N=c2cnc(C(F)(F)F)c[...,22.92,Prior
4,O=C(Nc1ccccc1COc1ccccn1)c1cnn2cccnc12,27.31,Prior
...,...,...,...
2970,COc1ccc2c(=NCCN)cc(-c3ccccc3OC)[nH]c2c1,23.92,Finetuned
2971,Cc1ccc(C(=O)Nc2ccc3c(C(=O)Nc4ccc(-n5c(C)ccc5C)...,28.49,Finetuned
2972,CC(=O)c1ccc(OCCN2CCOCC2)cc1O,16.84,Finetuned
2973,CCN(CC)CCNCc1cc(Oc2ccc(NC(=O)Nc3ccc(C)c(C)c3)c...,23.39,Finetuned


In [8]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt


# Function to compute molecular fingerprints (Morgan Fingerprints)
def get_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        # Generate Morgan fingerprint (radius=2, 2048 bits)
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        return np.array(fp)  # Convert to NumPy array
    else:
        return np.nan  # Handle invalid SMILES

# Apply the function to the SMILES column to get fingerprints
combined_df['fingerprints'] = combined_df['SMILES'].apply(get_fingerprint)

# Drop rows with invalid SMILES
total_df = combined_df.dropna(subset=['fingerprints'])

# Stack fingerprints into a feature matrix
X = np.stack(combined_df['fingerprints'].values)




In [9]:
# Apply t-SNE to reduce dimensionality to 2D
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

# Add the t-SNE results to the DataFrame
combined_df['tsne-2d-one'] = X_tsne[:, 0]
combined_df['tsne-2d-two'] = X_tsne[:, 1]

In [10]:

import plotly.express as px

fig = px.scatter(
    combined_df, x='tsne-2d-one', y='tsne-2d-two',
    hover_name='label',  # Show SMILES when hovering over points
    title='t-SNE of Molecular Fingerprints of molecules generated',
    color='label',
    width=1000,  # Set the width of the plot (in pixels)
    height=800,
    
)
fig.update_layout(
    paper_bgcolor='white',   # Background around the plot
    plot_bgcolor='white',    # Background of the plotting area
)

# Show interactive plot
fig.show()




# Visualize molecules from finetuned vs existing VEGFR1 inhibitors used for finetuning


In [19]:
vegfr = pd.read_csv("example.smi", sep="\t", header=None)
vegfr

Unnamed: 0,0
0,Cc1cc(C(=O)Nc2cc(Oc3ccc4nc(NC(=O)C5CC5)nn4c3)c...
1,Nc1c(-c2nc3ccc(N4CCOCC4)[nH]c-3n2)c(O)nc2ccccc12
2,CNC(=O)c1cccc2cc(Oc3ccnc4cc(OCC5(N)CC5)c(OC)cc...
3,CN1CCN(c2ccc3nc(-c4c(N)c5c(F)cccc5[nH]c4=O)[nH...
4,CC(C)(C)c1cc(NC(=O)Nc2ccc(Nc3ncnc4c3OCCCN4)cc2...
...,...
155,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...
156,COc1cc2c(Oc3ccc(NC(=O)Nc4cnn(C5CCCC5)c4)c(Cl)c...
157,COC(=O)CNCCCNC(=O)c1cc2nccc(Oc3ccc(NC(=O)Nc4cc...
158,Cc1ccc(F)c(NC(=O)Nc2ccc(Oc3ccnc4cc(C(=O)NCCCN5...


In [20]:
vegfr = vegfr.rename(columns={0: 'SMILES'})

In [21]:
vegfr['label'] = "Chembl"

In [22]:
combined_df = pd.concat([vegfr, finetuned])

In [23]:
combined_df.reset_index(inplace=True, drop=True)

In [24]:
combined_df

Unnamed: 0,SMILES,label,NLL
0,Cc1cc(C(=O)Nc2cc(Oc3ccc4nc(NC(=O)C5CC5)nn4c3)c...,Chembl,
1,Nc1c(-c2nc3ccc(N4CCOCC4)[nH]c-3n2)c(O)nc2ccccc12,Chembl,
2,CNC(=O)c1cccc2cc(Oc3ccnc4cc(OCC5(N)CC5)c(OC)cc...,Chembl,
3,CN1CCN(c2ccc3nc(-c4c(N)c5c(F)cccc5[nH]c4=O)[nH...,Chembl,
4,CC(C)(C)c1cc(NC(=O)Nc2ccc(Nc3ncnc4c3OCCCN4)cc2...,Chembl,
...,...,...,...
3130,COc1ccc2c(=NCCN)cc(-c3ccccc3OC)[nH]c2c1,Finetuned,23.92
3131,Cc1ccc(C(=O)Nc2ccc3c(C(=O)Nc4ccc(-n5c(C)ccc5C)...,Finetuned,28.49
3132,CC(=O)c1ccc(OCCN2CCOCC2)cc1O,Finetuned,16.84
3133,CCN(CC)CCNCc1cc(Oc2ccc(NC(=O)Nc3ccc(C)c(C)c3)c...,Finetuned,23.39


In [25]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt


# Function to compute molecular fingerprints (Morgan Fingerprints)
def get_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        # Generate Morgan fingerprint (radius=2, 2048 bits)
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        return np.array(fp)  # Convert to NumPy array
    else:
        return None  # Handle invalid SMILES

# Apply the function to the SMILES column to get fingerprints
combined_df['fingerprints'] = combined_df['SMILES'].apply(get_fingerprint)

# Drop rows with invalid SMILES
total_df = combined_df.dropna(subset=['fingerprints'])

# Stack fingerprints into a feature matrix
X = np.stack(combined_df['fingerprints'].values)




In [26]:
combined_df['fingerprints']

0       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
3       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
4       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                              ...                        
3130    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3131    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
3132    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
3133    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3134    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: fingerprints, Length: 3135, dtype: object

In [27]:
# Apply t-SNE to reduce dimensionality to 2D
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

# Add the t-SNE results to the DataFrame
combined_df['tsne-2d-one'] = X_tsne[:, 0]
combined_df['tsne-2d-two'] = X_tsne[:, 1]

In [28]:
combined_df

Unnamed: 0,SMILES,label,NLL,fingerprints,tsne-2d-one,tsne-2d-two
0,Cc1cc(C(=O)Nc2cc(Oc3ccc4nc(NC(=O)C5CC5)nn4c3)c...,Chembl,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",19.499910,-42.618595
1,Nc1c(-c2nc3ccc(N4CCOCC4)[nH]c-3n2)c(O)nc2ccccc12,Chembl,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-41.187988,14.014186
2,CNC(=O)c1cccc2cc(Oc3ccnc4cc(OCC5(N)CC5)c(OC)cc...,Chembl,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",35.262821,8.262717
3,CN1CCN(c2ccc3nc(-c4c(N)c5c(F)cccc5[nH]c4=O)[nH...,Chembl,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",-46.847660,8.467082
4,CC(C)(C)c1cc(NC(=O)Nc2ccc(Nc3ncnc4c3OCCCN4)cc2...,Chembl,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",19.665030,-36.289669
...,...,...,...,...,...,...
3130,COc1ccc2c(=NCCN)cc(-c3ccccc3OC)[nH]c2c1,Finetuned,23.92,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-15.422573,20.659374
3131,Cc1ccc(C(=O)Nc2ccc3c(C(=O)Nc4ccc(-n5c(C)ccc5C)...,Finetuned,28.49,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",0.142232,-15.010788
3132,CC(=O)c1ccc(OCCN2CCOCC2)cc1O,Finetuned,16.84,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",12.174632,14.669611
3133,CCN(CC)CCNCc1cc(Oc2ccc(NC(=O)Nc3ccc(C)c(C)c3)c...,Finetuned,23.39,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",28.943029,-29.556175


In [29]:

import plotly.express as px

fig = px.scatter(
    combined_df, x='tsne-2d-one', y='tsne-2d-two',
    hover_name='label',  # Show SMILES when hovering over points
    title='t-SNE of Molecular Fingerprints of molecules generated from Finetuned model vs Chembl ',
    color='label',
    width=1000,  # Set the width of the plot (in pixels)
    height=800 

)
fig.update_layout(
    paper_bgcolor='white',   # Background around the plot
    plot_bgcolor='white',    # Background of the plotting area
)
# Show interactive plot
fig.show()

