In [1]:
import pandas as pd
import numpy as np

# Load dataset (replace with your actual file path)
df = pd.read_csv("antiviral_drug_discovery.csv")

# Check first few rows
df.head()

Unnamed: 0,SMILES,IC50,EC50,Ki,Kd,PDB_ID,Binding_Affinity,Absorption,Distribution,Metabolism,Excretion,Toxicity
0,c1nc2ccc(N)c(cc1)c(c3c(c4ccc(c5ccc4OC)c5)ccc(O...,16.106,37.901,21.631,51.336,4G6D,-6.817,0.9922,3.342,0.1809,0.0101,0.623
1,C1=CC(CNC2=C1C3=CC=C(C=C3)C4=CC(C=C(C=C4)C(C)=...,0.0,149.89,128.0,124.99,3P2Z,-7.2,0.413,4.04,0.0,0.196,1.55
2,CCC(CC(=O)N(C(=O)NC(=O)C(=C(C(=O)N(C(=O)N(C(=O...,12.35,11.87,14.97,2.82,6J5K,-7.516,58.88,0.957,2.464,0.885,1.371
3,CCOC1=CC(=C1)C(=O)C1=C(C=O)C(C(=O)C=C1C(=O)C1)...,3.46,1.06,3.38,0.33,2E6O,-7.275,0.224,1.87,0.025,0.007,0.956
4,CC(O)=C1C(O)N(C(O)=O)C(=O)N1,28.54,1.29,7.92,4.32,1j1c,-7.878,0.999,0.98,0.948,0.999,0.018


In [2]:
print(df.isnull().sum())  # Count missing values per column

# Fill missing numerical values with the mean
df.fillna(df.select_dtypes(include=['number']).mean(), inplace=True)

# Fill missing categorical values with mode
df.fillna(df.mode().iloc[0], inplace=True)

SMILES              0
IC50                0
EC50                0
Ki                  0
Kd                  0
PDB_ID              0
Binding_Affinity    0
Absorption          0
Distribution        0
Metabolism          0
Excretion           0
Toxicity            0
dtype: int64


In [3]:
df.drop_duplicates(inplace=True)

In [4]:
df.columns = df.columns.str.lower().str.replace(" ", "_")

In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[['ic50', 'ec50', 'ki', 'kd']] = scaler.fit_transform(df[['ic50', 'ec50', 'ki', 'kd']])

In [6]:
df.to_csv("cleaned_antiviral_drug_discovery.csv", index=False)

In [7]:
print(df.isnull().sum())  # Count missing values per column

# Fill missing numerical values with the mean
df.fillna(df.select_dtypes(include=['number']).mean(), inplace=True)

# Fill missing categorical values with mode
df.fillna(df.mode().iloc[0], inplace=True)

smiles              0
ic50                0
ec50                0
ki                  0
kd                  0
pdb_id              0
binding_affinity    0
absorption          0
distribution        0
metabolism          0
excretion           0
toxicity            0
dtype: int64


In [8]:
pip install --upgrade rdkit




In [9]:
df=pd.read_csv('cleaned_valid_smiles.csv')
df

Unnamed: 0,smiles,ic50,ec50,ki,kd,pdb_id,binding_affinity,absorption,distribution,metabolism,excretion,toxicity
0,CCOC1=CC(=C1)C(=O)C1=C(C=O)C(C(=O)C=C1C(=O)C1)...,0.0173,0.005988,0.012637,0.000595,2E6O,-7.275,0.224,1.87,0.025,0.007,0.956
1,CC(O)=C1C(O)N(C(O)=O)C(=O)N1,0.1427,0.007422,0.029612,0.012643,1j1c,-7.878,0.999,0.98,0.948,0.999,0.018
2,C1=CC=CC=C1,0.055397,0.095251,0.009069,0.001875,3L2M,-6.60308,0.6137,0.9393,0.9012,0.9544,1.4972
3,C1=CC=CC=C1C(=O)O,0.00677,0.016822,0.002632,0.003491,1F0X,-9.613,0.95,1.583,1.0,0.324,1.14
4,CC(C)C(C)C1=CC=CC(=O)C=C1,0.3345,0.124119,0.055709,0.126725,5JBK,-7.724,0.8856,1.355,1.0,1.0,1.0
5,CC(=O)C(=O)[N+]([O-])=O,0.175,0.074222,0.041128,0.109105,2FU4,-7.673556,0.7482,2.81,0.4534,0.1894,-2.460691
6,c1cc(ccc1)OCC(C(=O)Nc2ccccc2)C1=CC=C(O)C=C1,0.37355,0.101291,0.417408,0.072462,2L2S,-7.821,0.127,11.25,0.145,1.249,0.46
7,CC(=O)NC1CN(CCCN1)C,0.1423,1.0,0.037351,0.035924,6k45,-10.9,0.99,116.1,0.77,0.41,0.7
8,CC(O)CC(=O)[C@@H](NC1=CC2=CC=CC=C2SC(=O)C(=O)C...,0.1248,0.014907,0.005234,0.002618,6H0Q,-6.70726,0.153774,1.97,0.014323,0.156042,2.428
9,CN1CCC(=C(OC(=C1C)C)C)C(=C1)CCC1,0.0096,0.010229,0.066477,0.03372,6LVK,-8.11,0.64,6.13,0.105,0.374,2.01


In [10]:
from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return np.array(fp)
    return None

In [11]:
# Example
smiles = "CCO"
fingerprint = smiles_to_fingerprint(smiles)
print(fingerprint.shape)  # (2048,)

(2048,)




In [12]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU, BatchNormalization
df["fingerprint"] = df["smiles"].apply(lambda x: smiles_to_fingerprint(x))

# Remove empty fingerprints
df = df.dropna(subset=["fingerprint"])
X_train = np.array(df["fingerprint"].tolist())  # Convert to NumPy array



In [13]:
#Build the Generator
def build_generator(input_dim=100, output_dim=2048):
    model = Sequential([
        Dense(256, input_dim=input_dim),
        LeakyReLU(alpha=0.2),
        BatchNormalization(momentum=0.8),
        Dense(512),
        LeakyReLU(alpha=0.2),
        BatchNormalization(momentum=0.8),
        Dense(1024),
        LeakyReLU(alpha=0.2),
        BatchNormalization(momentum=0.8),
        Dense(output_dim, activation="sigmoid")  # Output drug fingerprints
    ])
    return model


In [14]:
#Build the Discriminator
def build_discriminator(input_dim=2048):
    model = Sequential([
        Dense(1024, input_dim=input_dim),
        LeakyReLU(alpha=0.2),
        Dense(512),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(1, activation="sigmoid")  # Binary classification (Real or Fake)
    ])
    return model

In [15]:
#Compile the GAN
# Initialize models
generator = build_generator()
discriminator = build_discriminator()

# Compile Discriminator
discriminator.compile(loss="binary_crossentropy", optimizer=tf.keras.optimizers.Adam(0.0002, 0.5), metrics=["accuracy"])

# Freeze Discriminator for GAN training
discriminator.trainable = False

# Build GAN
gan_input = tf.keras.Input(shape=(100,))
generated_drugs = generator(gan_input)
validity = discriminator(generated_drugs)

gan = tf.keras.Model(gan_input, validity)
gan.compile(loss="binary_crossentropy", optimizer=tf.keras.optimizers.Adam(0.0002, 0.5))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
import matplotlib.pyplot as plt

# Training Parameters
epochs = 500
batch_size = 32
noise_dim = 100

# Labels for real and fake drugs
real = np.ones((batch_size, 1))
fake = np.zeros((batch_size, 1))

# Training Loop
for epoch in range(epochs):
    # Train Discriminator
    idx = np.random.randint(0, X_train.shape[0], batch_size)
    real_drugs = X_train[idx]
    noise = np.random.normal(0, 1, (batch_size, noise_dim))
    generated_drugs = generator.predict(noise)
    
    d_loss_real = discriminator.train_on_batch(real_drugs, real)
    d_loss_fake = discriminator.train_on_batch(generated_drugs, fake)
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

    # Train Generator
    noise = np.random.normal(0, 1, (batch_size, noise_dim))
    g_loss = gan.train_on_batch(noise, real)

    # Print Progress
    if epoch % 100 == 0:
        print(f"Epoch {epoch}: D Loss={d_loss[0]}, G Loss={g_loss}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step




Epoch 0: D Loss=0.715165913105011, G Loss=0.6763705015182495
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 185ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 285ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

In [17]:
# Save the trained Generator
generator.save("gan_generator.h5")



In [18]:
# generated_molecular_fingerprints
generator = tf.keras.models.load_model("gan_generator.h5")
noise = np.random.normal(0, 1, (5, 100))
generated_fingerprints = generator.predict(noise)
print(generated_fingerprints) #These are the generated molecular fingerprints produced by your GAN model.

# Each row in this (5, 2048) NumPy array represents a 2048-bit fingerprint of a newly generated drug-like molecule. 



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 617ms/step
[[4.5994270e-01 8.2397902e-01 2.0875759e-01 ... 1.5297905e-01
  9.7624075e-01 9.1541272e-01]
 [9.2375100e-01 5.4438019e-01 8.7494415e-01 ... 9.5357460e-01
  7.1323775e-03 6.8614088e-02]
 [9.7489125e-01 5.9376085e-01 9.1186017e-01 ... 9.5850009e-01
  3.2667493e-04 2.4166657e-02]
 [9.2497981e-01 9.8754418e-01 8.8219938e-05 ... 6.7192632e-05
  9.9985993e-01 9.9684429e-01]
 [1.6240633e-01 6.5367162e-01 5.3384914e-03 ... 3.1099957e-03
  9.9926895e-01 9.9096930e-01]]


In [19]:
# Binary Fingerprints
binary_fingerprints = (generated_fingerprints > 0.5).astype(int)

# Print the binary fingerprints
print(binary_fingerprints)

[[0 1 0 ... 0 1 1]
 [1 1 1 ... 1 0 0]
 [1 1 1 ... 1 0 0]
 [1 1 0 ... 0 1 1]
 [0 1 0 ... 0 1 1]]


In [20]:
# Convert fingerprints back to SMILES (using RDKit)
from rdkit import DataStructs

def fingerprint_to_smiles(fp):
    # Placeholder function: You will need a trained model to convert fingerprints back to SMILES.
    return "Generated_SMILES_String"

generated_smiles = [fingerprint_to_smiles(fp) for fp in generated_fingerprints]
print(generated_smiles)

['Generated_SMILES_String', 'Generated_SMILES_String', 'Generated_SMILES_String', 'Generated_SMILES_String', 'Generated_SMILES_String']


In [21]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import numpy as np
import pandas as pd

# Load your reference dataset (SMILES and their fingerprints)
ref_df = pd.read_csv("C:/Users/Dell/Downloads/GDG Solution challenge/Fingerprints_with_SMILES.csv")  # Contains 'smiles' column
ref_fingerprints = [
    AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(x), 2, nBits=2048)
    for x in ref_df["SMILES"]
]

# Function to find the closest match
def fingerprint_to_smiles(generated_fp, ref_fingerprints, ref_smiles):
    max_similarity = 0
    best_match = None
    
    for ref_fp, smiles in zip(ref_fingerprints, ref_smiles):
        sim = DataStructs.FingerprintSimilarity(generated_fp, ref_fp)
        if sim > max_similarity:
            max_similarity = sim
            best_match = smiles
            
    return best_match, max_similarity

# Convert your generated fingerprints to RDKit format
generated_fps = [
    DataStructs.ExplicitBitVect(len(fp)) for fp in generated_fingerprints
]

# Set the bits manually
for i, fp in enumerate(generated_fingerprints):
    for bit_idx in np.where(fp)[0]:  # Get indices of 1s
        generated_fps[i].SetBit(int(bit_idx))  # Ensure it's a Python int
# Find closest molecules
for fp in generated_fps:
    smiles, similarity = fingerprint_to_smiles(fp, ref_fingerprints, ref_df["SMILES"])
    print(f"Generated Molecule: {smiles}, Similarity: {similarity}")




Generated Molecule: CCC1=CC2Cc3nc4cc(Cl)ccc4c(NCCn4cc(CCCCCCN5CCc6cc(OC)c(OC)cc6C5c5cccc([N+](=O)[O-])c5)nn4)c3C(C1)C2, Similarity: 0.05126953125
Generated Molecule: CCC1=CC2Cc3nc4cc(Cl)ccc4c(NCCn4cc(CCCCCCN5CCc6cc(OC)c(OC)cc6C5c5cccc([N+](=O)[O-])c5)nn4)c3C(C1)C2, Similarity: 0.05126953125
Generated Molecule: CCC1=CC2Cc3nc4cc(Cl)ccc4c(NCCn4cc(CCCCCCN5CCc6cc(OC)c(OC)cc6C5c5cccc([N+](=O)[O-])c5)nn4)c3C(C1)C2, Similarity: 0.05126953125
Generated Molecule: CCC1=CC2Cc3nc4cc(Cl)ccc4c(NCCn4cc(CCCCCCN5CCc6cc(OC)c(OC)cc6C5c5cccc([N+](=O)[O-])c5)nn4)c3C(C1)C2, Similarity: 0.05126953125
Generated Molecule: CCC1=CC2Cc3nc4cc(Cl)ccc4c(NCCn4cc(CCCCCCN5CCc6cc(OC)c(OC)cc6C5c5cccc([N+](=O)[O-])c5)nn4)c3C(C1)C2, Similarity: 0.05126953125
