<a href="https://colab.research.google.com/github/Ganesh-2250/Reactorious---NMR-for-Molecular-Substructure-Prediction/blob/main/ChemA2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install rdkit



In [None]:
import pickle, torch, numpy as np, pandas as pd
from rdkit import Chem
from torch.utils.data import TensorDataset, DataLoader, random_split
import torch.nn as nn


In [None]:
import gdown

# Fetch the pickled dataset from Google Drive into the working directory.
gdown.download(
    url="https://drive.google.com/uc?id=1jIia0kHCUD7fr3VI6UFkRViSGBpGYmVg",
    output="dataset.pkl",
    quiet=False,
)

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this against a file whose origin is trusted.
with open("dataset.pkl", mode="rb") as dataset_file:
    data = pickle.load(dataset_file)


Downloading...
From: https://drive.google.com/uc?id=1jIia0kHCUD7fr3VI6UFkRViSGBpGYmVg
To: /content/dataset.pkl
100%|██████████| 85.3M/85.3M [00:00<00:00, 146MB/s]


In [None]:
print(data.columns.tolist())


['NMREDATA_SOLVENT', 'NMREDATA_TEMPERATURE', 'NMREDATA_SMILES', 'NMREDATA_INCHI', 'NMREDATA_1D_13C', 'mol', 'NMREDATA_1D_1H']


In [None]:
import re
import numpy as np

# Column names reused by the preprocessing cells below.
c13_col = "NMREDATA_1D_13C"
temp_col = "NMREDATA_TEMPERATURE"
solv_col = "NMREDATA_SOLVENT"

# Drop the columns this notebook does not use (ignored when already absent),
# then keep only rows that have a spectrum, a temperature and a solvent.
data = (
    data
    .drop(columns=["NMREDATA_1D_1H", "NMREDATA_INCHI"], errors="ignore")
    .dropna(subset=[c13_col, temp_col, solv_col])
)

# Strip literal backslashes from the solvent names.
data[solv_col] = data[solv_col].astype(str).str.replace("\\", "", regex=False)

def clean_temperature(x):
    """Return the first number found in ``str(x)`` as a float, or NaN if none.

    The value is converted to text first, so non-string entries are accepted.
    """
    found = re.search(r"[-+]?\d*\.?\d+", str(x))
    if found is None:
        return np.nan
    return float(found.group())

# Replace every raw temperature entry with the number parsed by clean_temperature
# (NaN when no number is present) ...
data[temp_col] = data[temp_col].apply(clean_temperature)
# ... and discard the rows whose temperature could not be parsed.
data = data.dropna(subset=[temp_col])


In [None]:
import re

def extract_floats(text):
    """Return every numeric token found in ``str(text)`` as a list of floats.

    Tokens are optionally signed integers or decimals, returned in order of
    appearance. An input without digits yields an empty list.
    """
    tokens = re.findall(r"[-+]?\d*\.?\d+", str(text))
    return list(map(float, tokens))

# Largest count of numeric tokens found in any spectrum string; spectrum_to_tensor
# pads/truncates every spectrum to this length.
max_peaks = data[c13_col].apply(lambda x: len(extract_floats(x))).max()

def spectrum_to_tensor(spec):
    """Convert a spectrum string into a float32 tensor of length ``max_peaks``.

    The numbers extracted from ``spec`` are right-padded with zeros when there
    are fewer than ``max_peaks`` of them and truncated when there are more.
    """
    peaks = extract_floats(spec)
    missing = max_peaks - len(peaks)
    # A non-positive count repeats the list zero times, i.e. adds no padding.
    padded = peaks + [0.0] * missing
    return torch.tensor(padded[:max_peaks], dtype=torch.float32)

# One fixed-length float32 tensor per row, built from the 13C spectrum column.
data["spec_tensor"] = data[c13_col].apply(spectrum_to_tensor)


In [None]:
# Scale factor: the largest temperature present in the (already cleaned) column.
temp_max = data[temp_col].max()


def _scale_temperature(value):
    """Return ``value / temp_max`` wrapped in a one-element float32 tensor."""
    return torch.tensor([value / temp_max], dtype=torch.float32)


data["temp_tensor"] = data[temp_col].apply(_scale_temperature)


In [None]:
# Sorted list of the distinct solvent names, and each name's position in it.
solvents = sorted(data[solv_col].unique())
solvent_map = dict(zip(solvents, range(len(solvents))))

def solvent_to_tensor(s):
    """One-hot encode solvent name ``s`` using the positions in ``solvent_map``.

    Returns a tensor of length ``len(solvent_map)`` that is 1.0 at the
    solvent's index and 0.0 elsewhere. Raises ``KeyError`` for a name that is
    not in ``solvent_map``.
    """
    index = solvent_map[s]
    encoding = torch.zeros(len(solvent_map))
    encoding[index] = 1.0
    return encoding

# One one-hot solvent tensor per row.
data["solvent_tensor"] = data[solv_col].apply(solvent_to_tensor)


In [None]:
def make_input(r):
    """Concatenate a row's spectrum, temperature and solvent tensors, in that order.

    ``r`` is anything indexable by the keys ``"spec_tensor"``, ``"temp_tensor"``
    and ``"solvent_tensor"`` (e.g. a DataFrame row).
    """
    pieces = [r[key] for key in ("spec_tensor", "temp_tensor", "solvent_tensor")]
    return torch.cat(pieces)

# Row-wise (axis=1) build of the flat feature vector used by the dense model.
data["input_tensor"] = data.apply(make_input, axis=1)


In [None]:
# Maps a substructure / functional-group name to the SMARTS pattern used to detect it.
# fg_smarts_labels iterates over .values(), so the insertion order here fixes the
# order of the label vector. There are 30 entries, which is also the output width
# hard-coded in the models below; keep the two in sync when editing this table.
fg_smarts = {
"sulfoxide":"[#6][#16X3]=[OX1]",
"carbamate":"[NX3][CX3](=[OX1])[OX2]",
"sulfonamide":"[#16X4]([NX3])(=[OX1])(=[OX1])[#6]",
"ring_C_C_bond":"[#6;R][#6;R]",
"ring_C_C_aromatic":"[#6;R]=,:[#6;R]",
"hydrazone":"[NX3][NX2]=[#6]",
"ether":"[OD2]([#6])[#6]",
"amide":"[NX3][CX3](=[OX1])[#6]",
"thiol_sulfide":"[#6][#16X2]",
"methylene":"[CX4H2]",
"quaternary_carbon":"[CX4H0]",
"benzyl":"[CX4][cX3]1[cX3][cX3][cX3][cX3][cX3]1",
"ring_C_N":"[#6;R][#7;R]",
"alkene":"[CX3]=[CX3]",
"sulfonate":"[#16X4](=[OX1])(=[OX1])([#6])[OX2]",
"haloalkane":"[#6][F,Cl,Br,I]",
"methyl":"[CX4H3]",
"imine":"[$([CX3]([#6])[#6]),$([CX3H][#6])]=[$([NX2][#6]),$([NX2H])]",
"aminal_like":"[N][C][N]",
"phosphine":"[#6][PX3]",
"allene":"[C](=C)(=C)",
"nitrile":"[NX1]#[CX2]",
"carboxylic_acid":"[CX3](=O)[OX2H1]",
"aldehyde":"[CX3H1](=O)[#6]",
"alkyne":"[CX2]#[CX2]",
"thioamide":"[NX3][CX3]=[SX1]",
"ester":"[#6][CX3](=O)[OX2H0][#6]",
"methine":"[CX4H1]",
"generic_imine":"[C](=N)",
"carbonyl":"[CX3]=[OX1]"
}


In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _compiled_fg_pattern(smarts):
    """Parse a SMARTS string once and reuse the resulting query on later calls.

    Raises:
        ValueError: if the SMARTS string cannot be parsed (the parser is
            expected to return ``None`` in that case).
    """
    pattern = Chem.MolFromSmarts(smarts)
    if pattern is None:
        raise ValueError(f"Invalid SMARTS pattern: {smarts!r}")
    return pattern


def fg_smarts_labels(mol):
    """Return a multi-hot float32 label vector for ``mol``.

    Entry ``i`` is 1.0 when ``mol`` contains a match for the ``i``-th pattern
    of ``fg_smarts`` (in dictionary order) and 0.0 otherwise.

    The patterns are compiled through a cache, so each SMARTS string is parsed
    once overall instead of once per molecule.
    """
    return torch.tensor(
        [int(mol.HasSubstructMatch(_compiled_fg_pattern(s))) for s in fg_smarts.values()],
        dtype=torch.float32,
    )

# One multi-hot label tensor per molecule in the "mol" column.
data["labels"] = data["mol"].apply(fg_smarts_labels)


In [None]:
# Stack the per-row tensors into (n_samples, n_features) / (n_samples, n_labels).
X = torch.stack(data["input_tensor"].tolist())
y = torch.stack(data["labels"].tolist())

dataset = TensorDataset(X, y)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size  # remainder, so the two sizes always sum to len(dataset)

# A fixed seed makes the train/test partition repeatable across runs; without it
# the held-out set (and therefore the reported accuracy) changes on every execution.
split_generator = torch.Generator().manual_seed(42)
train_ds, test_ds = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=64)


In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Net(nn.Module):
    """Fully connected multi-label classifier with three hidden ReLU layers.

    Args:
        inp: Number of input features (size of the last dimension of ``x``).
        hidden: Width of each hidden layer. Defaults to 100.
        n_out: Number of output labels. Defaults to 30; it must equal the
            length of the label vectors the model is trained against.

    The layer attribute names (``fc1``, ``fc2``, ``fc3``, ``out``) are the
    keys of the saved ``state_dict``.
    """

    def __init__(self, inp, hidden=100, n_out=30):
        super().__init__()
        self.fc1 = nn.Linear(inp, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, hidden)
        self.out = nn.Linear(hidden, n_out)

    def forward(self, x):
        """Return per-label probabilities in [0, 1] (sigmoid of the output layer)."""
        for layer in (self.fc1, self.fc2, self.fc3):
            x = torch.relu(layer(x))
        return torch.sigmoid(self.out(x))

# Dense model sized to the feature width of X, placed on the selected device.
model = Net(X.size(1)).to(device)
# The network already applies a sigmoid, hence plain BCELoss on probabilities.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)


In [None]:
def accuracy(loader):
    """Return the element-wise label accuracy of the global ``model`` over ``loader``.

    Each batch is a sequence of tensors whose last element is the target; all
    preceding elements are passed to ``model`` as positional inputs. This lets
    the same function serve loaders that yield ``(x, y)`` as well as loaders
    that yield several input tensors before ``y``.

    Predictions are the model outputs thresholded at 0.5. The result is the
    fraction of individual label entries (not whole rows) that match the target.
    Puts ``model`` into eval mode as a side effect.

    Raises:
        ZeroDivisionError: if ``loader`` yields no elements.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in loader:
            # Move every tensor to the model's device, then split inputs from target.
            *inputs, targets = (t.to(device) for t in batch)
            preds = (model(*inputs) > 0.5).float()
            correct += (preds == targets).sum().item()
            total += targets.numel()
    return correct / total

for epoch in range(10):
    # accuracy() leaves the model in eval mode, so switch back at the start of each epoch.
    model.train()
    for features, targets in train_loader:
        features = features.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(features), targets)
        loss.backward()
        optimizer.step()
    # Report epoch index, train accuracy, test accuracy.
    train_acc = accuracy(train_loader)
    test_acc = accuracy(test_loader)
    print(epoch, train_acc, test_acc)


0 0.8098095580309019 0.8095785440613027
1 0.7921068391424123 0.7936302681992338
2 0.776895436579231 0.7752394636015326
3 0.8011618157863217 0.8059865900383142
4 0.8149359204695173 0.8189176245210728
5 0.8281351060007186 0.830316091954023
6 0.822266139657444 0.8284961685823755
7 0.813570487483531 0.8161877394636016
8 0.8422206252245777 0.8465996168582376
9 0.8316205533596838 0.8363984674329502


In [None]:
# Saves only the parameters (state_dict), not the module itself. The file is written
# by torch.save, so despite the ".pkl" name it should be read back with torch.load.
torch.save(model.state_dict(), "model.pkl")


**CNN MODEL**


In [None]:
# Stack each per-row tensor column into one batch tensor; inputs are kept separate
# here because the CNN receives spectrum, temperature and solvent as distinct arguments.
X_spec, X_temp, X_sol, y = (
    torch.stack(data[column].tolist())
    for column in ("spec_tensor", "temp_tensor", "solvent_tensor", "labels")
)


In [None]:
from torch.utils.data import TensorDataset, DataLoader, random_split

dataset = TensorDataset(X_spec, X_temp, X_sol, y)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size  # remainder, so the two sizes always sum to len(dataset)

# A fixed seed makes the train/test partition repeatable across runs; without it
# the held-out set (and therefore the reported accuracy) changes on every execution.
split_generator = torch.Generator().manual_seed(42)
train_ds, test_ds = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=64)


In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class CNNNet(torch.nn.Module):
    """1-D convolutional multi-label classifier over the spectrum plus side inputs.

    The spectrum goes through two same-padded Conv1d+ReLU layers and a global
    max pool, giving 32 features. Those are concatenated with the temperature
    and solvent vectors and fed to two hidden ReLU layers and a sigmoid output.

    Args:
        spec_len: Length of the spectrum vector. Accepted for interface
            compatibility but not used: the adaptive pooling collapses the
            length dimension, so any spectrum length works.
        temp_dim: Width of the temperature input.
        sol_dim: Width of the solvent input.
        n_out: Number of output labels. Defaults to 30; it must equal the
            length of the label vectors the model is trained against.
    """

    def __init__(self, spec_len, temp_dim, sol_dim, n_out=30):
        super().__init__()
        self.conv1 = torch.nn.Conv1d(1, 16, kernel_size=5, padding=2)
        self.conv2 = torch.nn.Conv1d(16, 32, kernel_size=5, padding=2)
        self.pool = torch.nn.AdaptiveMaxPool1d(1)

        # 32 pooled conv channels + the two side inputs.
        self.fc1 = torch.nn.Linear(32 + temp_dim + sol_dim, 100)
        self.fc2 = torch.nn.Linear(100, 100)
        self.out = torch.nn.Linear(100, n_out)

    def forward(self, spec, temp, sol):
        """Return per-label probabilities in [0, 1].

        ``spec`` is (batch, length); ``temp`` and ``sol`` are (batch, temp_dim)
        and (batch, sol_dim) so they can be concatenated along dim 1.
        """
        x = spec.unsqueeze(1)  # add the single input channel expected by Conv1d
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = self.pool(x).squeeze(-1)  # (batch, 32, 1) -> (batch, 32)

        x = torch.cat([x, temp, sol], dim=1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return torch.sigmoid(self.out(x))


In [None]:
# CNN sized from the stacked input tensors (spectrum length, temperature width,
# solvent width), placed on the selected device.
model = CNNNet(X_spec.size(1), X_temp.size(1), X_sol.size(1)).to(device)

# The network already applies a sigmoid, hence plain BCELoss on probabilities.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)


In [None]:
def accuracy(loader):
    """Return the element-wise label accuracy of the global ``model`` over ``loader``.

    Each batch is a sequence of tensors whose last element is the target; all
    preceding elements are passed to ``model`` as positional inputs. This lets
    the same function serve loaders that yield several input tensors before
    ``y`` as well as loaders that yield plain ``(x, y)`` pairs.

    Predictions are the model outputs thresholded at 0.5. The result is the
    fraction of individual label entries (not whole rows) that match the target.
    Puts ``model`` into eval mode as a side effect.

    Raises:
        ZeroDivisionError: if ``loader`` yields no elements.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in loader:
            # Move every tensor to the model's device, then split inputs from target.
            *inputs, targets = (t.to(device) for t in batch)
            preds = (model(*inputs) > 0.5).float()
            correct += (preds == targets).sum().item()
            total += targets.numel()
    return correct / total


In [None]:
for epoch in range(10):
    # accuracy() leaves the model in eval mode, so switch back at the start of each epoch.
    model.train()
    for batch in train_loader:
        # Batch layout: spectrum, temperature, solvent, labels.
        xs, xt, xsol, yb = (t.to(device) for t in batch)
        optimizer.zero_grad()
        loss = criterion(model(xs, xt, xsol), yb)
        loss.backward()
        optimizer.step()
    # Report epoch index, train accuracy, test accuracy.
    train_acc = accuracy(train_loader)
    test_acc = accuracy(test_loader)
    print(epoch, train_acc, test_acc)


0 0.7958078811833753 0.7923371647509578
1 0.8342675769553239 0.8373563218390805
2 0.8219547251167805 0.8259099616858238
3 0.8337884776619955 0.8332854406130268
4 0.8403401604982632 0.8410919540229885
5 0.8055815067672775 0.8070402298850575
6 0.8076057012815906 0.8101532567049808
7 0.7857947059528088 0.7871168582375478
8 0.809474188525572 0.8108716475095785
9 0.7800694693975326 0.7814655172413794


In [None]:
# Saves only the parameters (state_dict), not the module itself. The file is written
# by torch.save, so despite the ".pkl" name it should be read back with torch.load.
torch.save(model.state_dict(), "cnn_model.pkl")
