# Course 27666 AI-guided Protein Science

# Graph Machine Learning

# HIV Inhibitor Prediction

## A Graph-level Task

Human immunodeficiency virus type 1 (HIV-1) is the most common cause of Acquired Immunodeficiency Syndrome (AIDS). One ongoing area of research is finding compounds that inhibit HIV-1 viral replication.

![HIV](https://ars.els-cdn.com/content/image/1-s2.0-S0968089619306704-ga1.jpg)



Using dataaset from: [dgl](https://www.dgl.ai/)

In [89]:
import pandas as pd

In [90]:
hiv_dataset = pd.read_csv('../data/hiv_dgl.tsv', sep='\t')

In [91]:
hiv_dataset.head()

Unnamed: 0,smiles,HIV_active
0,CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...,0
1,C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...,0
2,CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21,0
3,Nc1ccc(C=Cc2ccc(N)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1,0
4,O=S(=O)(O)CCS(=O)(=O)O,0


## Using SMILES Graph to Predict HIV Inhibition

We will make use of [Graphein](https://graphein.ai/) to convert SMILES into a Molecular Graph making sure to collect molecular features that can be incorporated to the prediction model.

In [92]:
import graphein.molecule as gm
from functools import partial

In [93]:
config = gm.MoleculeGraphConfig(
    node_metadata_functions=[
        gm.atom_type_one_hot,
        gm.atomic_mass,
        gm.degree,
        gm.total_degree,
        gm.total_valence,
        gm.explicit_valence,
        gm.implicit_valence,
        gm.num_explicit_h,
        gm.num_implicit_h,
        gm.total_num_h,
        gm.num_radical_electrons,
        gm.formal_charge,
        gm.hybridization,
        gm.is_aromatic,
        gm.is_isotope,
        gm.is_ring,
        gm.chiral_tag,
        partial(gm.is_ring_size, ring_size=5),
        partial(gm.is_ring_size, ring_size=7)
    ]
)


In [94]:
graphs = []
labels = []
for index,row in hiv_dataset[['smiles','HIV_active']].iterrows():
    try:
        graph = gm.construct_graph(smiles=row["smiles"], config=config)
        graphs.append(graph)
        labels.append(row["HIV_active"])
    except:
        pass        

[03:44:08] Explicit valence for atom # 3 Al, 6, is greater than permitted
[03:44:08] Explicit valence for atom # 5 B, 5, is greater than permitted
[03:44:21] Explicit valence for atom # 16 Al, 9, is greater than permitted
[03:44:23] Explicit valence for atom # 4 Al, 9, is greater than permitted
[03:44:38] Explicit valence for atom # 12 Al, 7, is greater than permitted
[03:44:38] Explicit valence for atom # 13 Al, 7, is greater than permitted
[03:44:41] Explicit valence for atom # 6 Ge, 5, is greater than permitted


### Multiple Graphs

In [95]:
graphs[0].nodes(data=True)

NodeDataView({'C:0': {'atomic_num': 6, 'element': 'C', 'rdmol_atom': <rdkit.Chem.rdchem.Atom object at 0x3fd045150>, 'coords': None, 'atom_type_one_hot': array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'mass': 12.011, 'degree': 1, 'total_degree': 4, 'total_valence': 4, 'explicit_valence': 1, 'implicit_valence': 3, 'num_explicit_h': 0, 'num_implicit_h': 3, 'total_num_h': 3, 'num_radical_electrons': 0, 'formal_charge': 0, 'hybridization': rdkit.Chem.rdchem.HybridizationType.SP3, 'is_aromatic': False, 'is_isotope': 0, 'is_ring': False, 'chiral_tag': rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED, 'is_ring_5': False, 'is_ring_7': False}, 'C:1': {'atomic_num': 6, 'element': 'C', 'rdmol_atom': <rdkit.Chem.rdchem.Atom object at 0x3fd045460>, 'coords': None, 'atom_type_one_hot': array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'mass': 12.011, 'degree': 2, 'total_degree': 4, 'total_valence': 4, 'explicit_valence': 2, 'implicit_valence': 2, 'num_explicit_h': 0, 'num_implicit_h': 2, 'total_num_h': 2, 'num_radic

### Converting to PyG Datasets

In [81]:
import networkx as nx
from torch_geometric.data import Data, DataLoader
from torch.utils.data import random_split

In [82]:
pygraphs = []
for i, g in enumerate(graphs):
    mapping = {n:int(n.split(":")[1]) for n in g.nodes()}
    g = nx.relabel_nodes(g, mapping)
    source_nodes = [n1 for n1,n2 in list(g.edges())]
    target_nodes = [n2 for n1,n2 in list(g.edges())]
    edge_indices = torch.tensor([source_nodes,
                                 target_nodes],
                                dtype=torch.long)
    atom_features = torch.tensor([[g.nodes[node]["atomic_num"]] for node in g.nodes()], dtype=torch.float)
    # Create graph data
    data = Data(x=atom_features, edge_index=edge_indices)
    pygraphs.append(data)

### Adding the Labels HIV_activity

In [83]:
for i, g in enumerate(pygraphs):
    g.y = torch.tensor([labels[i]], dtype=torch.long)

### Splitting into Training and Test

In [96]:
# Split dataset into training and test sets
train_size = int(0.7 * len(graphs))
test_size = len(graphs) - train_size
train_dataset, test_dataset = random_split(pygraphs, [train_size, test_size])

### Create Dataloaders

Memory efficient.

In [None]:
# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

### Define GNN Model

In [97]:
class GNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, output_dim) # maps the learned graph-level feature representations to the output space, enabling prediction tasks.

    def forward(self, x, edge_index, batch):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = global_mean_pool(x, batch) # Essential for aggregating node-level features into a single graph-level feature vector.
        x = self.fc(x)
        return F.log_softmax(x, dim=1)

# Initialize model, loss, and optimizer
model = GNN(input_dim=1, hidden_dim=16, output_dim=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

### Training Loop

In [98]:
for epoch in range(10):
    model.train()
    total_loss = 0
    for data in train_loader:
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.batch)
        loss = F.nll_loss(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 163.6136
Epoch 2, Loss: 158.2353
Epoch 3, Loss: 156.9965
Epoch 4, Loss: 156.5264
Epoch 5, Loss: 156.3342
Epoch 6, Loss: 156.6514
Epoch 7, Loss: 156.3196
Epoch 8, Loss: 156.4906
Epoch 9, Loss: 156.5302
Epoch 10, Loss: 156.2864


### Model Evaluation

In [99]:
model.eval()
correct = 0
for data in test_loader:
    out = model(data.x, data.edge_index, data.batch)
    pred = out.argmax(dim=1)
    correct += (pred == data.y).sum().item()

accuracy = correct / len(test_dataset)
print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 0.64
