### Notes (open tasks):

- dAUC-PR metrik
- for overview move train functions to utils  

In [1]:
seed = 42

import os
import numpy as np
import pandas as pd

import torch as torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from sklearn.metrics import accuracy_score, f1_score, average_precision_score

from fs_model import Model
from dataset_modul import Dataset
from preprocessing import preprocessing
from utils import data_split, acc_f1, flatten,train_rf, auc_pr,train_model

In [2]:
# use cuda
if torch.cuda.is_available():
    device_id = torch.cuda.current_device()
    device = torch.device('cuda:%d' % device_id)
    print(torch.cuda.get_device_name(device_id))
else:
    device = torch.device('cpu')

NVIDIA GeForce RTX 4060 Ti


# 1. Load Dataset

In [18]:
# SIDER Dataset

# 1.5 k mols, 27 tasks, 39,956 measurements

sider = pd.read_csv("datasets/sider.csv")

print('Shape: ', sider.shape)

vals = sider.values.flatten()
print('Measurments: ', len([v for v in vals if str(v) != 'nan']))

sider.head() 

Shape:  (1427, 28)
Measurments:  39956


Unnamed: 0,smiles,Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,...,"Congenital, familial and genetic disorders",Infections and infestations,"Respiratory, thoracic and mediastinal disorders",Psychiatric disorders,Renal and urinary disorders,"Pregnancy, puerperium and perinatal conditions",Ear and labyrinth disorders,Cardiac disorders,Nervous system disorders,"Injury, poisoning and procedural complications"
0,C(CNCCNCCNCCN)N,1,1,0,0,1,1,1,0,0,...,0,0,1,1,0,0,1,1,1,0
1,CC(C)(C)C1=CC(=C(C=C1NC(=O)C2=CNC3=CC=CC=C3C2=...,0,1,0,0,1,1,1,0,0,...,0,1,1,0,0,0,1,0,1,0
2,CC[C@]12CC(=C)[C@H]3[C@H]([C@@H]1CC[C@]2(C#C)O...,0,1,0,1,1,0,1,0,1,...,0,0,0,1,0,0,0,0,1,0
3,CCC12CC(=C)C3C(C1CC[C@]2(C#C)O)CCC4=CC(=O)CCC34,1,1,0,1,1,1,1,0,1,...,1,1,1,1,1,1,0,0,1,1
4,C1C(C2=CC=CC=C2N(C3=CC=CC=C31)C(=O)N)O,1,1,0,1,1,1,1,0,1,...,0,1,1,1,0,0,1,0,1,0


### 1.1 Create Triplet-DF

In [4]:
# mol-id, target-id (unchanged as in sider-df), target (1 or 0)
 
triplets = [(i,j,row.iloc[j]) for i,row in sider.iterrows() for j in range(1,len(row))]

# Creating Triplet-DataFrame

triplet_df = pd.DataFrame(triplets, columns=['mol_id', 'target_id', 'label']) 
print(triplet_df.shape)

# shape == 38520 (sider_rows * tasks, new cols)

#triplet_df.head()
#triplet_df.iloc[25:30] # nTasks = 27 

(38529, 3)


# 2. Preprocessing

In [5]:
# preprocessed data

data = preprocessing(sider)

nMols: 1427
... processing mol 0 of 1427
... processing mol 400 of 1427
... processing mol 800 of 1427
... processing mol 1200 of 1427
... done
min and max quantil: (0.000700770847932726, 1.0)
min and max of data after scaling: (-7.98318663568334, 37.762415178150384)
data.shape: (1427, 2248)


# 3. Train-Split And Dataloader

### 3.1 Dataset

In [6]:
# dataset_modul.py

### 3.2 Train-Val-Test-Split

In [7]:
train_triplet, val_triplet, test_triplet = data_split(triplet_df,seed)
#test_triplet['target_id'].value_counts()

Train set length: 22832
Validation set length: 7135
Test set length: 8562


In [8]:
#test_triplet

### 3.3 Define Sets

In [9]:
np.random.seed(seed)

train_set = Dataset(data, sider, train_triplet, supp=8)
val_set   = Dataset(data, sider, val_triplet, supp=8, train = False)
test_set  = Dataset(data, sider, test_triplet, supp=8, train = False)

### 3.4 Define Dataloader

In [10]:
torch.manual_seed(seed)

BATCH_SIZE = 32 # 16,32,64

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_set , batch_size=BATCH_SIZE, shuffle=True)

# 4. Model

In [11]:
# fs_model.py

# 5.Training

In [12]:
# Todo Hyperparameter search

model = Model(2248,1124,2,0.1) 

criterion = nn.BCELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) # adam.w with weight decay

model.to(device)

Model(
  (encoder): Sequential(
    (0): Linear(in_features=2248, out_features=2248, bias=True)
    (1): SELU()
    (2): AlphaDropout(p=0.1, inplace=False)
    (3): Linear(in_features=2248, out_features=2248, bias=True)
    (4): SELU()
    (5): AlphaDropout(p=0.1, inplace=False)
    (6): Linear(in_features=2248, out_features=1124, bias=True)
  )
  (ln): LayerNorm((1124,), eps=1e-05, elementwise_affine=False)
  (cosine): CosineSimilarity()
)

In [13]:
config = {
    "model": model,
    "criterion": criterion,
    "optimizer": optimizer,
    "max_epochs": 50,
    "patience": 4,
    "log_interval": 100,
    "save_path": f"{os.path.join('models', 'model')}.mdl"
}

writer = SummaryWriter()
train_model(config, writer, train_loader, val_loader, device, BATCH_SIZE)

epoch:8/50 batches:0700/713 avg-loss:0.43340960144996643 val-auc_pr:0.7363031467137865                                                                                                                                                                                                                                                                                                                          Finished training...


# 6. Evalutation on Test-Set

In [14]:
# acc + f1

# load best model
model.load_state_dict(torch.load(config["save_path"]))
model.train(False)

# evaluate model on valset and testset
val_acc,val_f1   = acc_f1(model,val_loader,device)
test_acc,test_f1 = acc_f1(model,test_loader,device)

print(f"Accuracy on val-set: {val_acc:.4f}")
print(f"F1-Score on val-set: {val_f1:.4f}\n")
print(f"Accuracy on test-set: {test_acc:.4f}")
print(f"F1-Score on test-set: {test_f1:.4f}")

Accuracy on val-set: 0.7330
F1-Score on val-set: 0.8029

Accuracy on test-set: 0.7614
F1-Score on test-set: 0.7764


In [15]:
# evaluate model on valset and testset
val_aucPR  = auc_pr(model,val_loader,device)
test_aucPR  = auc_pr(model,test_loader,device)

print(f"AUC-PR on val-set: {val_aucPR:.4f}")
print(f"AUC-PR on test-set: {test_aucPR:.4f}\n")

AUC-PR on val-set: 0.7742
AUC-PR on test-set: 0.7951



# 7. Compare to Baseline

### RF Training

In [16]:
y_hats_proba, y_hats_class,true_labels = train_rf(sider, data, test_triplet, seed, 1000, shuffle = True)

### Evaluate

In [17]:
predictions, targets = flatten(y_hats_class), flatten(true_labels)

# F1 Score
f1 = f1_score(targets, predictions)

# Calculate accuracy
accuracy = accuracy_score(targets, predictions)

# Calc AUC-PR
aucPR = average_precision_score(targets, predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC-PR Score: {aucPR:.4f}")

Accuracy: 0.5184
F1 Score: 0.5866
AUC-PR Score: 0.6005
