# CNS Drug Development

## Read data

In [1]:
from src.data.dataset import CNSDataset
from src.descriptors import DescriptorGenerator, AVAILABLE_DESCRIPTORS

from torch.utils.data import DataLoader, random_split

In [2]:
TRAIN_DATASET = "dataset\mol_train.csv"

In [3]:
TEST_DATASET = "dataset\mol_test.csv"

In [4]:
train, validation = random_split(
    CNSDataset(TRAIN_DATASET, transform=DescriptorGenerator(AVAILABLE_DESCRIPTORS)),
    [0.7, 0.3],
)

## Train model

In [5]:
from src.models import LogisticRegressionModel

In [6]:
batch_size = len(train)
train_loader = DataLoader(train, batch_size=batch_size)

In [7]:
train_dataset = next(iter(train_loader))

In [8]:
train_dataset[0].shape

torch.Size([490, 8])

In [9]:
model = LogisticRegressionModel(max_iter=1000)
model.fit(train_dataset[0].numpy(), train_dataset[1])

## Validate model

In [10]:
from torch import Tensor
from torcheval.metrics.functional import binary_f1_score
from sklearn.metrics import fbeta_score

In [11]:
batch_size = len(validation)
validation_loader = DataLoader(validation, batch_size=batch_size)

In [12]:
validation_dataset = next(iter(validation_loader))

In [13]:
result = model.predict(validation_dataset[0].numpy())

In [14]:
f"F1 score: {binary_f1_score(Tensor(result), validation_dataset[1])}"

'F1 score: 0.7397260665893555'

In [15]:
f"F2 score: {fbeta_score(result, validation_dataset[1], beta=2)}"

'F2 score: 0.733695652173913'

## Benchmark

In [16]:
from src.models import AVAILABLE_MODELS

In [17]:
for available_model in AVAILABLE_MODELS:
    model = available_model()
    model.fit(train_dataset[0].numpy(), train_dataset[1])
    result = model.predict(validation_dataset[0].numpy())
    print(available_model)
    result[result > 0.5] = 1
    result[result <= 0.5] = 0
    print(f"F2 score: {fbeta_score(result, validation_dataset[1], beta=2)}")

<class 'src.models.skmodels.LogisticRegressionModel'>
F2 score: 0.733695652173913
<class 'src.models.skmodels.LinearRegressionModel'>
F2 score: 0.734375
<class 'src.models.skmodels.RidgeRegressionModel'>
F2 score: 0.734375
<class 'src.models.skmodels.LassoRegressionModel'>
F2 score: 0.625
<class 'src.models.skmodels.ElasticNetRegressionModel'>
F2 score: 0.6009615384615385
<class 'src.models.skmodels.BayesianRidgeRegressionModel'>
F2 score: 0.7305194805194806
<class 'src.models.skmodels.SGDClassifierModel'>
F2 score: 0.4587765957446809
<class 'src.models.skmodels.KernelRidgeModel'>
F2 score: 0.6041666666666666
<class 'src.models.skmodels.SVCModel'>
F2 score: 0.6382978723404255
<class 'src.models.skmodels.KNNModel'>
F2 score: 0.59375


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


<class 'src.models.skmodels.KmeansModel'>
F2 score: 0.04807692307692308
<class 'src.models.skmodels.GaussianmixtureModel'>
F2 score: 0.0




## Test model

In [51]:
test = CNSDataset(TEST_DATASET, transform=DescriptorGenerator(AVAILABLE_DESCRIPTORS))

In [52]:
batch_size = len(test)
test_loader = DataLoader(test, batch_size=batch_size)

In [53]:
dataset = next(iter(test_loader))

In [54]:
result = model.predict(dataset[0].numpy())

In [55]:
result

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,

In [56]:
import pandas as pd

In [58]:
submission = pd.read_csv(TEST_DATASET)

In [59]:
submission['TARGET'] = result

In [60]:
submission.to_csv('submission.csv', index=False)