# RNA-seq only: baseline model

In [6]:
# IPython shell escape: print the GPUs visible on this machine and their current
# memory usage / utilisation before picking a device for training.
!nvidia-smi

Fri Dec  6 17:56:56 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-PCIE-16GB           On  | 00000000:21:00.0 Off |                    0 |
| N/A   34C    P0              37W / 250W |   1638MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE-16GB           On  | 00000000:81:00.0 Off |  

In [7]:
import pandas as pd
import numpy as np
import torch
import scripts
from functools import lru_cache
import torchmetrics
from torch import nn
import optuna

In [8]:
# Hyperparameters and runtime settings for the RNA-seq baseline run.
# The fixed numeric values look like the result of an earlier hyperparameter
# search (see "search_hyperparameters" below) — confirm before changing them.
config = {
    "features": {
        # Radius used when generating the chemical fingerprints
        # (forwarded to scripts.get_data as fp_radius).
        "fp_radius": 2,
    },
    "optimizer": {
        "batch_size": 220,  # number of samples per batch
        "clip_norm": 19,  # maximum norm used for gradient clipping
        "learning_rate": 0.0004592646200179472,  # learning rate
        # Training stops once this many epochs pass without improvement.
        "stopping_patience": 15,
    },
    "model": {
        "embed_dim": 485,  # dimension used when embedding the input
        "hidden_dim": 696,  # dimension of the hidden layers
        # Dropout rate of ~0.485 (the earlier note saying 40% did not match the value).
        "dropout": 0.48541242824674574,
        # Layer count; the earlier note said "3 hidden layers" — how this value maps
        # to the number of hidden layers is decided inside scripts (TODO confirm).
        "n_layers": 4,
        "norm": "batchnorm",  # use batch normalization to stabilise training
    },
    "env": {
        "fold": 0,  # fold index, forwarded to scripts.get_data as n_fold
        "device": "cuda:0",  # device string handed to torch.device
        "max_epochs": 100,  # maximum number of epochs
        # Hyperparameters are already fixed above, so no search is run.
        "search_hyperparameters": False,
    },
}

In [9]:
# RNA-seq data: scripts.get_data returns the train / validation / test datasets
# for the fold and fingerprint radius selected in `config`.
rna_train_dataset, rna_validation_dataset, rna_test_dataset = scripts.get_data(
    n_fold=config["env"]["fold"],
    fp_radius=config["features"]["fp_radius"],
    typ="rnaseq",
)

In [11]:
# RNA-seq model

# Train on train + validation combined. The third positional argument is None —
# presumably the validation set, given the "(validation) None" entries in the
# training log (TODO confirm against scripts.train_model).
_, rna_model = scripts.train_model(
    config,
    torch.utils.data.ConcatDataset([rna_train_dataset, rna_validation_dataset]),
    None,
    use_momentum=False,
)
device = torch.device(config["env"]["device"])

# Test-time metrics: two group-wise Pearson correlations plus plain MSE,
# wrapped in a MetricTracker and moved to the evaluation device.
metrics = torchmetrics.MetricTracker(
    torchmetrics.MetricCollection(
        {
            "R_cellwise_residuals": scripts.GroupwiseMetric(
                metric=torchmetrics.functional.pearson_corrcoef,
                grouping="drugs",
                average="macro",
                residualize=True,
            ),
            "R_cellwise": scripts.GroupwiseMetric(
                metric=torchmetrics.functional.pearson_corrcoef,
                grouping="cell_lines",
                average="macro",
                residualize=False,
            ),
            "MSE": torchmetrics.MeanSquaredError(),
        }
    )
)
metrics.to(device)

# Deterministic pass over the full test set: no shuffling, keep the last partial batch.
rna_test_dataloader = torch.utils.data.DataLoader(
    rna_test_dataset,
    batch_size=config["optimizer"]["batch_size"],
    drop_last=False,
    shuffle=False,
    pin_memory=True,
)

epoch : 0: train loss: 2.019056471817573 Smoothed R interaction (validation) None
epoch : 1: train loss: 1.5396393802758348 Smoothed R interaction (validation) None
epoch : 2: train loss: 1.4630655652955082 Smoothed R interaction (validation) None
epoch : 3: train loss: 1.3991954665543564 Smoothed R interaction (validation) None
epoch : 4: train loss: 1.3372507542555943 Smoothed R interaction (validation) None
epoch : 5: train loss: 1.293961826375742 Smoothed R interaction (validation) None
epoch : 6: train loss: 1.2486187188675435 Smoothed R interaction (validation) None
epoch : 7: train loss: 1.2132622094914705 Smoothed R interaction (validation) None
epoch : 8: train loss: 1.1802035777913624 Smoothed R interaction (validation) None
epoch : 9: train loss: 1.1517830166887442 Smoothed R interaction (validation) None
epoch : 10: train loss: 1.1216246497498454 Smoothed R interaction (validation) None
epoch : 11: train loss: 1.1059836639904121 Smoothed R interaction (validation) None
epoc

In [12]:
# Score the trained model on the test loader. With save_predictions=True the helper
# also reports a predictions CSV written under results/ (see the cell output).
rna_final_metrics = scripts.evaluate_step(
    rna_model,
    rna_test_dataloader,
    metrics,
    device,
    save_predictions=True,
    model_name="baseline",
    dataset_name="rnaseq",
)
print(f"Only rna-seq model: {rna_final_metrics}")

  return torch.linalg.solve(A, Xy).T


Predictions saved to: results/pred_baseline_rnaseq_20241206_18:25:44_b6ac866c-ead5-41d3-9dd3-30a418937e67.csv
Only rna-seq model: {'MSE': 1.795264482498169, 'R_cellwise': 0.8906934857368469, 'R_cellwise_residuals': 0.33360373973846436}


In [18]:
from datetime import datetime

# Build one table row from the test metrics. Work on a copy so that adding the
# bookkeeping columns does not mutate `rna_final_metrics` itself.
result = dict(rna_final_metrics)

model_name = "baseline_rnaseq"
result["Model"] = model_name
time = datetime.now().strftime("%Y%m%d_%H:%M")
result["Time"] = time

result_df = pd.DataFrame([result])

# Merge the new row into the stored evaluation table. On the very first run the
# table file does not exist yet, so start the table from the current row.
try:
    ev_table = pd.concat(
        [pd.read_csv("results/evalutation_table.csv"), result_df],
        ignore_index=True,
    )
except FileNotFoundError:
    ev_table = result_df.copy()

# Keep only the most recent entry per model name.
ev_table = ev_table.drop_duplicates(subset=["Model"], keep="last")
ev_table.head()

Unnamed: 0,MSE,R_cellwise,R_cellwise_residuals,Model,Time
1,1.795264,0.890693,0.333604,baseline_rnaseq,20241206_19:25


In [19]:
# Persist the merged, de-duplicated table. Writing `result_df` here would replace
# the whole file with only the current model's row and drop every other model's
# results that were previously stored.
ev_table.to_csv("results/evalutation_table.csv", index=False)