In [None]:
#pytorch-lightning install
!pip install pytorch-lightning

In [None]:
!pip install wandb

In [3]:
#imports
import torch
import pytorch_lightning as pl
import os
import pandas as pd
from torch.utils.data import Dataset
from torch.nn import CrossEntropyLoss
import numpy as np

In [4]:
#Number of different cancer types in dataset
num_classes = len(set([x.split('.')[1] for x in os.listdir('./methyl_files')]))
print(num_classes)

12


In [5]:
#Value counts of dataset
type_list = [x.split('.')[1] for x in os.listdir('./methyl_files')]
for x in set([x.split('.')[1] for x in os.listdir('./methyl_files')]):
    print(x)
    print(type_list.count(x))

edu_KIRC
217
edu_LUAD
126
edu_UCEC
117
edu_OV
592
edu_KIRP
16
edu_BRCA
313
edu_GBM
287
edu_COAD
166
edu_LAML
140
edu_STAD
48
edu_READ
68
edu_LUSC
133


In [7]:
#Dataset object input
files_list = os.listdir('./methyl_files')
#labels_dict
labels_dict = dict(zip(set(type_list),range(12)))
print(labels_dict)

{'edu_KIRC': 0, 'edu_LUAD': 1, 'edu_UCEC': 2, 'edu_OV': 3, 'edu_KIRP': 4, 'edu_BRCA': 5, 'edu_GBM': 6, 'edu_COAD': 7, 'edu_LAML': 8, 'edu_STAD': 9, 'edu_READ': 10, 'edu_LUSC': 11}


In [8]:
#Getting gene list
temp_df = pd.read_csv("./methyl_files/" + files_list[0])  
gene_list = temp_df['Unnamed: 0'].tolist()

In [9]:
temp_df.set_index('Unnamed: 0',inplace=True)

In [62]:
lens_list = []

In [10]:
#pytorch dataset object
class MethylDataset(Dataset):
    def __init__(self,files_list,labels_dict):
        self.labels = [labels_dict[x.split('.')[1]] for x in files_list]
        self.files_list = files_list
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        temp_df = pd.read_csv("./methyl_files/" + self.files_list[idx])     
        temp_df.set_index("Unnamed: 0",inplace=True)
        diff_list = list(set(gene_list) - set(temp_df.index.tolist()))
        for gene in diff_list:
            temp_df.loc[gene] = '0'    
        avg = lambda x: sum(x)/len(x)
        temp_df['beta_values_avg'] = temp_df['beta_values'].apply(lambda x : avg([float(y) for y in x.split(';')]))
        temp_df.sort_index(inplace=True)
        list_beta = temp_df['beta_values_avg'].tolist()
        out_list = [0]*num_classes
        out_list[self.labels[idx]] = 1
        out = (torch.Tensor(list_beta), torch.Tensor(out_list))
        return out
    

In [11]:
test_dataset = MethylDataset(files_list,labels_dict)

In [12]:
F = torch.nn.BCEWithLogitsLoss()
#from torch.nn import functional as F
class MethylModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.l1 = torch.nn.Linear(17661, 1000)
        self.l2 = torch.nn.Linear(1000, 12)    

    def forward(self, x):
        X = self.l1(x)
        X1 = self.l2(X)
        return torch.sigmoid(X1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F(y_hat, y)
        print("train_loss: ",loss)
        self.log("train_loss", loss,prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-5)

In [13]:
model = MethylModel()

In [14]:
train_ds, test_ds = torch.utils.data.random_split(test_dataset, [2000, 223])

In [15]:
train_dl = torch.utils.data.DataLoader(train_ds)
val_dl = torch.utils.data.DataLoader(test_ds)

In [16]:
wandb_logger = pl.loggers.WandbLogger()

In [17]:
trainer = pl.Trainer(log_every_n_steps=1,enable_progress_bar=True,logger=wandb_logger)
trainer.fit(model, train_dl, val_dl)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")

  | Name | Type   | Params
--------------------------------
0 | l1   | Linear | 17.7 M
1 | l2   | Linear | 12.0 K
--------------------------------
17.7 M    Trainable params
0         Non-trainable params
17.7 M    Total params
70.696    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

train_loss:  tensor(0.9331, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ec2-user/.netrc


train_loss:  tensor(0.9058, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
train_loss:  tensor(0.9042, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
train_loss:  tensor(0.8652, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
train_loss:  tensor(0.8645, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
train_loss:  tensor(0.8340, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
train_loss:  tensor(0.8214, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
train_loss:  tensor(0.8027, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
train_loss:  tensor(0.8026, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
train_loss:  tensor(0.7786, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
train_loss:  tensor(0.7731, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
train_loss:  tensor(0.7593, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
train_loss:  tensor(0.7454, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
train_loss:  tensor(0.7655, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
train_loss: 

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
