In [1]:
%pip install lightning_lite
import numpy as np
import anndata as ad
import pandas as pd
import torch
from lightning_lite import seed_everything
from pytorch_lightning.callbacks import EarlyStopping  # ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch import optim

seed_everything(10, workers=True)

Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm
Global seed set to 10


10

Loading data

In [2]:
path="C:/Users/Dell/Documents/Deep_learning_for_life_science/Projekt_zaliczeniowy/DL4LS24-IDK-main/DL4LS24-IDK-main/data/train/cell_data.h5ad"
### input Your path

anndata = ad.read_h5ad(path)
anndata.layers
anndata.X = anndata.layers['exprs'] # FIX!

df=anndata.obs
counts=anndata.layers['counts']
exprs=anndata.layers['exprs']

Appending information from exprs to dataframe for prediction task

In [3]:
list_cols_counts=['counts'+str(i) for i in range(40)]
list_cols_exprs=['exprs'+str(i) for i in range(40)]
df_counts = pd.DataFrame(data=counts, columns=list_cols_counts,index=df.index)
df_exprs = pd.DataFrame(data=exprs, columns=list_cols_exprs,index=df.index)
df_counts.shape

(236791, 40)

In [4]:
df1=df.copy()
result = pd.concat([df1, df_exprs], axis=1)
print(result.shape)
print(result.columns)

df=result.copy()

(236791, 79)
Index(['image', 'sample_id', 'ObjectNumber', 'Pos_X', 'Pos_Y', 'area',
       'major_axis_length', 'minor_axis_length', 'eccentricity', 'width_px',
       'height_px', 'acquisition_id', 'SlideId', 'Study', 'Box.Description',
       'Position', 'SampleId', 'Indication', 'BatchId', 'SubBatchId', 'ROI',
       'ROIonSlide', 'includeImage', 'flag_no_cells', 'flag_no_ROI',
       'flag_total_area', 'flag_percent_covered', 'small_cell', 'celltypes',
       'flag_tumor', 'PD1_pos', 'Ki67_pos', 'cleavedPARP_pos', 'GrzB_pos',
       'tumor_patches', 'distToCells', 'CD20_patches', 'Batch', 'cell_labels',
       'exprs0', 'exprs1', 'exprs2', 'exprs3', 'exprs4', 'exprs5', 'exprs6',
       'exprs7', 'exprs8', 'exprs9', 'exprs10', 'exprs11', 'exprs12',
       'exprs13', 'exprs14', 'exprs15', 'exprs16', 'exprs17', 'exprs18',
       'exprs19', 'exprs20', 'exprs21', 'exprs22', 'exprs23', 'exprs24',
       'exprs25', 'exprs26', 'exprs27', 'exprs28', 'exprs29', 'exprs30',
       'exprs31', '

Prepering targets and removing unimportant columns

In [5]:
cols_to_remove = ['image', 'sample_id', 'ObjectNumber', 'Pos_X', 'Pos_Y', 'width_px',
       'height_px', 'acquisition_id', 'SlideId', 'Study', 'Box.Description',
       'Position', 'SampleId', 'Indication', 'BatchId', 'SubBatchId', 'ROI',
       'ROIonSlide', 'includeImage', 'flag_no_cells', 'flag_no_ROI',
       'flag_total_area', 'flag_percent_covered', 'small_cell', 'celltypes',
       'flag_tumor', 'PD1_pos', 'Ki67_pos', 'cleavedPARP_pos', 'GrzB_pos',
       'tumor_patches', 'distToCells', 'CD20_patches', 'Batch']  # List of columns to remove

# Remove columns
df = df.drop(cols_to_remove, axis=1)


In [6]:
y=df['cell_labels']
encoder = LabelEncoder()
y_encoded1 = encoder.fit_transform(y)
df_y = pd.DataFrame(data=y_encoded1, columns=['target'],index=df.index)
# print(df_y.shape, df_y, df['cell_labels'])

In [7]:
result = pd.concat([df, df_y], axis=1)
#df=result.copy()
#df = df.drop(['cell_labels'], axis=1)
df = result.copy()
df = df.drop(['cell_labels'], axis=1)

Creating a pytorch dataset from our dataframe

In [8]:
class MyDataset(Dataset):

    def __init__(self, dataframe):
        """
        Arguments:
                dataframe with our data
        """

        self.data = dataframe

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        sample = self.data.iloc[idx]
        features = torch.tensor(sample.iloc[:-1].values, dtype=torch.float32)
        target = torch.tensor(int(sample.iloc[-1]), dtype=torch.long)
        return features, target

Divide the data into a train and test set, create datasets and dataloaders

In [9]:
test_proportion = 0.2
batch_size = 30

shuffled_df = df.sample(frac=1).reset_index(drop=True)
df_test = shuffled_df.iloc[:int(test_proportion * (shuffled_df.shape[0]))]
df_train = shuffled_df.iloc[int(test_proportion * (shuffled_df.shape[0])):]
print(df_test.shape, df_train.shape, shuffled_df.shape)
train_dataset = MyDataset(df_train)
test_dataset = MyDataset(df_test)

(47358, 45) (189433, 45) (236791, 45)


In [10]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

Setting pytorch device

In [33]:
if torch.cuda.is_available():
    print("CUDA is available! You can use GPU acceleration.")
else:
    print("CUDA is not available. Using CPU...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

CUDA is not available. Using CPU...


In [22]:
class Linear_block(nn.Module):
    def __init__(self, a, b, dropout):
        super().__init__()
        self.linear = nn.Linear(a, b)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout, inplace=False)
        self.batchnorm = nn.BatchNorm1d(b)
    def forward(self, x):
        x = self.linear(x)
        x = self.relu(x)
        x = self.batchnorm(x)
        x = self.dropout(x)
        return x


class MLP(nn.Module):
    def __init__(self, layers_list, dropout):
        super().__init__()
        self.my_modules = nn.ModuleList([Linear_block(layers_list[i-1], layers_list[i], dropout) for i in range(1, len(layers_list))])
        self.clasification_head = nn.Linear(layers_list[-1], 14)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        for module in self.my_modules:
            x = module(x)
        x = self.clasification_head(x)
        x = self.softmax(x)
        return x

Creating a training loop

In [34]:
def one_hot_encoding(tensor, num_classes = 14):
    # Create a zero tensor with the desired shape
    one_hot = torch.zeros(tensor.size(0), num_classes)
    # Use scatter_ to fill the one-hot tensor
    one_hot.scatter_(1, tensor.unsqueeze(1), 1)
    return one_hot

def calculate_accuracy(model, test_dataloader, device = device):
    model.eval()

    with torch.no_grad():
        corect_preds = 0
        total_preds = 0
        clases_total = torch.zeros((14))
        corect_pre_class = torch.zeros((14))
        for i, data in enumerate(test_dataloader):
            inputs, labels = data
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim = 1)
            corect = torch.sum(preds == labels)
            corect_preds += corect.item()
            total_preds += labels.shape[0]
            one_hot_labels = one_hot_encoding(labels)
            one_hot_preds = one_hot_encoding(preds)
            clases_total += torch.sum(one_hot_labels, dim = 0)
            corect_pre_class += torch.sum(one_hot_labels * one_hot_preds, dim = 0)

        print(f'Corect total: {corect_preds}, total number of all predictions: {total_preds}, accuracy: {corect_preds/total_preds}')
        print(f'Corect per class, total of number datapoints per class, accuracy per class: \n {corect_pre_class} \n {clases_total} \n {corect_pre_class/clases_total}')

def train_model(model, lr, train_dataloader, epochs, weight = None, device = device):
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    loss_fn = torch.nn.CrossEntropyLoss(weight = weight)
    model.train()
    model.to(device)

    for epoch in range(epochs):
        running_loss = 0
        for i, data in enumerate(train_dataloader):
            # Every data instance is an input + label pair
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero your gradients for every batch!
            optimizer.zero_grad()

            # Make predictions for this batch
            outputs = model(inputs).to(device)

            # Compute the loss and its gradients
            loss = loss_fn(outputs, labels)
            loss.backward()

            # Adjust learning weights
            optimizer.step()

            # Gather data and report
            running_loss += loss.item()
            if i % 1000 == 0:
                print(f'Epoch {epoch}, batch {i}, loss: {loss}')
        print(f'End of epoch {epoch}, avarage loss = {running_loss/len(train_dataloader)}')

    return model

In [42]:
weight_for_cross_entropy = torch.tensor([1, 1, 1, 1, 5, 5, 1, 1, 6, 1, 1, 1, 7, 1]).float()
# bez różnych wag dla różnych klas model miał 0% accuracy dla 4 różnych klas. W ten sposób zmuszam go do tego, żeby choć trochę klasyfikował każdą klasę

My_MLP = MLP([44, 100, 100, 100], 0.1)
My_MLP_trained = train_model(My_MLP, 0.001, train_dataloader, 2, weight = weight_for_cross_entropy)

Epoch 0, batch 0, loss: 2.6461777687072754
Epoch 0, batch 1000, loss: 1.912638783454895
Epoch 0, batch 2000, loss: 1.9587945938110352
Epoch 0, batch 3000, loss: 1.9894936084747314
Epoch 0, batch 4000, loss: 1.9851605892181396
Epoch 0, batch 5000, loss: 2.1805219650268555
Epoch 0, batch 6000, loss: 1.903768539428711
End of epoch 0, avarage loss = 1.9837992471074266
Epoch 1, batch 0, loss: 1.8371540307998657
Epoch 1, batch 1000, loss: 1.7950774431228638
Epoch 1, batch 2000, loss: 2.173959255218506
Epoch 1, batch 3000, loss: 1.9870713949203491
Epoch 1, batch 4000, loss: 1.8436479568481445
Epoch 1, batch 5000, loss: 1.8052631616592407
Epoch 1, batch 6000, loss: 2.0422000885009766
End of epoch 1, avarage loss = 1.9194675678123299


In [43]:
#z barhcnormem:, z wagami roznymi dla roznych klas
print('Accuracy on the training set:')
calculate_accuracy(My_MLP_trained, train_dataloader)
print('accuracy on the test set:')
calculate_accuracy(My_MLP_trained, test_dataloader)

Accuracy on the training set:
Corect total: 171326, total number of all predictions: 189433, accuracy: 0.9044147535012379
Corect per class, total of number datapoints per class, accuracy per class: 
 tensor([ 3628.,  4650.,  6352., 15840.,  3073.,  1694.,  9949., 16110.,   698.,
         5292.,  4093., 93461.,   953.,  5533.]) 
 tensor([ 3884.,  5439., 10214., 16861.,  3809.,  2966., 12495., 17766.,   920.,
         5813.,  4544., 97096.,  1324.,  6302.]) 
 tensor([0.9341, 0.8549, 0.6219, 0.9394, 0.8068, 0.5711, 0.7962, 0.9068, 0.7587,
        0.9104, 0.9007, 0.9626, 0.7198, 0.8780])
accuracy on the test set:
Corect total: 42829, total number of all predictions: 47358, accuracy: 0.9043667384602391
Corect per class, total of number datapoints per class, accuracy per class: 
 tensor([  909.,  1171.,  1673.,  3921.,   778.,   427.,  2496.,  3996.,   150.,
         1263.,  1062., 23405.,   247.,  1331.]) 
 tensor([  978.,  1371.,  2625.,  4181.,   959.,   733.,  3143.,  4403.,   202.,
    