In [4]:
# Run these lines only the first time you run this notebook

# !wget https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/graphs/Datasets/evaluation_set_1_1_CLASH2013_paper.tsv
# !pip install gdown

Collecting gdown
  Using cached gdown-4.7.1-py3-none-any.whl (15 kB)
Collecting beautifulsoup4 (from gdown)
  Using cached beautifulsoup4-4.12.2-py3-none-any.whl (142 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->gdown)
  Obtaining dependency information for soupsieve>1.2 from https://files.pythonhosted.org/packages/4c/f3/038b302fdfbe3be7da016777069f26ceefe11a681055ea1f7817546508e3/soupsieve-2.5-py3-none-any.whl.metadata
  Downloading soupsieve-2.5-py3-none-any.whl.metadata (4.7 kB)
Downloading soupsieve-2.5-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4, gdown
Successfully installed beautifulsoup4-4.12.2 gdown-4.7.1 soupsieve-2.5


In [5]:
# Run these lines only the first time you run this notebook

# import gdown

# url = "https://drive.google.com/file/d/1ayyD1w6SHzLS8638eoBzUX3OMq4cxSUx/view?usp=sharing"
# output = "explainability_scores_hsa-miR-106b-5p.json"
# gdown.download(url=url, output=output, quiet=False, fuzzy=True)

Downloading...
From (uriginal): https://drive.google.com/uc?id=1ayyD1w6SHzLS8638eoBzUX3OMq4cxSUx
From (redirected): https://drive.google.com/uc?id=1ayyD1w6SHzLS8638eoBzUX3OMq4cxSUx&confirm=t&uuid=3c1ed14d-0a5c-45db-bc38-c8a0e05e3548
To: /home/jovyan/miRNA/miRNA/TESTexplainability_scores_hsa-miR-106b-5p.json
100%|██████████| 1.99G/1.99G [00:10<00:00, 183MB/s] 


'TESTexplainability_scores_hsa-miR-106b-5p.json'

In [2]:
import random

import numpy as np
import torch

# One seed for every random number generator this notebook relies on, so that a
# fresh "Restart & Run All" gives the same numbers.
SEED = 42

random.seed(SEED)        # Python's built-in `random` module
np.random.seed(SEED)     # NumPy legacy global RNG
torch.manual_seed(SEED)  # PyTorch global RNG (e.g. model weight initialisation)

In [3]:
#load the %autoreload extension using the %load_ext magic command
#Then, we set the %autoreload magic command to 2, which means that modules will be reloaded every time a cell is executed
%load_ext autoreload
%autoreload 2

In [4]:
from pytorch_lightning.loggers import CometLogger

### Get the FC scores from Bartel files   

In [5]:
import pandas as pd

# Fold-change table: first CSV column becomes the index, first row the header.
mirna_FCs = pd.read_csv(
    'modules/evaluation/mirna_fcs.csv',
    sep=',',
    header=0,
    index_col=0,
)

In [6]:
# mirna_FCs.columns.values
list(mirna_FCs)

['Gene symbol',
 'hsa-miR-16-5p',
 'hsa-miR-106b-5p',
 'hsa-miR-200a-3p',
 'hsa-miR-200b-3p',
 'hsa-miR-215-5p',
 'hsa-let-7c-5p',
 'hsa-miR-103a-3p']

### All miRNAs ready for later --- for now we skip this section and use only one miRNA

In [7]:
# from utils import rna_to_dna

# mirna_sequences = ['UAGCAGCACGUAAAUAUUGGCG', 'UAAAGUGCUGACAGUGCAGAU', 'UAACACUGUCUGGUAACGAUGU', 'UAAUACUGCCUGGUAAUGAUGA', 'AUGACCUAUGAAUUGACAGAC', 'UGAGGUAGUAGGUUGUAUGGUU', 'AGCAGCAUUGUACAGGGCUAUGA']
# mirna_sequences = [rna_to_dna(x) for x in mirna_sequences]
# print(mirna_sequences)

In [8]:
# miRNA_names = ['hsa-miR-16-5p', 'hsa-miR-106b-5p', 'hsa-miR-200a-3p', 'hsa-miR-200b-3p', 'hsa-miR-215-5p', 'hsa-let-7c-5p', 'hsa-miR-103a-3p']
# miRNA_name_to_seq = {}
# for i in range(len(miRNA_names)):
#     miRNA_name_to_seq[miRNA_names[i]] = mirna_sequences[i]
# miRNA_name_to_seq

### Set the miRNA of interest

In [9]:
# miRNA analysed in the rest of the notebook; the name matches one of the
# fold-change columns of `mirna_FCs` listed above.
mirna_name = 'hsa-miR-106b-5p'
# Sequence written with T instead of U (DNA alphabet); the RNA form
# 'UAAAGUGCUGACAGUGCAGAU' appears in the commented-out list above.
mirna_seq = 'TAAAGTGCTGACAGTGCAGAT'

### Binding sites processing

#### Collect binding sites 

In [12]:
from collect_binding_sites import collect_binding_sites

# The scores file name is derived from the name of the selected miRNA.
load_scores_path = f"explainability_scores_{mirna_name}.json"
binding_sites = collect_binding_sites(load_scores_path, mirna_seq)

In [13]:
# each item of binding_sites contains a triplet of arrays: ([starts],[ends],[lengths])
# 1st item in [starts] corresponds to 1st item in [ends] and [lengths] as well, 2nd start to 2nd end and length, and so on
binding_sites[6:8]

[(array([2340, 4090]), array([2390, 4140]), array([50, 50])),
 (array([ 450, 1230, 2010, 2560, 2650, 5400]),
  array([ 500, 1280, 2060, 2640, 2710, 5450]),
  array([50, 50, 50, 80, 60, 50]))]

#### Transform binding sites into input data

In [14]:
from feature_extraction import count_statistics, normalize_statistics, FEATURES, FEATURE_NAMES

# Turn the collected binding sites (plus the scores file and the miRNA sequence)
# into per-sample input data and the matching gene list.
# NOTE(review): `transcripts_with_no_bs` and `FEATURE_NAMES` are not used anywhere
# else in the visible notebook; presumably the former lists transcripts without
# any binding site — confirm in `count_statistics`.
input_data, input_data_genes, transcripts_with_no_bs = count_statistics(binding_sites, load_scores_path, mirna_seq)

# Normalised features; this is what gets padded and fed to the model.
input_data_normalized = normalize_statistics(input_data)

#### Padding input data

##### Padding to 10 binding sites per sample

In [15]:
from pad_input_data import pad_features

# Number of binding sites represented per sample. Each site contributes
# len(FEATURES) values, so every sample is padded to
# len(FEATURES) * MAX_BINDING_SITES entries (40 in the recorded output).
MAX_BINDING_SITES = 10

padded_data_tensor = pad_features(
    input_data_normalized,
    pad_to_length=len(FEATURES) * MAX_BINDING_SITES,
)

In [16]:
len(padded_data_tensor), padded_data_tensor[0].size(), padded_data_tensor[0]

(25629,
 torch.Size([40]),
 tensor([0.0015, 0.0022, 0.0086, 0.0486, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000], dtype=torch.float64))

### Get labels & Remove genes without fold change from the dataset

In [18]:
from utils import get_labels

# Look up the labels for the selected miRNA and keep only the samples/genes that have one.
# NOTE(review): this cell rebinds `padded_data_tensor` to the returned (presumably
# filtered) tensor while still reading the unfiltered `input_data_genes`, so it is
# not idempotent: re-running it without re-running the padding cell passes an
# already-filtered tensor together with the full gene list. Re-run from the
# padding cell whenever this cell has to be executed again.
input_labels, padded_data_tensor, input_data_genes_filtered = get_labels(mirna_name, padded_data_tensor, input_data_genes)

There is  20167 genes for which we do not have fold change because they are not in the Bartel table, out of total 25629 and 171 nan valued genes in FC table


### Split train/validation/test

#### Create test set based on which genes we can compare with Bartel

In [19]:
# genes we can compare with Bartel are in test set

In [20]:
from dataset import split_train_test_bartel

# One call produces all three splits: features, labels and gene names for
# train, validation and test.
(
    x_train, y_train,
    x_val, y_val,
    x_test, y_test,
    gene_names_train, gene_names_val, gene_names_test,
) = split_train_test_bartel(
    padded_data_tensor, input_labels, input_data_genes_filtered, mirna_FCs, mirna_name
)

In [21]:
print(len(y_train), len(y_val), len(y_test))
print(len(gene_names_train), len(gene_names_val), len(gene_names_test))

4239 472 580
4239 472 580


### Create pytorch dataset

In [22]:
from dataset import get_train_dataloader, get_val_dataloader, get_test_dataloader

# Mini-batch size shared by the train, validation and test loaders.
BATCH_SIZE = 32
# One loader per split, each built from that split's features and labels.
train_loader = get_train_dataloader(x_train, y_train, BATCH_SIZE)
val_loader = get_val_dataloader(x_val, y_val, BATCH_SIZE)
test_loader = get_test_dataloader(x_test, y_test, BATCH_SIZE)

### comet.ml for logging online

In [23]:
import os

# The Comet API key is a secret, so it is read from the environment instead of
# being hard-coded in the notebook. Export COMET_API_KEY before starting Jupyter.
# NOTE: the key that was previously committed in this cell must be treated as
# leaked and revoked in the Comet account settings.
comet_api_key = os.environ.get("COMET_API_KEY")
if not comet_api_key:
    raise RuntimeError(
        "COMET_API_KEY is not set; export your Comet API key before running this cell."
    )

comet_logger = CometLogger(
    api_key=comet_api_key,
    project_name="mirna",
)

CometLogger will be initialized in online mode


In [24]:
from model import Small_CNN

from pytorch_lightning import Trainer
from IPython.utils import io


# Model under training; pooling='att' presumably selects attention pooling — confirm in model.py.
model = Small_CNN(pooling='att')
# trainer = Trainer(max_epochs=1, gpus=1)  # Use GPU if available, train for X epochs
# Train for 3 epochs and log through the Comet logger created above.
trainer = Trainer(logger=comet_logger, max_epochs=3)  # no device is requested explicitly, so Lightning's default device selection applies

# capture_output keeps what training prints out of the notebook; `captured` is not
# displayed or used afterwards in the visible notebook.
# you can follow the training at the  https://www.comet.com/davidcechak/mirna/  see log of this cell
with io.capture_output() as captured:
    trainer.fit(model, train_loader, val_loader)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name         | Type              | Params
---------------------------------------------------
0 | architecture | Sequential        | 350   
1 | ce           | MSELoss           | 0     
2 | mae          | MeanAbsoluteError | 0     
3 | mse          | MeanSquaredError  | 0     
4 | r2           | R2Score           | 0     
---------------------------------------------------
350       Trainable params
0         Non-trainable params
350       Total params
0.001     Total estimated model params size (MB)
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/davidcechak/mirna/20acf22e40e44a5e9807e06cd5e51741

`Trainer.fit` stopped: `max_epochs=3` reached.
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experime

#### TODO save the model?

### Test

In [25]:
result = trainer.test(model, test_loader)

  rank_zero_warn(


Testing DataLoader 0:   0%|          | 0/19 [00:00<?, ?it/s]torch.Size([32, 1, 40]) test_step
Testing DataLoader 0:   5%|▌         | 1/19 [00:00<00:00, 344.30it/s]torch.Size([32, 1, 40]) test_step
Testing DataLoader 0:  11%|█         | 2/19 [00:00<00:00, 389.64it/s]torch.Size([32, 1, 40]) test_step
Testing DataLoader 0:  16%|█▌        | 3/19 [00:00<00:00, 414.99it/s]torch.Size([32, 1, 40]) test_step
Testing DataLoader 0:  21%|██        | 4/19 [00:00<00:00, 427.83it/s]torch.Size([32, 1, 40]) test_step
Testing DataLoader 0:  26%|██▋       | 5/19 [00:00<00:00, 326.78it/s]torch.Size([32, 1, 40]) test_step
Testing DataLoader 0:  32%|███▏      | 6/19 [00:00<00:00, 344.52it/s]torch.Size([32, 1, 40]) test_step
Testing DataLoader 0:  37%|███▋      | 7/19 [00:00<00:00, 358.92it/s]torch.Size([32, 1, 40]) test_step
Testing DataLoader 0:  42%|████▏     | 8/19 [00:00<00:00, 355.95it/s]torch.Size([32, 1, 40]) test_step
Testing DataLoader 0:  47%|████▋     | 9/19 [00:00<00:00, 366.50it/s]torch.Size([3

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/davidcechak/mirna/20acf22e40e44a5e9807e06cd5e51741



Testing DataLoader 0: 100%|██████████| 19/19 [00:01<00:00, 13.37it/s] 


[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml ExistingExperiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/davidcechak/mirna/20acf22e40e44a5e9807e06cd5e51741
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     mae       : 0.22644585371017456
[1;38;5;39mCOMET INFO:[0m     mse       : 0.0899789035320282
[1;38;5;39mCOMET INFO:[0m     r2        : -0.9302238821983337
[1;38;5;39mCOMET INFO:[0m     rmse      : 0.2940700352191925
[1;38;5;39mCOMET INFO:[0m     test_loss : 0.0899789035320282
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Created from : pytorch-lightning
[1;38;5;39mCOMET INFO

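#### TODO investigate the negative R^2: R^2 is at most 1 but is not bounded below — a negative value means the model fits the test labels worse than always predicting their mean https://torchmetrics.readthedocs.io/en/stable/regression/r2_score.html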

### Predict

In [26]:
from dataset import predict

gene_to_predictions, predictions = predict(model, x_test, gene_names_test)
print(list(gene_to_predictions.items())[:2])

[('PSEN1', 0.023890972137451172), ('LAMA3', 0.023361869156360626)]


In [27]:
import numpy as np

# All metrics live in one dict keyed by predictor name; the model's entry starts
# from the first metrics dict returned by the test run.
results = {'model': result[0]}

# Correlation of model predictions and true labels: off-diagonal entry of the 2x2 matrix.
model_corr = np.corrcoef(predictions, y_test)[0, 1]
results['model']['corr'] = model_corr

print('Model metrics: ', results, sep='\n')

Model metrics: 
[{'test_loss': 0.0899789035320282, 'mse': 0.0899789035320282, 'mae': 0.22644585371017456, 'r2': -0.9302238821983337, 'rmse': 0.2940700352191925, 'corr': 0.013370234198257154}]
corr 0.013370234198257154


### Compare with baselines

In [37]:
def print_baseline_metrics(all_results, baseline_name):
    """Print a baseline's metrics and its MAE compared with the model's MAE.

    Args:
        all_results: dict mapping a predictor name to its metrics dict; must
            contain the key 'model' and the key given by `baseline_name`, and
            both metrics dicts must contain 'mae'.
        baseline_name: key of the baseline inside `all_results`.

    The reported difference is baseline MAE minus model MAE, so a positive
    value means the model has the lower error.
    """
    print('Baseline metrics: ')
    baseline_metrics = all_results[baseline_name]
    print(baseline_metrics)

    baseline_mae = baseline_metrics['mae']
    model_mae = all_results['model']['mae']
    mae_gain = baseline_mae - model_mae
    print(
        '\n MAE: Our prediction is better by ', mae_gain,
        ' our MAE: ', model_mae,
        '; baseline MAE: ', baseline_mae,
    )

#### Baseline #1 mean of the training dataset labels

In [38]:
from utils import get_baseline_metrics
from statistics import mean

# Baseline: predict the mean of the training dataset labels for every test sample
baseline_name = 'mean_baseline'
# NOTE(review): despite its name, `train_x_mean` is the mean of the labels (y_train), not of the inputs.
train_x_mean = mean(y_train)
# Constant prediction vector with one entry per test sample.
baseline_mean = np.full((len(y_test),), train_x_mean)
# The recorded output reports 'corr': None for this baseline; a constant prediction
# has no variance, so presumably the correlation is left undefined — confirm in get_baseline_metrics.
results[baseline_name] = get_baseline_metrics(baseline_mean, y_test)

print_baseline_metrics(results, baseline_name)

Baseline metrics: 
{'mse': 0.09032236039638519, 'mae': 0.2273177057504654, 'r2': -0.6783593893051147, 'rmse': 0.30053678154945374, 'corr': None}

 MAE: Our prediction is better by  0.0008718520402908325  our MAE:  0.22644585371017456 ; baseline MAE:  0.2273177057504654


#### Baseline #2 random in range(min_y_train, max_y_train) of the training dataset labels

In [39]:
# Baseline: for each test sample, draw a value uniformly at random between the
# minimum and the maximum of the training dataset labels
baseline_name = 'mean_rnd'
baseline_max = max(y_train)
baseline_min = min(y_train)
np.random.seed(42)  # fixed seed so the random baseline is the same on every run
print('min, max :', baseline_min, baseline_max)
# One uniform draw per test sample.
baseline_rnd = np.random.uniform(baseline_min, baseline_max, [len(y_test)])
results[baseline_name] = get_baseline_metrics(baseline_rnd, y_test)

print_baseline_metrics(results, baseline_name)

min, max : -0.983 0.961
Baseline metrics: 
{'mse': 0.4174586534500122, 'mae': 0.5432375073432922, 'r2': -6.757167339324951, 'rmse': 0.6461104154586792, 'corr': 0.01458594582629234}

 MAE: Our prediction is better by  0.3167916536331177  our MAE:  0.22644585371017456 ; baseline MAE:  0.5432375073432922


### Log

In [41]:
import json

# Write the collected metrics (model and baselines) to results.json in the
# current working directory, overwriting any existing file.
with open('results.json', 'w') as fp:
    json.dump(results, fp)

### Compare with Bartel - correlation and top predictions plot (i.e. genes with highest predicted FC plot)

### TODO metrics and comparison plots

In [2]:
# !conda list --explicit > spec-file.txt
!pip freeze > requirements.txt