In [1]:
import argparse
import os
import json
import torch
import pandas as pd

from utils.Utils import prepare_inputs_explainability, read_postar3_data
from utils.Plots import plot_boxplot_with_annotations
from utils.Config import Config
from modelsNN.modelsNN import DeepRBP
from calculate_deeplift_values import perform_deeplift_pipeline

  from .autonotebook import tqdm as notebook_tqdm


# Tutorial 2 : DeepRBP on POSTAR3 using a trained model with TCGA

In this tutorial, we show an example of how to validate the DeepRBP explainability module using a POSTAR3 binary matrix from the Huh7 and HepG2 tumoral liver cell lines, together with TCGA Liver Hepatocellular Carcinoma samples.

## Create the tissue-specific POSTAR3 file
Starting from the already downloaded *human.txt* file, which contains information on CLIP experiments from POSTAR3 (to download it, execute *DeepRBP/data/postar3/input_data/download_script.sh*), we are going to generate a POSTAR3 file for a specific tissue.

To do so we are going to run the process_script.sh:
```bash
cd DeepRBP/data/postar3/input_data
bash process_script.sh
```
This shell file executes the script analyze_postar3file.py and performs the following steps:
1) Read the general human POSTAR3 file
2) Remove NaNs in RBP name
3) Describe each tissue and create a table > DeepRBP/data/postar3/input_data/df_tissue_description.txt
4) Create tissue-specific POSTAR3 file


In [2]:
# Liver-specific POSTAR3 table; the first CSV column is used as the row index.
file_path = './../data/postar3/input_data/results/human_Liver.txt'
postar3_liver = pd.read_csv(filepath_or_buffer=file_path, index_col=0)
# Preview only the first rows instead of dumping the whole table.
postar3_liver.head(n=5)

Unnamed: 0,RBP_name,chromosome,start,end,raw_tissue,technique,experiment,strand,PhastCons_Score,PhyloP_Score,tissue
0,SF3A3,chr1,187892,187938,HepG2,eCLIP,ENCODE,-,0.018696,-0.186978,Liver
1,U2AF2,chr1,786667,786727,HepG2,eCLIP,ENCODE,+,0.0247,-0.020417,Liver
2,YTHDF2,chr1,819178,819200,Huh7,PAR-CLIP_PARalyzer,GSE83438_GSM2203039,+,0.003364,0.099364,Liver
3,SUGP2,chr1,830236,830282,HepG2,eCLIP,ENCODE,+,0.016152,-0.078478,Liver
4,SUGP2,chr1,830238,830290,HepG2,eCLIP,ENCODE,+,0.0175,-0.082231,Liver


## Create the tissue-specific Postar3 GxRBP binary matrix
After creating the POSTAR3 files for each tissue, which contain information about RBPs detected in specific genomic regions from CLIP experiments (in this case, liver), we generate the GxRBP matrix using the **_create_gxrbp.R_** script and these inputs, in addition to the Postar3 file:

- **_EventsFound_gencode23.txt_**: Information about the events: position, event type, name, ID, etc.
- **_Events_Regions_gc23_400nt.RData_**: Event regions


Run the following commands in a terminal:
```bash
cd DeepRBP/data/create_gxrbp
Rscript create_gxrbp.R
```

## Prepare input data samples for DeepRBP

In [2]:




# --- Run configuration for this tutorial -------------------------------------
tumor_type = 'Liver_Hepatocellular_Carcinoma'  # Must be a key of tumor2tissue below
# Passed to read_postar3_data further down. It has to be defined here, otherwise
# a fresh "Restart & Run All" fails with NameError. 'all' matches the recorded
# run, which reported "Selected cell line: Liver_all_GxRBP".
cell_line = 'all'
source_explain = 'TCGA'
model_selected = '1024N_2HL_8f'
path_data = '../data/TCGA_GTeX' # Path to TCGA and GTeX data folder
path_model = './model'  # Path to trained model with TCGA
path_result = './tutorial_2/results'  # Results folder
# Tumor type -> name(s) of the POSTAR3 GxRBP matrices available for it.
# When several are listed (liver), the concrete one is picked later from cell_line.
tumor2tissue = { 
            "Kidney_Chromophobe": "Kidney_embryo_GxRBP",
            "Liver_Hepatocellular_Carcinoma": ["Liver_huh7_GxRBP", "Liver_hepg2_GxRBP", "Liver_all_GxRBP"], 
            "Acute_Myeloid_Leukemia": "Myeloid_GxRBP"
            }
# Table handed to perform_deeplift_pipeline as `getBM`
# (presumably a BioMart gene/transcript annotation -- TODO confirm).
getBM = pd.read_csv('../data/extra/getBM_reduced.csv', index_col=0) 

In [3]:
# Echo the run configuration so the recorded output documents the settings used.
# One print call with a newline separator emits the same four lines.
print(
    "Welcome again! You're about to perform the explainability of the DeepRBP model",
    f"The experiments will be shown in {path_result}",
    f"The path_data used for this explainability is: {path_data}",
    f"The selected model to perform explainability is {model_selected}",
    sep="\n",
)

Welcome again! You're about to perform the explainability of the DeepRBP model
The experiments will be shown in ./tutorial_2/results
The path_data used for this explainability is: ../data/TCGA_GTeX
The selected model to perform explainability is 1024N_2HL_8f


In [4]:
# Read the configuration stored next to the trained model.
with open(f'{path_model}/config.json', 'r') as config_fh:
    config_dict = json.loads(config_fh.read())

# Restrict the test set to the tumor type chosen for the explainability run,
# then build the final config object from the (modified) dictionary.
config_dict.update(test_tumor_types=tumor_type)
config_obj = Config(**config_dict)
print(f'Samples selected to perform the explainability with POSTAR3: {config_obj.test_tumor_types}')

Samples selected to perform the explainability with POSTAR3: Liver_Hepatocellular_Carcinoma


In [7]:
### 1) Prepare the inputs used for the in-silico validation of the DL model.
# The returned object exposes df_scaled_test, test_labels, test_gn and device,
# which the following cells read.
data_inputs = prepare_inputs_explainability(
    tumor_type,
    source_explain,
    config_obj,
    path_data,
    path_model,
)

[prepare_inputs_explainability] Explainability is going to be performed for: Liver_Hepatocellular_Carcinoma of source TCGA
[Utils][get_data] set_mode is test
[Utils][get_data] We are not selecting all the tumor types
['Liver_Hepatocellular_Carcinoma']
[Utils][get_data] path_raw:  ../data/TCGA_GTeX/splitted_datasets/TCGA/test/Liver_Hepatocellular_Carcinoma
[Utils][get_data] tumor_type & source columns added to this set ... -> DONE
[Utils][get_data] path_raw:  ../data/TCGA_GTeX/splitted_datasets/GTEX/test/Liver
[Utils][get_data] tumor_type & source columns added to this set ... -> DONE
[prepare_inputs_explainability] Read Test data ... -> DONE
[prepare_inputs_explainability] Select just the samples of the TCGA source and remove column 'source' & 'tumor_type'... -> DONE
[utils][get_scaled_rbp_test_data] Scale test data with the scaler used in Training ... -> DONE 
[prepare_inputs_explainability] Load the scaler and sigma vale used for SF data in model training and scale the Test data ... 

In [8]:
# Quick look at what prepare_inputs_explainability returned: scaled test inputs,
# test labels, test_gn and the torch device. Printed one per line.
print(
    data_inputs.df_scaled_test,
    data_inputs.test_labels,
    data_inputs.test_gn,
    data_inputs.device,
    sep="\n",
)

                 A1CF      AATF     ABCF1      ABT1     ACAA2     ACIN1  \
TCGA-DD-A39V-01   1.0  0.618212  0.124688  0.012490  0.588729  0.000000   
TCGA-DD-AAD3-01   1.0  0.249162  0.644829  0.507840  0.876843  0.001470   
TCGA-BC-A110-11   1.0  0.000000  0.087211  0.223253  1.000000  0.000000   
TCGA-ES-A2HT-11   1.0  0.000000  0.275138  0.268314  1.000000  0.000000   
TCGA-DD-A3A3-11   1.0  0.047807  0.134242  0.349566  1.000000  0.000000   
...               ...       ...       ...       ...       ...       ...   
TCGA-GJ-A9DB-01   1.0  0.343686  0.565449  0.481261  1.000000  0.373714   
TCGA-ZS-A9CF-01   1.0  0.191022  0.533718  0.732437  0.778904  0.198123   
TCGA-YA-A8S7-01   1.0  0.284323  0.387732  0.307912  0.598954  0.714615   
TCGA-BC-A69H-01   1.0  0.646556  0.244497  0.176502  0.517717  0.000000   
TCGA-RC-A6M3-01   1.0  0.329724  0.299302  0.024610  0.628679  0.607246   

                     ACTB     ACTN1     ACTN4      ADAR  ...    ZNF598  \
TCGA-DD-A39V-01  0.460036

In [14]:
### Build the network and restore the trained weights
# Layer widths come from the data: one input per column of the scaled test
# frame, one output per column of the test labels.
model = DeepRBP(
    n_inputs=data_inputs.df_scaled_test.shape[1],
    n_outputs=data_inputs.test_labels.shape[1],
    config=config_obj,
    device=data_inputs.device,
)
# map_location places the stored tensors on the device selected for this run.
trained_state = torch.load(path_model+'/model.pt', map_location=data_inputs.device)
model.load_state_dict(trained_state)
print(model)
# Switch to inference mode; as the last expression it also echoes the module repr.
model.eval()

You are using a model with 2 hidden layers
DeepRBP(
  (linear0): Linear(in_features=1282, out_features=1024, bias=True)
  (bn0): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (a_function0): ReLU()
  (linear1): Linear(in_features=1024, out_features=128, bias=True)
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (a_function1): ReLU()
  (final_layer): Linear(in_features=128, out_features=11462, bias=True)
)


DeepRBP(
  (linear0): Linear(in_features=1282, out_features=1024, bias=True)
  (bn0): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (a_function0): ReLU()
  (linear1): Linear(in_features=1024, out_features=128, bias=True)
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (a_function1): ReLU()
  (final_layer): Linear(in_features=128, out_features=11462, bias=True)
)

In [15]:
### 2) Load POSTAR3 experimental data with GxRBP relationships
# tumor2tissue lists the matrices available for tumor_type; cell_line selects
# which of them is read from under path_data.
df_val_GxRBP = read_postar3_data(
    path_data,
    tumor_type,
    tumor2tissue,
    cell_line,
)
print(df_val_GxRBP)


[explainability][read_validation_data] The selected tissue test used to do the explainability is: Liver_Hepatocellular_Carcinoma
[explainability][read_validation_data] For this tissue you have available this Postar data: ['Liver_huh7_GxRBP', 'Liver_hepg2_GxRBP', 'Liver_all_GxRBP']
[explainability][read_validation_data] Selected cell line: Liver_all_GxRBP
[explainability][read_validation_data] Loading Postar data ...


FileNotFoundError: [Errno 2] No such file or directory: '../data/TCGA_GTeX/validation_regulation/gencode_23/create_exRBP/result/Liver_all_GxRBP.csv'

In [None]:

### 3) Perform DEEPLIFT method
print('[explainability] 3) Do DeepLIFT ... -> Initializing')

# Run the DeepLIFT pipeline on the scaled test samples. The call is unpacked
# into two score tables, named for transcript-level (TxRBP) and gene-level
# (GxRBP) results. Outputs are written under path_result.
#
# `model` is the network built, loaded with the trained weights and put in eval
# mode in the "Load the trained model" cell. The notebook never shows
# `data_inputs` carrying a model attribute, so the explicitly loaded one is used.
#
# The former `result.*` look-ups were dropped: `result` was only assigned in
# commented-out legacy code, so they raised NameError.
df_deeplift_scores_TxRBP, df_deeplift_scores_GxRBP = perform_deeplift_pipeline(
    df_scaled_test=data_inputs.df_scaled_test,
    test_labels=data_inputs.test_labels,
    test_gn=data_inputs.test_gn,
    model=model,
    path_save=path_result,
    getBM=getBM,
    select_reference='knockout',
    method='tstat',
)

print('[explainability] 3) DeepLIFT ... -> DONE')
