## **Module**

In [1]:
from datetime import datetime
import os
import re
import gc

import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
from sklearn.manifold import Isomap
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.cluster import KMeans
from sklearn.impute import KNNImputer
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import umap.umap_ as umap

# keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping

## **User-Defined Functions (UDF)**

In [2]:
def cancer_select(DF, cancer_type):
    """Return the columns of ``DF`` that are primary-tumor samples of a TCGA project.

    Two phenotype tables are downloaded on every call (GDC PANCAN basic
    phenotype and the TCGA Pan-Cancer Atlas dense phenotype), joined on the
    sample barcode, and filtered to ``sample_type == "Primary Tumor"``.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame whose column labels are compared against the sample barcodes.
    cancer_type : str
        ``"PAN"`` / ``"PANCAN"`` keeps every project; any other value keeps
        only rows whose ``project_id`` equals ``"TCGA-" + cancer_type``.

    Returns
    -------
    list
        Unique column labels of ``DF`` that matched, in the order they appear
        in ``DF.columns`` (deterministic from run to run).
    """
    # phenotype
    phe1 = pd.read_csv("https://gdc-hub.s3.us-east-1.amazonaws.com/download/GDC-PANCAN.basic_phenotype.tsv.gz", sep="\t")
    phe1 = (
        phe1.loc[phe1.program == "TCGA", ['sample', 'sample_type', 'project_id']]
        .drop_duplicates(['sample'])
        # Drop the last barcode character so the key matches the barcodes of
        # the second table; vectorised instead of a row-wise apply.
        .assign(sample=lambda frame: frame['sample'].str[:-1])
    )
    phe2 = pd.read_csv("https://tcga-pancan-atlas-hub.s3.us-east-1.amazonaws.com/download/TCGA_phenotype_denseDataOnlyDownload.tsv.gz", sep="\t")
    ph_join = pd.merge(left=phe2, right=phe1, how="left", on="sample").dropna(subset=['project_id'])

    # Both tables carry a `sample_type` column; after the merge the one coming
    # from phe1 is suffixed `_y`.
    keep = ph_join['sample_type_y'] == "Primary Tumor"
    if cancer_type not in ("PAN", "PANCAN"):
        keep &= ph_join['project_id'] == "TCGA-" + cancer_type
    sample_barcode = set(ph_join.loc[keep, "sample"])

    # Walk DF's own columns (de-duplicated, order kept) instead of returning
    # list(set(...)), whose order changes between interpreter runs.
    return [column for column in dict.fromkeys(DF.columns) if column in sample_barcode]

def non_zero_column(DF):
    """Return the row labels of ``DF`` that contain few enough zeros.

    Despite the name, the filter works on rows: a row label is kept when the
    number of cells equal to 0 in that row is strictly below
    ``int(number_of_columns * 0.2)``.

    Parameters
    ----------
    DF : pandas.DataFrame

    Returns
    -------
    list
        Index labels of the rows that pass the filter, in index order.
    """
    zero_limit = int(len(DF.columns) * 0.2)
    zeros_per_row = DF.isin([0]).sum(axis=1)
    passing_rows = zeros_per_row[zeros_per_row < zero_limit]
    return passing_rows.index.tolist()


def determine_outlier_thresholds_iqr(dataframe, col_name, th1=0.05, th3=0.95):
    """Compute lower/upper outlier fences for one column.

    The fences are the ``th1`` and ``th3`` quantiles of ``dataframe[col_name]``
    pushed outwards by 1.5 times the distance between those two quantiles.

    Returns
    -------
    tuple
        ``(lower_limit, upper_limit)``.
    """
    column = dataframe[col_name]
    low_quantile = column.quantile(th1)
    high_quantile = column.quantile(th3)
    margin = 1.5 * (high_quantile - low_quantile)
    return low_quantile - margin, high_quantile + margin

def check_outliers_iqr(dataframe, col_name, th1=0.05, th3=0.95):
    """Tell whether ``col_name`` has at least one value outside its outlier fences.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    col_name : hashable
        Column to inspect.
    th1, th3 : float, optional
        Quantiles forwarded to ``determine_outlier_thresholds_iqr``; the
        defaults match that function's defaults, so existing two-argument
        calls keep using the same fences.

    Returns
    -------
    bool
        ``True`` when any value is above the upper or below the lower fence.
    """
    lower_limit, upper_limit = determine_outlier_thresholds_iqr(dataframe, col_name, th1, th3)
    column = dataframe[col_name]
    outside = (column > upper_limit) | (column < lower_limit)
    # Reduce the boolean mask itself. Calling .any(axis=None) on the filtered
    # frame instead tests whether any *cell* of the outlier rows is truthy,
    # which misses outlier rows made only of zeros/NaN.
    return bool(outside.any())
    
def replace_with_thresholds_iqr(dataframe, cols, th1=0.05, th3=0.95, replace=False):
    """Optionally cap the values of ``cols`` at their outlier fences.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Modified in place when ``replace`` is true.
    cols : iterable
        Column labels to process; a column named ``'Outcome'`` is skipped.
    th1, th3 : float, optional
        Quantiles passed to ``determine_outlier_thresholds_iqr``; the same
        fences are used both to detect and to cap outliers.
    replace : bool, optional
        When false (the default) nothing is changed.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.
    """
    # The earlier version also assembled a `tabulate` summary that was never
    # printed or returned; it only added a hard dependency on that package,
    # so without `replace` there is nothing left to do.
    if not replace:
        return dataframe

    for col_name in cols:
        if col_name == 'Outcome':
            continue
        lower_limit, upper_limit = determine_outlier_thresholds_iqr(dataframe, col_name, th1, th3)
        # clip() leaves in-range and missing values untouched and pulls
        # everything else back onto the nearest fence.
        dataframe[col_name] = dataframe[col_name].clip(lower=lower_limit, upper=upper_limit)

    return dataframe


def get_dataset(path, cancer_type, std):
    """Load a non-scaled feature table, scale it and impute missing values.

    Parameters
    ----------
    path : str
        Base directory; the table is read from
        ``<path>/feature/input_feature_non_scale/<cancer_type>.txt``.
    cancer_type : str
        File stem. For ``"PANCAN"`` only ``Sample_barcode`` is dropped,
        otherwise ``Project`` and ``Sample_barcode`` are dropped.
    std : bool
        Truthy -> ``StandardScaler``; falsy -> ``MinMaxScaler``.

    Returns
    -------
    pandas.DataFrame
        Scaled, KNN-imputed features with the surviving column names and a
        fresh default index (sample barcodes are not carried over).
    """
    # load raw (non-scaled) DF
    DF = pd.read_csv(path + "/feature/input_feature_non_scale/" + cancer_type + ".txt", sep="\t")

    # identifier columns are not features
    if cancer_type == "PANCAN":
        id_columns = ["Sample_barcode"]
    else:
        id_columns = ["Project", "Sample_barcode"]
    DF = DF.drop(id_columns, axis=1)

    # remove outlier
    # NOTE(review): replace_with_thresholds_iqr is called with its default
    # replace=False, so no value is actually capped here - confirm whether
    # replace=True was intended.
    DF = replace_with_thresholds_iqr(dataframe=DF, cols=DF.columns)

    # drop columns in which more than half of the rows are missing
    max_number_of_nas = DF.shape[0] / 2
    DF = DF.loc[:, (DF.isnull().sum(axis=0) <= max_number_of_nas)]

    # normalization - both scalers are fitted on DF itself (the MinMax branch
    # used to fit on an undefined `X_train` and raised NameError)
    scalerX = StandardScaler() if std else MinMaxScaler()
    DF_scale = scalerX.fit_transform(DF)

    # missing impute
    imputer = KNNImputer(n_neighbors=10)
    DF_scale_impute = imputer.fit_transform(DF_scale)

    return pd.DataFrame(DF_scale_impute, columns=DF.columns)

# z = z_mean + exp(0.5 * z_log_var) * epsilon
def sampling(args):
    """Reparameterization trick by sampling from an isotropic unit Gaussian.

    # Arguments
        args (tensor): mean and log of variance of Q(z|X)
    # Returns
        z (tensor): sampled latent vector
    """
    z_mean, z_log_var = args
    # The Keras backend alias `K` is never imported in this notebook, so the
    # TensorFlow ops are called through the already-imported `tf` instead.
    # tf.random.normal defaults to mean = 0 and stddev = 1.0; the noise takes
    # the runtime shape and dtype of z_mean.
    epsilon = tf.random.normal(shape=tf.shape(z_mean), dtype=z_mean.dtype)
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

def recall_m(y_true, y_pred):
    """Batch recall: rounded true positives over rounded actual positives.

    Both the product ``y_true * y_pred`` and ``y_true`` are clipped to [0, 1]
    and rounded before summing; the Keras epsilon in the denominator avoids
    division by zero.
    """
    # Uses `tf` directly because the backend alias `K` is not imported anywhere
    # in this notebook.
    true_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true * y_pred, 0, 1)))
    possible_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true, 0, 1)))
    return true_positives / (possible_positives + tf.keras.backend.epsilon())

def precision_m(y_true, y_pred):
    """Batch precision: rounded true positives over rounded predicted positives.

    Both the product ``y_true * y_pred`` and ``y_pred`` are clipped to [0, 1]
    and rounded before summing; the Keras epsilon in the denominator avoids
    division by zero.
    """
    # Uses `tf` directly because the backend alias `K` is not imported anywhere
    # in this notebook.
    true_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true * y_pred, 0, 1)))
    predicted_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_pred, 0, 1)))
    return true_positives / (predicted_positives + tf.keras.backend.epsilon())

def f1_score(y_true, y_pred):
    """Harmonic mean of ``precision_m`` and ``recall_m`` for one batch.

    The Keras epsilon is added to the denominator so the score is 0 rather
    than NaN when precision and recall are both 0.
    """
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    # `K` is not imported in this notebook; read epsilon through `tf`.
    return 2 * ((precision * recall) / (precision + recall + tf.keras.backend.epsilon()))

def get_train_test_DF(x, y):
    """Split ``x`` and ``y`` into a 70/30 stratified, shuffled train/test partition.

    The split is stratified on ``y`` and uses the fixed ``random_state=32``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        x,
        y,
        test_size=0.3,
        shuffle=True,
        random_state=32,
        stratify=y,
    )
    features_train, features_test, target_train, target_test = parts
    return features_train, features_test, target_train, target_test

def createFolder(path):
    """Create directory ``path`` (and missing parents) if it does not exist.

    Best-effort: an ``OSError`` is reported on stdout and not re-raised.
    """
    try:
        # exist_ok=True removes the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(path, exist_ok=True)
    except OSError as err:
        # An existing directory no longer lands here, so report the real cause
        # (permissions, a file occupying the name, ...) instead of
        # "already exists".
        print('Could not create folder ' + path + ': ' + str(err))

## **Preprocessing**

### **File Path**

In [None]:
# Display the kernel's current working directory (the value is only shown, not stored).
os.getcwd()

In [3]:
# Data and model locations. The defaults are the original machine-specific
# paths; set RAW_FILE_PATH / MODEL_PATH in the environment to run the notebook
# elsewhere. os.path.join(..., "") guarantees the trailing separator that the
# string-concatenated file names in later cells rely on.
RAW_file_path = os.path.join(
    os.environ.get("RAW_FILE_PATH", "/home/wmbio/WORK/multi-omics/HCC_integration/RAW_DATA/"), ""
)
MODEL_path = os.path.join(os.environ.get("MODEL_PATH", "/home/jovyan/work/models/"), "")

* **RNA gene expression**

In [6]:
# Read the tab-separated RSEM normalised-count table; its first column becomes the row index.
# (presumably features in rows and sample barcodes in columns - confirm against the file)
rna = pd.read_csv(RAW_file_path + "tcga_RSEM_Hugo_norm_count.gz", sep = "\t", index_col=0)
# Keep only the columns that cancer_select returns for cancer_type "LIHC".
rna = rna[cancer_select(DF=rna,cancer_type="LIHC")]
# Keep the rows whose labels pass non_zero_column's zero-count filter, then transpose.
rna = rna[rna.index.isin(non_zero_column(rna))].T

* **miRNA**

In [8]:
# Read the tab-separated pan-cancer miRNA table; its first column becomes the row index.
# (presumably features in rows and sample barcodes in columns - confirm against the file)
mirna = pd.read_csv(RAW_file_path + "pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz", sep = "\t", index_col=0)
# Keep only the columns that cancer_select returns for cancer_type "LIHC".
mirna = mirna[cancer_select(DF=mirna,cancer_type="LIHC")]
# Keep the rows whose labels pass non_zero_column's zero-count filter, then transpose.
mirna = mirna[mirna.index.isin(non_zero_column(mirna))].T

* **Methylation**

In [15]:
# TODO: this cell was left as an unfinished assignment (`mt = `), which is a SyntaxError.
# The methylation matrix is read into `mt` by the pd.read_csv cell further down.

In [None]:
    # Reference R implementation of the methylation preprocessing (this is R, not
    # Python, so it cannot run in a Python kernel as-is).
    # Methylation: probes that map to the same gene are averaged into one value per gene.
    # The helpers cnacer_selector(), transpose_df() and zero_col_remove() are not
    # defined in this notebook - presumably they live in the original R project.
    mt_map <- fread(file = paste0(RAW_file_path, "probeMap_illuminaMethyl450_hg19_GPL16304_TCGAlegacy"), sep = "\t")  
    
    # Pipeline: keep `sample` plus the selected LIHC columns, attach the probe map
    # (sample == #id), drop rows with gene "." or missing chrom, average the numeric
    # columns per gene, transpose, drop the columns named by zero_col_remove(), and
    # rename `gene` to `sample`.
    mt <- fread(paste0(RAW_file_path, "jhu-usc.edu_PANCAN_HumanMethylation450.betaValue_whitelisted.tsv.synapse_download_5096262.xena.gz"), check.names = T) %>%
      select(sample, any_of(cnacer_selector("LIHC"))) %>%
      left_join(x = ., y = mt_map, by = c("sample" = "#id" )) %>% 
      select(-sample, -strand) %>% 
      select(gene, chrom, chromStart, chromEnd, everything()) %>% 
      filter(gene != ".", !is.na(chrom)) %>% 
      arrange(chrom, gene) %>%
      select(-chrom, -chromStart, -chromEnd) %>%
      group_by(gene) %>% 
      summarise_if(is.numeric, mean, na.rm = TRUE) %>%
      transpose_df() %>%
      select(-any_of(zero_col_remove(.))) %>%
      rename(sample = gene)

In [10]:
# Read the tab-separated Illumina 450k probe map.
# (the R reference cell joins it on "#id" and uses its gene/chrom/chromStart/chromEnd/strand columns)
mt_map = pd.read_csv(RAW_file_path + "probeMap_illuminaMethyl450_hg19_GPL16304_TCGAlegacy", sep="\t")

In [13]:
# The methylation beta-value matrix is a .tsv download, like the other tables read
# in this notebook, so the tab delimiter must be given explicitly; with pandas'
# default comma delimiter each line would be parsed as a single wide column.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt - the file
# is presumably very large; consider reading only the needed sample columns via
# `usecols` once the LIHC barcodes are known.
mt = pd.read_csv(RAW_file_path + "jhu-usc.edu_PANCAN_HumanMethylation450.betaValue_whitelisted.tsv.synapse_download_5096262.xena.gz", sep="\t")

KeyboardInterrupt: 