In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import random

import warnings
# NOTE(review): called with only the "ignore" action, this filter matches every
# warning category, so pandas' chained-assignment warnings are hidden as well.
# Consider narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

In [2]:
# Read the three HDF5 inputs; the paths are relative to the working directory.
# Later cells use the columns of df_dnase as "<chrom>-<start>-<end>" site labels
# and the index of df_gene_info as column labels into df_rna.
df_rna = pd.read_hdf(path_or_buf="rna_scaled.hdf")
df_dnase = pd.read_hdf(path_or_buf="dnase_scaled.hdf")
df_gene_info = pd.read_hdf(path_or_buf="df_gene_info.hdf")

In [3]:
def find_gene_list(dnase_position, df_gene_info, distance):
    """Return the index labels of genes that start near a DNase site.

    Parameters
    ----------
    dnase_position : str
        Site label of the form "<chrom>-<start>-<end>", for example
        "chr1-100115518-100116996". Only the first three "-"-separated
        fields are read.
    df_gene_info : pandas.DataFrame
        Gene table with a "chr" column and a "start" column. "chr" is compared
        with the site's chromosome name after every "chr" substring has been
        removed (so "chr1" is matched against "1").
    distance : int
        Margin added on both sides of the site: a gene is kept when its
        "start" lies in [site_start - distance, site_end + distance],
        both bounds inclusive.

    Returns
    -------
    pandas.Index
        Index labels of the rows of ``df_gene_info`` that pass both filters.
    """
    fields = dnase_position.split("-")
    site_start, site_end = int(fields[1]), int(fields[2])

    # df_gene_info stores chromosome names without the "chr" prefix
    chrom_label = fields[0].replace("chr", "")
    window_lo, window_hi = site_start - distance, site_end + distance

    # restrict to the chromosome first, then to gene starts inside the window
    same_chrom = df_gene_info[df_gene_info["chr"] == chrom_label]
    start_in_window = (same_chrom["start"] >= window_lo) & (same_chrom["start"] <= window_hi)
    return same_chrom[start_in_window].index

In [13]:
# use the DNase site at column position 10 (0-based, i.e. the 11th column) as the example
df_dnase.columns[10]

'chr1-100115518-100116996'

In [16]:
# Genes whose start falls within 1,000,000 coordinate units of the example
# DNase site (column position 10 of df_dnase), as a plain Python list.
RNA_list = list(
    find_gene_list(
        dnase_position=df_dnase.columns[10],
        df_gene_info=df_gene_info,
        distance=1000000,
    )
)

In [24]:
# Keep every row of df_rna but only the columns named in RNA_list.
df_input = df_rna.loc[:, RNA_list]
df_input

Unnamed: 0,ENSG00000156876,ENSG00000122477,ENSG00000162688,ENSG00000137992,ENSG00000162627,ENSG00000137996,ENSG00000227034,ENSG00000079335,ENSG00000241073,ENSG00000099260,...,ENSG00000224616,ENSG00000228084,ENSG00000117620,ENSG00000234332,ENSG00000117600,ENSG00000228086,ENSG00000117598,ENSG00000156869,ENSG00000207750,ENSG00000122435
H1_BMP4_Derived_Mesendoderm_Cultured_Cells,1.675607,-0.735318,0.365903,0.319408,0.189994,0.968056,-0.149804,1.104085,1.6098,-0.793179,...,-0.417711,-0.931057,-0.224806,1.6274,0.997413,-0.553025,0.712248,0.42321,-0.087139,0.75502
Penis_Foreskin_Keratinocyte_Primary_Cells_skin02,0.37946,-0.669854,-0.852066,-1.044904,0.033333,-0.485329,4.076501,-0.712368,0.728072,-0.118082,...,-0.528925,0.819957,-0.702718,-0.745447,-0.4348,-0.526022,-0.183165,0.279947,0.419041,-1.497214
H1_BMP4_Derived_Trophoblast_Cultured_Cells,1.420718,-0.41526,0.325734,0.014072,0.129443,0.43504,-0.338612,0.910753,1.554695,-0.607365,...,0.254069,-1.069585,-0.403149,1.070477,-0.389857,-0.554078,0.369281,2.479506,-0.047572,-0.671272
Breast_vHMEC,-0.7929,-0.021341,0.528031,0.270348,0.939898,1.461064,-0.503601,-0.532315,-0.90731,0.45938,...,-1.316754,0.312845,1.153638,-0.787277,-0.588759,0.873599,-0.47307,1.001767,1.636679,-0.592714
H1_Derived_Mesenchymal_Stem_Cells,-0.355488,-0.209237,0.034038,0.315475,0.975851,1.470389,-0.050914,-0.52416,-0.225612,-0.210331,...,0.375618,0.083394,-0.329676,0.78911,0.617039,-0.534735,-0.488605,1.365102,1.381489,-0.376443
Fetal_Intestine_Small,-0.726041,0.035233,0.573811,0.293801,0.624129,0.062224,-0.386999,1.328325,-0.134674,-0.713607,...,0.096547,1.868214,2.421324,-1.330772,-0.538806,2.087309,-0.556246,-0.101609,-0.832474,0.888487
Fetal_Intestine_Large,-0.743069,0.117066,0.354649,-0.067521,0.616283,0.197695,-0.466436,0.848236,-0.376241,-0.602244,...,-0.293426,0.08449,2.005075,-1.505532,-0.421846,1.473324,-0.067366,-0.789363,-0.694504,1.217188
H1_Derived_Neuronal_Progenitor_Cultured_Cells,1.874422,-0.281872,-0.229156,0.10252,0.628949,-1.01494,-0.227594,1.42414,1.669391,-0.85948,...,0.570791,0.686148,-0.356498,0.770776,-0.36433,-0.547775,0.115102,-0.362818,-0.660724,0.097614
Psoas_Muscle,0.264557,3.521624,3.377464,1.711289,-1.915901,1.853354,-0.426403,-0.748575,0.392267,2.560651,...,2.890711,1.091008,-0.520656,1.168766,-0.380424,-0.539198,-0.545234,0.462254,-1.320637,1.176561
Gastric,-1.164751,-0.425141,-0.264796,1.956054,0.593702,0.746852,-0.487686,-0.424241,-0.906584,-0.278936,...,0.629789,-0.161821,-0.111255,1.262561,-0.525186,-0.409038,-0.547634,-0.418862,-1.787398,-0.347222


In [25]:
# show the label of the example DNase site (column position 10, 0-based) again
df_dnase.columns[10]

'chr1-100115518-100116996'

In [28]:
# Column of df_dnase for the example site (column position 10, 0-based).
# Take an explicit copy: `output` is overwritten in place further down, and
# without the copy that write can reach df_dnase itself on pandas versions
# that do not use copy-on-write.
output = df_dnase[df_dnase.columns[10]].copy()

In [33]:
# Binarise the signal: 1 where the value is non-zero, 0 where it is zero.
# A new Series is bound to `output` instead of assigning through a boolean
# mask, so the frame the column was selected from is never modified and
# re-running the cell gives the same result. The original dtype is kept.
output = (output != 0).astype(output.dtype)
output

tissue
H1_BMP4_Derived_Mesendoderm_Cultured_Cells          1.0
Penis_Foreskin_Keratinocyte_Primary_Cells_skin02    1.0
H1_BMP4_Derived_Trophoblast_Cultured_Cells          0.0
Breast_vHMEC                                        1.0
H1_Derived_Mesenchymal_Stem_Cells                   0.0
Fetal_Intestine_Small                               0.0
Fetal_Intestine_Large                               0.0
H1_Derived_Neuronal_Progenitor_Cultured_Cells       0.0
Psoas_Muscle                                        1.0
Gastric                                             1.0
Penis_Foreskin_Fibroblast_Primary_Cells_skin01      0.0
Pancreas                                            1.0
Mobilized_CD34_Primary_Cells_Female                 0.0
Penis_Foreskin_Melanocyte_Primary_Cells_skin01      1.0
Ovary                                               1.0
Penis_Foreskin_Fibroblast_Primary_Cells_skin02      1.0
H1_Cell_Line                                        1.0
Fetal_Brain_Female                       