In [1]:
import pandas as pd

In [2]:
from crr_labels import fantom

In [3]:
# Build the binarized enhancer and promoter labels for GM12878.
# NOTE(review): the earlier inline note on `genome` claimed only "hg19" is
# supported, yet this call passes "hg38" -- confirm against the crr_labels docs.
enhancers, promoters = fantom(
    # Cell lines to be considered.
    cell_lines=["GM12878"],
    # Window size to use for the various regions.
    window_size=256,
    # Genome version to consider.
    genome="hg38",
    # How to center the enhancer window: around the "peak" or the "center" of the region.
    center_enhancers="peak",
    # Activation threshold for the enhancers.
    enhancers_threshold=0,
    # Activation threshold for the promoters.
    promoters_threshold=5,
    # Whether to drop rows where no activation is detected; disabled here.
    drop_always_inactive_rows=False,
    # Return the data binary-encoded: zero for inactive, one for active.
    binarize=True,
    # Number of rows to read; useful for creating smaller datasets when testing pipelines.
    nrows=None,
)

In [4]:
# Write the promoter rows whose `lifted` flag is False to CSV, without the index.
promoters.loc[~promoters["lifted"]].to_csv("promoters_hg38.csv", index=False)

In [5]:
# Show the promoter rows whose `lifted` flag is True.
promoters.loc[promoters["lifted"]]

Unnamed: 0,chrom,chromStart,chromEnd,strand,GM12878,lifted
0,chr1,564344,564600,+,0,True
1,chr1,564393,564649,+,0,True
2,chr1,565022,565278,+,0,True
3,chr1,565227,565483,+,0,True
4,chr1,565285,565541,+,0,True
...,...,...,...,...,...,...
96717,chrY,21906594,21906850,-,0,True
96718,chrY,21906623,21906879,-,0,True
96719,chrY,21906761,21907017,-,0,True
96720,chrY,23613727,23613983,-,0,True


In [6]:
!head "fantom_data/hg38_fair+new_CAGE_peaks_phase1and2_tpm_ann.osc.txt"

##ColumnVariables[00Annotation]=CAGE peak id
##ColumnVariables[short_description]=short form of the description below. Common descriptions in the long descriptions has been omited
##ColumnVariables[description]=description of the CAGE peak
##ColumnVariables[association_with_transcript]=transcript which 5end is the nearest to the the CAGE peak
##ColumnVariables[entrezgene_id]=entrezgene (genes) id associated with the transcript
##ColumnVariables[hgnc_id]=hgnc (genes) id associated with the transcript
##ColumnVariables[uniprot_id]=uniprot (protein) id associated with the transcript
##ParemeterValue[genome_assembly]=hg38
##ColumnVariables[tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep1.CNhs14406.13541-145H4.hg38.nobarcode]=TPM (tags per million) of 293SLAM rinderpest infection, 00hr, biol_rep1.CNhs14406.13541-145H4.hg38.nobarcode
##ColumnVariables[tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep2.CNhs14407.13542-145H5.hg38.nobarcode]=TPM (tags per millio

In [7]:
# Load the tab-separated CAGE-peak TPM table; everything from a "#" onward is
# treated as a comment, which skips the "##..." metadata lines at the top.
fantom_data = pd.read_csv(
    "fantom_data/hg38_fair+new_CAGE_peaks_phase1and2_tpm_ann.osc.txt",
    low_memory=False,
    comment="#",
    sep="\t",
)

In [8]:
# Load the tab-separated (gzip-compressed) enhancer expression TPM matrix.
enhancers_data = pd.read_csv(
    "fantom_data/F5.hg38.enhancers.expression.tpm.matrix.gz",
    low_memory=False,
    sep="\t",
)

In [9]:
# Display the enhancer TPM matrix (pandas truncates the repr). The region ids
# ("chrom:start-end") sit in the unnamed first column, followed by one column
# per CNhs sample id.
enhancers_data

Unnamed: 0.1,Unnamed: 0,CNhs11844,CNhs11251,CNhs11282,CNhs10746,CNhs11253,CNhs13053,CNhs13054,CNhs13502,CNhs13052,...,CNhs10654,CNhs10635,CNhs11766,CNhs11765,CNhs10612,CNhs13464,CNhs11676,CNhs11763,CNhs12854,CNhs12844
0,chr10:100006233-100006603,1.168411,0.0,0.000000,0.000000,0.207444,0.0,0.000000,0.0,0.0,...,0.000000,0.149924,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
1,chr10:100008181-100008444,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.075973,0.201635,0.0,0.0,0.0,0.0
2,chr10:100014348-100014634,0.000000,0.0,0.192173,0.097232,0.000000,0.0,0.000000,0.0,0.0,...,0.080305,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
3,chr10:100020065-100020562,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,1.442423,0.000000,0.604904,0.0,0.0,0.0,0.0
4,chr10:100043485-100043744,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.234773,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.075973,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63280,chrY:7520195-7520556,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.149924,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
63281,chrY:7724230-7724512,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
63282,chrY:7769899-7770218,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.455839,0.000000,0.0,0.0,0.0,0.0
63283,chrY:7796227-7796534,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0


In [10]:
# Select the CAGE peaks whose "00Annotation" id contains the text 146017737.
fantom_data.loc[fantom_data["00Annotation"].str.contains("146017737")]

Unnamed: 0,00Annotation,short_description,description,association_with_transcript,entrezgene_id,hgnc_id,uniprot_id,tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep1.CNhs14406.13541-145H4.hg38.nobarcode,tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep2.CNhs14407.13542-145H5.hg38.nobarcode,tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep3.CNhs14408.13543-145H6.hg38.nobarcode,...,tpm.transitional-cell%20carcinoma%20cell%20line%3a5637.CNhs10735.10418-106C4.hg38.nobarcode,tpm.transitional-cell%20carcinoma%20cell%20line%3aJMSU1.CNhs11261.10492-107B6.hg38.nobarcode,tpm.tridermal%20teratoma%20cell%20line%3aHGRT.CNhs11828.10694-109G1.hg38.nobarcode,tpm.tubular%20adenocarcinoma%20cell%20line%3aSUIT-2.CNhs11883.10797-110I5.hg38.nobarcode,tpm.umbilical%20cord%2c%20fetal%2c%20donor1.CNhs11765.10057-101H3.hg38.nobarcode,tpm.uterus%2c%20adult%2c%20pool1.CNhs11676.10100-102D1.hg38.nobarcode,tpm.uterus%2c%20fetal%2c%20donor1.CNhs11763.10055-101H1.hg38.nobarcode,tpm.vagina%2c%20adult.CNhs12854.10204-103F6.hg38.nobarcode,tpm.vein%2c%20adult.CNhs12844.10191-103E2.hg38.nobarcode,tpm.xeroderma%20pigentosum%20b%20cell%20line%3aXPL%2017.CNhs11813.10563-108A5.hg38.nobarcode
95719,"hg19::chr8:146017737..146017748,-;hg_95721.1",p2@RPL8,CAGE_peak_2_at_RPL8_5end,"2bp_to_ENST00000528957.5,uc064rpz.1_5end",6132,HGNC:10368,P62917,28.246498,28.931535,27.971083,...,15.298796,8.461104,7.610119,9.904102,8.346005,1.706283,6.655068,1.938596,3.94557,10.681473


In [11]:
# Show the rows at positions 95717 through 95719 (end of the slice is exclusive).
fantom_data.iloc[95717:95720]

Unnamed: 0,00Annotation,short_description,description,association_with_transcript,entrezgene_id,hgnc_id,uniprot_id,tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep1.CNhs14406.13541-145H4.hg38.nobarcode,tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep2.CNhs14407.13542-145H5.hg38.nobarcode,tpm.293SLAM%20rinderpest%20infection%2c%2000hr%2c%20biol_rep3.CNhs14408.13543-145H6.hg38.nobarcode,...,tpm.transitional-cell%20carcinoma%20cell%20line%3a5637.CNhs10735.10418-106C4.hg38.nobarcode,tpm.transitional-cell%20carcinoma%20cell%20line%3aJMSU1.CNhs11261.10492-107B6.hg38.nobarcode,tpm.tridermal%20teratoma%20cell%20line%3aHGRT.CNhs11828.10694-109G1.hg38.nobarcode,tpm.tubular%20adenocarcinoma%20cell%20line%3aSUIT-2.CNhs11883.10797-110I5.hg38.nobarcode,tpm.umbilical%20cord%2c%20fetal%2c%20donor1.CNhs11765.10057-101H3.hg38.nobarcode,tpm.uterus%2c%20adult%2c%20pool1.CNhs11676.10100-102D1.hg38.nobarcode,tpm.uterus%2c%20fetal%2c%20donor1.CNhs11763.10055-101H1.hg38.nobarcode,tpm.vagina%2c%20adult.CNhs12854.10204-103F6.hg38.nobarcode,tpm.vein%2c%20adult.CNhs12844.10191-103E2.hg38.nobarcode,tpm.xeroderma%20pigentosum%20b%20cell%20line%3aXPL%2017.CNhs11813.10563-108A5.hg38.nobarcode
95717,"hg19::chr8:146016750..146016780,-;hg_95719.1",p3@RPL8,CAGE_peak_3_at_RPL8_5end,"494bp_to_ENST00000534781.1,uc064rpt.1_5end",6132,HGNC:10368,,3.116855,3.034776,2.700656,...,0.709216,2.19362,1.90253,0.931155,6.259504,0.511885,0.0,3.877193,3.507173,1.869258
95718,"hg19::chr8:146016823..146016858,-;hg_95720.1",p4@RPL8,CAGE_peak_4_at_RPL8_5end,-443bp_to_BC000047_5end,6132,HGNC:10368,,5.6493,4.855642,4.822601,...,0.405266,3.133742,5.707589,3.047416,2.086501,1.02377,2.047713,1.292398,5.041561,4.005552
95719,"hg19::chr8:146017737..146017748,-;hg_95721.1",p2@RPL8,CAGE_peak_2_at_RPL8_5end,"2bp_to_ENST00000528957.5,uc064rpz.1_5end",6132,HGNC:10368,P62917,28.246498,28.931535,27.971083,...,15.298796,8.461104,7.610119,9.904102,8.346005,1.706283,6.655068,1.938596,3.94557,10.681473
