In [4]:
import os
import numpy as np
import pandas as pd

In [5]:
# Ask pandas to use its HTML representation for frames shown in the notebook.
pd.set_option('display.notebook_repr_html', True)


def _repr_latex_(self):
    r"""Return the object's LaTeX table wrapped in ``\centering{...}``.

    Parameters
    ----------
    self : object exposing ``to_latex()``
        Intended to be a ``pandas.DataFrame`` once attached to the class below.

    Returns
    -------
    str
        ``\centering{<output of self.to_latex()>}``.
    """
    # Raw string: "\c" is not a recognised escape sequence, so the non-raw
    # literal triggers an invalid-escape warning on current Python versions.
    # The resulting text is unchanged.
    return r"\centering{%s}" % self.to_latex()


# Monkey patch: attach the function to DataFrame under the `_repr_latex_` name
# so every frame carries a LaTeX representation.
pd.DataFrame._repr_latex_ = _repr_latex_

Normal tissue data
Expression profiles for proteins in human tissues based on immunohistochemistry using tissue micro arrays. The comma-separated file includes Ensembl gene identifier ("Gene"), tissue name ("Tissue"), annotated cell type ("Cell type"), expression value ("Level"), and the gene reliability of the expression value ("Reliability"). The data is based on The Human Protein Atlas version 16 and Ensembl version 83.38.

In [6]:
# Load the normal-tissue expression table.
df1 = pd.read_csv(
    filepath_or_buffer='Data/normal_tissue.csv.zip',
    sep=',',
)

In [7]:
# Preview the first rows of the normal-tissue table.
df1.head()

Unnamed: 0,Gene,Gene name,Tissue,Cell type,Level,Reliability
0,ENSG00000000003,TSPAN6,adrenal gland,glandular cells,Not detected,Uncertain
1,ENSG00000000003,TSPAN6,appendix,glandular cells,Medium,Uncertain
2,ENSG00000000003,TSPAN6,appendix,lymphoid tissue,Not detected,Uncertain
3,ENSG00000000003,TSPAN6,bone marrow,hematopoietic cells,Not detected,Uncertain
4,ENSG00000000003,TSPAN6,breast,adipocytes,Not detected,Uncertain


In [52]:
# Number of distinct gene names in the normal-tissue table
# (dropna=False keeps a missing value counted, as unique() does).
df1['Gene name'].nunique(dropna=False)

12978

RNA gene data
RNA levels in 56 cell lines and 37 tissues based on RNA-seq. The comma-separated file includes Ensembl gene identifier ("Gene"), analysed sample ("Sample") and the expression value with its unit ("Value" and "Unit"; the values loaded below are reported in TPM, transcripts per million). The data is based on The Human Protein Atlas version 16 and Ensembl version 83.38.

RNA sequencing data for human cell lines

In [9]:
# Load the gene-level RNA table for cell lines.
df2 = pd.read_csv(
    filepath_or_buffer='Data/rna_celline.csv.zip',
    sep=',',
)

In [10]:
# Preview the first rows of the cell-line RNA table.
df2.head()

Unnamed: 0,Gene,Gene name,Sample,Value,Unit
0,ENSG00000000003,TSPAN6,A-431,27.8,TPM
1,ENSG00000000003,TSPAN6,A549,37.6,TPM
2,ENSG00000000003,TSPAN6,AF22,108.2,TPM
3,ENSG00000000003,TSPAN6,AN3-CA,51.8,TPM
4,ENSG00000000003,TSPAN6,ASC TERT1,17.8,TPM


In [11]:
# Shape of the array of distinct samples, i.e. how many cell lines appear.
df2['Sample'].unique().shape

(56,)

In [12]:
# (rows, columns) of the cell-line RNA table.
df2.shape

(1099168, 5)

RNA gene data
RNA levels in 56 cell lines and 37 tissues based on RNA-seq. The comma-separated file includes Ensembl gene identifier ("Gene"), analysed sample ("Sample") and the expression value with its unit ("Value" and "Unit"; the values loaded below are reported in TPM, transcripts per million). The data is based on The Human Protein Atlas version 16 and Ensembl version 83.38.

RNA sequencing data for human tissue

In [13]:
# Load the gene-level RNA table for tissues.
df3 = pd.read_csv(
    filepath_or_buffer='Data/rna_tissue.csv.zip',
    sep=',',
)

In [14]:
# Preview the first rows of the tissue RNA table.
df3.head()

Unnamed: 0,Gene,Gene name,Sample,Value,Unit
0,ENSG00000000003,TSPAN6,adipose tissue,31.5,TPM
1,ENSG00000000003,TSPAN6,adrenal gland,26.5,TPM
2,ENSG00000000003,TSPAN6,appendix,9.5,TPM
3,ENSG00000000003,TSPAN6,bone marrow,0.7,TPM
4,ENSG00000000003,TSPAN6,breast,53.0,TPM


In [15]:
# Shape of the array of distinct samples, i.e. how many tissues appear.
df3['Sample'].unique().shape

(37,)

In [16]:
# (rows, columns) of the tissue RNA table.
df3.shape

(726236, 5)

Subcellular location data
Subcellular localization of proteins based on immunofluorescently stained cells. The comma-separated file includes Ensembl gene identifier ("Gene"), main subcellular location of the protein ("Main location"), other locations ("Other location"), and the gene reliability of the expression value ("Reliability"). The data is based on The Human Protein Atlas version 16 and Ensembl version 83.38.

In [17]:
# Load the subcellular-location table.
df4 = pd.read_csv(
    filepath_or_buffer='Data/subcellular_location.csv.zip',
    sep=',',
)

In [18]:
# Preview the first rows of the subcellular-location table.
df4.head()

Unnamed: 0,Gene,Gene name,Reliability,Validated,Supportive,Uncertain,Unreliable,Cell-to-cell variation intensity,Cell-to-cell variation spatial,Cell cycle dependency,GO id
0,ENSG00000000003,TSPAN6,Uncertain,,,Cytosol,,,,,Cytosol (GO:0005829)
1,ENSG00000000457,SCYL3,Unreliable,,,,Microtubules;Nuclear bodies,,,,Microtubules (GO:0015630);Nuclear bodies (GO:0...
2,ENSG00000000460,C1orf112,Uncertain,,,Mitochondria,,,,,Mitochondria (GO:0005739)
3,ENSG00000000938,FGR,Uncertain,,,Aggresome;Plasma membrane,,,,,Aggresome (GO:0016235);Plasma membrane (GO:000...
4,ENSG00000000971,CFH,Uncertain,,,Vesicles,,,,,Vesicles (GO:0043231)


Cancer tumor data
Staining profiles for proteins in human tumor tissue based on immunohistochemistry using tissue micro arrays. The comma-separated file includes Ensembl gene identifier ("Gene"), tumor name ("Tumor"), staining value ("Level"), the number of patients that stain for this staining value ("Count patients") and the total amount of patients for this tumor type ("Total patients"). The data is based on The Human Protein Atlas version 16 and Ensembl version 83.38.

In [19]:
# Load the cancer (tumor staining) table.
df5 = pd.read_csv(
    filepath_or_buffer='Data/cancer.csv.zip',
    sep=',',
)

In [20]:
# Preview the first rows of the cancer table.
df5.head()

Unnamed: 0,Gene,Gene name,Tumor,Level,Count patients,Total patients
0,ENSG00000000003,TSPAN6,breast cancer,High,1,12
1,ENSG00000000003,TSPAN6,breast cancer,Medium,7,12
2,ENSG00000000003,TSPAN6,breast cancer,Low,2,12
3,ENSG00000000003,TSPAN6,breast cancer,Not detected,2,12
4,ENSG00000000003,TSPAN6,carcinoid,High,0,4


In [50]:
# Number of distinct gene names in the cancer table
# (dropna=False keeps a missing value counted, as unique() does).
df5['Gene name'].nunique(dropna=False)

15288

RNA isoform data
RNA levels in 56 cell lines and 37 tissues based on RNA-seq. The tab-separated file includes Ensembl gene identifier ("Gene"), Ensembl transcript identifier ("Transcript"), analysed sample ("Sample") and transcript per million ("TPM"). The data is based on The Human Protein Atlas version 16 and Ensembl version 83.38. 


Tissue

In [22]:
# Load the transcript-level RNA table for tissues (tab-separated).
df6 = pd.read_csv(
    filepath_or_buffer='Data/transcript_rna_tissue.tsv.zip',
    sep='\t',
)

In [23]:
# Preview the first rows of the tissue transcript table.
df6.head()

Unnamed: 0,ensgid,enstid,adipose tissue.V1,adipose tissue.V2,adipose tissue.V20,adipose tissue.V313,adipose tissue.V315,adrenal gland.V119,adrenal gland.V121,adrenal gland.V122,...,thyroid gland.V198,thyroid gland.V199,thyroid gland.V370,tonsil.V263,tonsil.V287,tonsil.V294,tonsil.V298,tonsil.V301,urinary bladder.V176,urinary bladder.V177
0,ENSG00000000003,ENST00000373020,27.3915,34.4958,39.608101,16.381399,29.3871,29.186001,21.6768,25.614201,...,64.079697,49.6077,72.531197,7.79577,9.39871,9.74487,14.307,10.708,20.936701,89.081703
1,ENSG00000000003,ENST00000494424,0.0,0.0,0.173174,0.0,0.0,1.3754,0.816387,0.0,...,0.0,0.0,0.216909,0.0,0.0,0.0,0.0,0.0,0.0,0.621756
2,ENSG00000000003,ENST00000496771,1.93696,1.81727,1.52333,0.240641,0.562465,2.00048,0.776128,0.928242,...,1.5436,1.14814,2.12113,0.0,0.0,0.270458,0.571232,0.170098,0.575539,1.35765
3,ENSG00000000003,ENST00000612152,1.60782,1.65983,2.63738,0.840478,0.985923,1.2597,1.07727,0.910853,...,2.60591,2.05323,2.69972,0.314084,0.267579,0.323288,0.434302,0.19339,0.910751,2.07107
4,ENSG00000000003,ENST00000614008,0.0,1.43542,0.0,0.834401,0.266975,0.0,0.0,0.0,...,0.452601,0.0,0.0,0.0,0.238278,0.0,0.051987,0.0,0.284086,1.47224


In [24]:
# (rows, columns) of the tissue transcript table.
df6.shape

(156921, 174)

RNA isoform data
RNA levels in 56 cell lines and 37 tissues based on RNA-seq. The tab-separated file includes Ensembl gene identifier ("Gene"), Ensembl transcript identifier ("Transcript"), analysed sample ("Sample") and transcript per million ("TPM"). The data is based on The Human Protein Atlas version 16 and Ensembl version 83.38. 


Cell lines

In [25]:
# Load the transcript-level RNA table for cell lines (tab-separated).
df7 = pd.read_csv(
    filepath_or_buffer='Data/transcript_rna_celline.tsv.zip',
    sep='\t',
)

In [26]:
# Preview the first rows of the cell-line transcript table.
df7.head()

Unnamed: 0,ensgid,enstid,A-431.C35,A-431.C36,A549.C1,A549.C2,AF22.C114,AF22.C115,AN3-CA.C53,AN3-CA.C54,...,U-698.C49,U-698.C50,U-87 MG.C91,U-87 MG.C92,U-937.C33,U-937.C34,U-937.C51,U-937.C52,WM-115.C93,WM-115.C94
0,ENSG00000000003,ENST00000373020,29.4515,25.063,38.923199,35.7467,109.218002,109.218002,47.717899,52.0602,...,0.401723,0.104767,17.078899,18.0744,0.0,0.067573,0.130372,0.0,15.3006,13.797
1,ENSG00000000003,ENST00000494424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ENSG00000000003,ENST00000496771,0.992257,1.33353,2.19534,3.99843,3.7014,3.7014,1.13685,2.17975,...,0.0,0.0,0.505533,1.04098,0.0,0.0,0.0,0.0,0.178061,0.613054
3,ENSG00000000003,ENST00000612152,0.399004,0.325001,0.224186,0.365967,2.38697,2.38697,1.05081,1.04846,...,0.039315,0.048414,0.472912,0.714656,0.073748,0.0,0.084929,0.068934,0.239081,0.376477
4,ENSG00000000003,ENST00000614008,0.238097,0.232515,0.0,0.0,0.797062,0.797062,1.25426,0.603664,...,0.084184,0.0,0.395,0.500561,0.0,0.0,0.0,0.0,0.747933,0.365231


In [27]:
# (rows, columns) of the cell-line transcript table.
df7.shape

(156921, 117)

Meta Data for the Tissue Samples

In [28]:
# Load the sample metadata sheet (tab-separated text file).
df8 = pd.read_csv(
    filepath_or_buffer='Data/metadata.txt',
    sep='\t',
)

In [29]:
# Preview the first rows of the sample metadata.
df8.head()

Unnamed: 0,Source Name,Comment[ENA_SAMPLE],Material Type,Characteristics[organism part],Characteristics[organism],Characteristics[sex],Characteristics[developmental stage],Protocol REF,Term Source REF,Protocol REF.1,...,Comment[technical replicate group],Comment[ENA_EXPERIMENT],Scan Name,Comment[SUBMITTED_FILE_NAME],Comment[ENA_RUN],Comment[FASTQ_URI],Comment[MD5],Comment[SPOT_LENGTH],Comment[READ_INDEX_1_BASE_COORD],Factor Value[organism part]
0,colon_8a,ERS526279,organism part,colon,Homo sapiens,female,adult,,,P-MTAB-40602,...,group 96,ERX537316,3_140328_AC3TDEACXX_P973_106_1.fastq.gz,3_140328_AC3TDEACXX_P973_106_1.fastq.gz,ERR579148,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR579/ERR5...,463d2a5e8a77d6d602717074d918a804,202,102,colon
1,colon_8a,ERS526279,organism part,colon,Homo sapiens,female,adult,,,P-MTAB-40602,...,group 96,ERX537316,3_140328_AC3TDEACXX_P973_106_2.fastq.gz,3_140328_AC3TDEACXX_P973_106_2.fastq.gz,ERR579148,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR579/ERR5...,4402c73454367ce79d801cae4d8a0194,202,102,colon
2,colon_8b,ERS526284,organism part,colon,Homo sapiens,male,adult,,,P-MTAB-40602,...,group 97,ERX537321,4_140328_AC3TDEACXX_P973_107_1.fastq.gz,4_140328_AC3TDEACXX_P973_107_1.fastq.gz,ERR579129,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR579/ERR5...,d321dba62b7ff9e04c38056be77e924f,202,102,colon
3,colon_8b,ERS526284,organism part,colon,Homo sapiens,male,adult,,,P-MTAB-40602,...,group 97,ERX537321,4_140328_AC3TDEACXX_P973_107_2.fastq.gz,4_140328_AC3TDEACXX_P973_107_2.fastq.gz,ERR579129,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR579/ERR5...,bf1f36c9002d189660761d2cba67c510,202,102,colon
4,endometrium_8a,ERS526292,organism part,endometrium,Homo sapiens,female,adult,,,P-MTAB-40602,...,group 98,ERX537329,2_140328_AC3TDEACXX_P973_104_1.fastq.gz,2_140328_AC3TDEACXX_P973_104_1.fastq.gz,ERR579123,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR579/ERR5...,dd4fc329c68803ee16b519b82eb79583,202,102,endometrium


In [30]:
# (rows, columns) of the sample metadata.
df8.shape

(400, 35)

In [31]:
# List every metadata column name (the preview above elides the middle ones).
df8.columns

Index(['Source Name', 'Comment[ENA_SAMPLE]', 'Material Type',
       'Characteristics[organism part]', 'Characteristics[organism]',
       'Characteristics[sex]', 'Characteristics[developmental stage]',
       'Protocol REF', 'Term Source REF', 'Protocol REF.1',
       'Term Source REF.1', 'Protocol REF.2', 'Term Source REF.2',
       'Extract Name', 'Comment[LIBRARY_SELECTION]', 'Comment[LIBRARY_SOURCE]',
       'Comment[LIBRARY_STRATEGY]', 'Comment[LIBRARY_LAYOUT]',
       'Comment[ORIENTATION]', 'Comment[NOMINAL_LENGTH]', 'Protocol REF.3',
       'Term Source REF.3', 'Performer', 'Assay Name', 'Technology Type',
       'Comment[technical replicate group]', 'Comment[ENA_EXPERIMENT]',
       'Scan Name', 'Comment[SUBMITTED_FILE_NAME]', 'Comment[ENA_RUN]',
       'Comment[FASTQ_URI]', 'Comment[MD5]', 'Comment[SPOT_LENGTH]',
       'Comment[READ_INDEX_1_BASE_COORD]', 'Factor Value[organism part]'],
      dtype='object')

RNA isoform data
RNA levels in 32 tissues based on RNA-seq. The tab-separated file includes Ensembl gene identifier ("Gene"), Ensembl transcript identifier ("Transcript"), analysed sample ("Sample") and fragments per kilobase of transcript per million fragments mapped ("FPKM"). The data is based on The Human Protein Atlas version 13 and Ensembl version 75.37.

In [34]:
# Load the older transcript-level RNA table (tab-separated).
# NOTE(review): the recorded run of this cell raised FileNotFoundError, so the
# archive was not present under Data/ and df9 was never created. Add the file
# or remove this section before re-running the notebook top to bottom.
df9= pd.read_csv('Data/rna_transcript.tsv.zip', sep='\t')

FileNotFoundError: [Errno 2] No such file or directory: 'Data/rna_transcript.tsv.zip'

In [None]:
# Preview the first rows of df9.
# NOTE(review): no execution count or output is recorded for this cell; it
# needs df9 from the load cell, whose recorded run failed.
df9.head()

In [None]:
# (rows, columns) of df9.
# NOTE(review): no execution count or output is recorded for this cell; it
# needs df9 from the load cell, whose recorded run failed.
df9.shape

CMA expression data.
Expression profiles for proteins based on immunohistochemistry using cell micro arrays. The comma-separated file includes antibody identifier (hpa_id), Ensembl gene identifier (ensembl_gene_id), name of cell line (celline) and the summary expression value in that cell line (summary_expression_value). The data is based on The Human Protein Atlas version 4.1 and Ensembl version 54.36.

In [36]:
# Load the CMA (cell micro array) expression table.
df10 = pd.read_csv(
    filepath_or_buffer='Data/expression_data.CMA.csv.zip',
    sep=',',
)

In [37]:
# Preview the first rows of the CMA expression table.
df10.head()

Unnamed: 0,hpa_id,ensembl_gene_id,celline,summary_expression_value
0,CAB000001,ENSG00000169083,A-431,negative
1,CAB000001,ENSG00000169083,A-549,negative
2,CAB000001,ENSG00000169083,AN3-CA,negative
3,CAB000001,ENSG00000169083,BEWO,negative
4,CAB000001,ENSG00000169083,CACO-2,negative


In [38]:
# (rows, columns) of the CMA expression table.
df10.shape

(240705, 4)

IF expression data.
Expression profiles for proteins based on immunofluorescently stained cells. The comma-separated file includes antibody identifier (hpa_id), Ensembl gene identifier (ensembl_gene_id), name of cell line (celline) and the summary expression value in that cell line (summary_expression_value). The data is based on The Human Protein Atlas version 4.1 and Ensembl version 54.36.

In [39]:
# Load the IF (immunofluorescence) expression table.
df11 = pd.read_csv(
    filepath_or_buffer='Data/expression_data.IF.csv.zip',
    sep=',',
)

In [40]:
# Preview the first rows of the IF expression table.
df11.head()

Unnamed: 0,hpa_id,ensembl_gene_id,celline,summary_expression_value
0,HPA000164,ENSG00000174740,A-431,moderate
1,HPA000164,ENSG00000174740,U-2 OS,not representative
2,HPA000164,ENSG00000174740,U-251MG,not representative
3,HPA000165,ENSG00000174740,A-431,moderate
4,HPA000165,ENSG00000174740,U-2 OS,moderate


In [41]:
# (rows, columns) of the IF expression table.
df11.shape

(6750, 4)

TMA expression data.
Expression profiles for proteins based on immunohistochemistry using tissue micro arrays. The comma-separated file includes antibody identifier (hpa_id), Ensembl gene identifier (ensembl_gene_id), tissue name, intensity, fraction and the summary expression value. The data is based on The Human Protein Atlas version 4.1 and Ensembl version 54.36.

In [42]:
# Load the TMA (tissue micro array) expression table.
df12 = pd.read_csv(
    filepath_or_buffer='Data/expression_data.TMA.csv.zip',
    sep=',',
)

In [43]:
# Preview the first rows of the TMA expression table.
df12.head()

Unnamed: 0,hpa_id,ensembl_gene_id,tissue,intensity,fraction,summary_expression_value
0,CAB000001,ENSG00000169083,Adrenal,weak,<25%,negative
1,CAB000001,ENSG00000169083,"Appendix, glandular cells",negative,,negative
2,CAB000001,ENSG00000169083,"Appendix, lymphocytes",negative,,negative
3,CAB000001,ENSG00000169083,"Bladder, urothelial cells",weak,<25%,negative
4,CAB000001,ENSG00000169083,Bone marrow,negative,,negative


In [44]:
# (rows, columns) of the TMA expression table.
df12.shape

(385710, 6)

Antibody Response Data.
Results from 12634 immunizations using different Protein Epitope Signature Tags (PrEST) as antigens to generate monospecific antibodies. The result is given as amount (mg) specific antibody obtained after immunization and affinity purification using the same specific protein fragment (PrEST) as antigen and purification partner. The comma-separated file includes: antigen id, antibody id, antigen sequence and antibody amount (mg).

In [45]:
# Load the antibody-response table.
df13 = pd.read_csv(
    filepath_or_buffer='Data/antibodyresponse.csv.zip',
    sep=',',
)

In [46]:
# Preview the first rows of the antibody-response table.
df13.head()

Unnamed: 0,antigen_id,antibody_id,aa_seq,amount
0,2250374,2250260,KSPSPVQGKKSPRLLCIEKVTTDKDPKEEKEEEDDSALPQEVSIAA...,0.0105
1,670140,670067,ANLLKTVVTGCSCPLLSNLGSCKGLRVKKDFLRTFYTHQELWCKAP...,0.011
2,2180028,2180132,TTIFLKFDGEPCDLSLNITWYLKSADCYNEIYNFKAEEVELYLEKL...,0.011083
3,230454,230063,PAESQSGMALKVAATVLQPLCLGESPVVMPIHMQVEGSSAPELNPN...,0.0135
4,2440684,2440477,SISYSCSAVPHQGRGFSQYGVSGSPTKSKVTSCPGCRKPLPRCALC...,0.016917


In [47]:
# (rows, columns) of the antibody-response table.
df13.shape

(12634, 4)