# The Human Protein Atlas (THPA) Cell Line (RNA-seq)

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Downloaded: 04-2017 <br/>
Data Source: http://www.proteinatlas.org/about/download

In [1]:
import datetime
import importlib
import os
import sys

import numpy as np
import pandas as pd

import untility_functions as uf
%matplotlib inline

In [2]:
# Re-import the helper module so edits made to untility_functions.py are picked
# up without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/The_Human_Protaein_Atlas/untility_functions.py'>

# Load Data

In [3]:
# Load the cell-line RNA-seq table. Comma is read_csv's default separator and
# the zip compression is inferred from the file extension.
df = pd.read_csv('Data/rna_celline.csv.zip')

In [4]:
df.head()

Unnamed: 0,Gene,Gene name,Sample,Value,Unit
0,ENSG00000000003,TSPAN6,A-431,27.8,TPM
1,ENSG00000000003,TSPAN6,A549,37.6,TPM
2,ENSG00000000003,TSPAN6,AF22,108.2,TPM
3,ENSG00000000003,TSPAN6,AN3-CA,51.8,TPM
4,ENSG00000000003,TSPAN6,ASC TERT1,17.8,TPM


In [5]:
df.shape

(1099168, 5)

# Select Only Relevant Data

In [6]:
# Keep only the gene symbol, sample, and expression value columns.
df = df.loc[:, ['Gene name', 'Sample', 'Value']]

In [7]:
df.head()

Unnamed: 0,Gene name,Sample,Value
0,TSPAN6,A-431,27.8
1,TSPAN6,A549,37.6
2,TSPAN6,AF22,108.2
3,TSPAN6,AN3-CA,51.8
4,TSPAN6,ASC TERT1,17.8


# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [8]:
# Index the table by gene symbol before handing it to the symbol-mapping helper.
df = df.set_index('Gene name')

In [9]:
# Map the gene symbols in df's index to up-to-date approved symbols. The return
# value is not assigned and df has fewer rows in the next cell, so the helper
# appears to modify df in place (presumably dropping symbols it cannot map —
# TODO confirm in untility_functions).
uf.mapgenesymbols(df)

Progeres: 100%  1099168 Out of 1099168   

In [10]:
df.shape

(1069488, 2)

# Merge Duplicates

In [11]:
# Move 'Gene name' out of the index and back into a column so it can serve as a
# grouping key.
df = df.reset_index()

In [12]:
# Collapse rows that share the same (Sample, Gene name) pair into one row
# holding the mean of their values.
df = df.groupby(by=['Sample', 'Gene name']).agg('mean')

In [13]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Value
Sample,Gene name,Unnamed: 2_level_1
A-431,A1BG,0.0
A-431,A1CF,0.0
A-431,A2M,0.3
A-431,A2ML1,0.0
A-431,A3GALT2,0.0


In [14]:
df.shape

(1063328, 1)

# Create Matrix

In [16]:
# Flatten the (Sample, Gene name) MultiIndex left by the groupby back into
# ordinary columns.
df = df.reset_index()

In [17]:
df.head()

Unnamed: 0,Sample,Gene name,Value
0,A-431,A1BG,0.0
1,A-431,A1CF,0.0
2,A-431,A2M,0.3
3,A-431,A2ML1,0.0
4,A-431,A3GALT2,0.0


In [18]:
# Build a gene x sample expression matrix from the long-format table.
# Row and column order follow first appearance in df.
genes = df['Gene name'].unique().tolist()
tissues = df['Sample'].unique().tolist()

# Index a copy by sample instead of mutating df in place, so this cell can be
# re-run without a KeyError on an already-consumed 'Sample' column. df itself
# is left unchanged.
by_sample = df.set_index('Sample')

# Start from zeros: a gene with no row for a given sample keeps the value 0.
matrix = pd.DataFrame(0.0, index=genes, columns=tissues)

n_samples = len(matrix.columns)

for i, col in enumerate(matrix.columns):

    progress = ((i+1)/n_samples)*100

    sys.stdout.write("Progress: %d%%   \r" % (progress))
    sys.stdout.flush()

    # .ix was removed in pandas 1.0, so use label-based .loc. Selecting with a
    # list ([col]) always yields a DataFrame, even when a sample has only a
    # single row.
    sample_rows = by_sample.loc[[col]]
    matrix.loc[sample_rows['Gene name'].values, col] = sample_rows['Value'].values

Progress: 100%   

In [19]:
matrix.head()

Unnamed: 0,A-431,A549,AF22,AN3-CA,ASC TERT1,BEWO,BJ,BJ hTERT+,BJ hTERT+ SV40 Large T+,BJ hTERT+ SV40 Large T+ RasG12V,...,U-2 OS,U-2197,U-251 MG,U-266/70,U-266/84,U-698,U-87 MG,U-937,WM-115,hTCEpi
A1BG,0.0,0.3,0.0,0.5,0.7,0.3,0.8,0.1,0.5,0.1,...,0.1,0.0,0.0,1.7,0.8,0.5,0.0,0.4,0.0,0.3
A1CF,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.1
A2M,0.3,0.0,726.7,0.0,10.8,2.0,75.7,19.9,0.1,0.0,...,6.5,1.3,0.3,0.5,0.2,0.5,0.2,0.3,14.5,0.0
A2ML1,0.0,0.0,0.1,0.1,0.0,2.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.2,0.0,3.9
A3GALT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0


In [20]:
matrix.shape

(18988, 56)

## Drop Genes That Have Zero Expression In More Than 95% Of The Samples

In [21]:
# Temporarily mark zeros as missing so that the following dropna can count the
# non-zero entries in each row.
matrix = matrix.replace(0, np.nan)

In [22]:
# Keep only rows with a non-missing value in at least 5% of the columns.
# dropna's `thresh` is documented as an integer count of required non-NA
# values, so round the fractional threshold up explicitly instead of passing a
# float (the set of rows kept is unchanged).
min_present = int(np.ceil(0.05 * matrix.shape[1]))
matrix.dropna(thresh=min_present, axis=0, inplace=True)

In [23]:
# Turn the missing-value markers back into zeros.
matrix = matrix.fillna(0)

In [24]:
matrix.shape

(17661, 56)

## Save Unfiltered Sample Matrix To File

In [25]:
# Tag the output file with the current year and month (YYYY_MM).
# NOTE(review): the file name ends in .zip but the file is written with gzip
# compression — confirm which format downstream readers expect.
filename = '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_rns-seq_celline_matrix_unfilltered_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
matrix.to_csv(filename, sep='\t', compression='gzip')

# Filtered Matrix

## Normalize Matrix (Quantile Normalize the matrix for the columns)

In [27]:
# Normalize a copy so that `matrix` keeps the un-normalized values.
normalized_matrix = matrix.copy()

In [28]:
# Quantile-normalize the columns (per the section heading). The helper's return
# value replaces the working copy; whether it also mutates its argument is not
# visible here.
normalized_matrix = uf.quantileNormalize(normalized_matrix)

Step 2/2 progress: 100%  56 Out of 56   

In [29]:
normalized_matrix.head()

Unnamed: 0,A-431,A549,AF22,AN3-CA,ASC TERT1,BEWO,BJ,BJ hTERT+,BJ hTERT+ SV40 Large T+,BJ hTERT+ SV40 Large T+ RasG12V,...,U-2 OS,U-2197,U-251 MG,U-266/70,U-266/84,U-698,U-87 MG,U-937,WM-115,hTCEpi
A1BG,0.0,0.191071,0.0,0.678571,0.830357,0.166071,0.682143,0.110714,0.451786,0.069643,...,0.001786,0.0,0.0,1.926786,1.385714,1.676786,0.0,0.726786,0.0,0.325
A1CF,0.0,0.191071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.025,0.0,0.0,0.092857
A2M,0.366071,0.0,739.746429,0.0,12.926786,1.601786,72.577679,19.2375,0.075893,0.0,...,4.555357,1.326786,0.369643,0.504464,0.307143,1.676786,0.103571,0.5375,12.247321,0.0
A2ML1,0.0,0.0,0.017857,0.099107,0.0,1.9375,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.614286,0.0,0.332143,0.0,4.5125
A3GALT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069643,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.108929,0.0


## Normalize Matrix (z-score the rows)

In [30]:
# Z-score each row of the matrix. The result is not assigned, yet the next
# cell shows changed values, so the helper appears to operate on
# normalized_matrix in place.
uf.zscore(normalized_matrix, 'row')

Progress: 100%  17661 Out of 17661   

In [31]:
normalized_matrix.head()

Unnamed: 0,A-431,A549,AF22,AN3-CA,ASC TERT1,BEWO,BJ,BJ hTERT+,BJ hTERT+ SV40 Large T+,BJ hTERT+ SV40 Large T+ RasG12V,...,U-2 OS,U-2197,U-251 MG,U-266/70,U-266/84,U-698,U-87 MG,U-937,WM-115,hTCEpi
A1BG,-0.7294,-0.463696,-0.7294,0.214222,0.425295,-0.498461,0.219188,-0.575441,-0.101147,-0.632555,...,-0.726917,-0.7294,-0.7294,1.949988,1.197574,1.602338,-0.7294,0.281268,-0.7294,-0.277455
A1CF,-0.180744,-0.153027,-0.180744,-0.180744,-0.180744,-0.180744,-0.180744,-0.180744,-0.180744,-0.180744,...,-0.180744,-0.180744,-0.180744,-0.180744,-0.180744,-0.180744,-0.177117,-0.180744,-0.180744,-0.167274
A2M,-0.228259,-0.23147,6.255525,-0.23147,-0.118112,-0.217423,0.40498,-0.062772,-0.230804,-0.23147,...,-0.191523,-0.219835,-0.228228,-0.227046,-0.228776,-0.216766,-0.230561,-0.226756,-0.12407,-0.23147
A2ML1,-0.322272,-0.322272,-0.313578,-0.274025,-0.322272,0.620924,-0.322272,-0.322272,-0.322272,-0.322272,...,-0.322272,-0.322272,-0.322272,-0.322272,-0.322272,-0.023231,-0.322272,-0.160581,-0.322272,1.874462
A3GALT2,-0.210602,-0.210602,-0.210602,-0.210602,-0.210602,-0.210602,-0.210602,-0.210602,-0.210602,0.246155,...,-0.210602,-0.210602,-0.210602,-0.210602,-0.210602,-0.210602,-0.210602,-0.210602,0.503813,-0.210602


## Save Filtered Sample Matrix To File

In [32]:
# Tag the output file with the current year and month (YYYY_MM).
# NOTE(review): the file name ends in .zip but the file is written with gzip
# compression — confirm which format downstream readers expect.
filename = '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_rns-seq_celline_matrix_filltered_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
normalized_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Tertiary Matrix

In [33]:
# Derive a matrix of discrete calls from the z-scored matrix; the preview in
# the next cell shows only -1.0, 0.0 and 1.0. The rule that assigns those
# values lives in untility_functions and is not visible here.
tertiary_matrix = uf.createTertiaryMarix(normalized_matrix)

Progeres: 100%  56 Out of 56   

In [34]:
tertiary_matrix.head()

Unnamed: 0,A-431,A549,AF22,AN3-CA,ASC TERT1,BEWO,BJ,BJ hTERT+,BJ hTERT+ SV40 Large T+,BJ hTERT+ SV40 Large T+ RasG12V,...,U-2 OS,U-2197,U-251 MG,U-266/70,U-266/84,U-698,U-87 MG,U-937,WM-115,hTCEpi
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
A3GALT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save Tertiary Matrix

In [35]:
# Tag the output file with the current year and month (YYYY_MM).
# NOTE(review): the file name ends in .zip but the file is written with gzip
# compression — confirm which format downstream readers expect.
filename = '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_rns-seq_celline_tertiary_matrix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
tertiary_matrix.to_csv(filename, sep='\t', compression='gzip')

#### Path to output files

In [38]:
# Output directory handed to the library-writing helpers. Resolved from the
# current user's home directory (the same location the `~`-based file names in
# this notebook point to) instead of a hard-coded /Users/<name> path, and
# expanded here because the helpers may not expand `~` themselves.
path = os.path.expanduser('~/Documents/Harmonizome/The_Human_Protaein_Atlas/Output/')

# Create Up Gene Set Library

In [39]:
name = 'thpa_rns-seq_celline_gene_up_set'

In [40]:
# Build the "up" gene-set library from tertiary_matrix. Presumably written
# under `path` with `name` as the file stem, one set per column (the progress
# output counts 56, the number of columns) — TODO confirm in untility_functions.
uf.createUpGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  56 Out of 56   

# Create Down Gene Set Library

In [41]:
name = 'thpa_rns-seq_celline_gene_down_set'

In [42]:
# Build the "down" gene-set library from tertiary_matrix. Presumably written
# under `path` with `name` as the file stem, one set per column (the progress
# output counts 56, the number of columns) — TODO confirm in untility_functions.
uf.createDownGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  56 Out of 56   

# Create Up Attribute Library

In [43]:
name = 'thpa_rns-seq_celline_attribute_up_set'

In [44]:
# Build the "up" attribute-set library from tertiary_matrix. Presumably written
# under `path` with `name` as the file stem, one set per row (the progress
# output counts 17661, the number of rows) — TODO confirm in untility_functions.
uf.createUpAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  17661 Out of 17661   

# Create Down Attribute Library

In [45]:
name = 'thpa_rns-seq_celline_attribute_down_set'

In [46]:
# Build the "down" attribute-set library from tertiary_matrix. Presumably
# written under `path` with `name` as the file stem, one set per row (the
# progress output counts 17661, the number of rows) — TODO confirm in
# untility_functions.
uf.createDownAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  17661 Out of 17661   

# Create Gene Similarity Matrix

In [47]:
# Row-by-row similarity of the z-scored matrix with the 'cosine' metric; the
# preview in the next cell is labelled by gene symbol on both axes.
gene_similarity_matix = uf.createSimilarityMatrix(normalized_matrix, 'cosine')

In [48]:
gene_similarity_matix.head()

Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
A1BG,1.0,0.230514,0.047891,-0.094039,-0.123331,-0.124704,-0.046527,0.269843,-0.161666,-0.070901,...,0.011945,-0.001775,0.262509,-0.020377,0.088469,-0.053963,0.001215,-0.365767,-0.024233,0.082767
A1CF,0.230514,1.0,0.415067,-0.050012,-0.038029,-0.090252,-0.062236,0.04874,0.173641,0.003305,...,-0.141371,0.087757,-0.218915,-0.016469,-0.098952,-0.128934,-0.194005,-0.044486,-0.127149,-0.198688
A2M,0.047891,0.415067,1.0,-0.068652,-0.039276,-0.138287,-0.009846,0.111951,-0.105327,-0.061735,...,0.025845,-0.064216,-0.207419,-0.067706,-0.093036,-0.148546,-0.050817,-0.093691,-0.182721,-0.014994
A2ML1,-0.094039,-0.050012,-0.068652,1.0,-0.069105,-0.146525,-0.11001,-0.06227,0.027381,-0.071883,...,-0.17256,0.183031,-0.046284,0.193202,0.166475,-0.136579,0.004915,0.071534,0.139083,-0.118003
A3GALT2,-0.123331,-0.038029,-0.039276,-0.069105,1.0,0.297305,-0.059251,-0.206715,-0.0055,-0.053044,...,0.051296,0.033872,-0.016007,0.031776,-0.037851,0.513852,0.203847,-0.004204,0.188142,-0.081822


## Save Gene Similarity Matrix 

In [49]:
# Tag the output file with the current year and month (YYYY_MM).
# NOTE(review): the file name ends in .zip but the file is written with gzip
# compression — confirm which format downstream readers expect.
filename = '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_rns-seq_celline_gene_similarity_matix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [50]:
# Same similarity helper applied to the transposed matrix, so samples are
# compared with each other; the preview in the next cell is labelled by sample
# on both axes.
attribute_similarity_matix = uf.createSimilarityMatrix(normalized_matrix.T, 'cosine')

In [51]:
attribute_similarity_matix.head()

Unnamed: 0,A-431,A549,AF22,AN3-CA,ASC TERT1,BEWO,BJ,BJ hTERT+,BJ hTERT+ SV40 Large T+,BJ hTERT+ SV40 Large T+ RasG12V,...,U-2 OS,U-2197,U-251 MG,U-266/70,U-266/84,U-698,U-87 MG,U-937,WM-115,hTCEpi
A-431,1.0,0.0744,-0.120655,-0.098246,-0.086512,0.0103,0.060564,-0.013224,0.031058,0.032305,...,0.127012,0.031239,0.131999,-0.074573,-0.099898,-0.091456,0.045699,-0.066281,-0.005947,0.036166
A549,0.0744,1.0,-0.093416,-0.006041,-0.086017,-0.020767,-0.034442,-0.031651,0.014393,-0.054995,...,0.080486,-0.004982,0.030098,-0.09392,-0.083534,-0.172662,0.058314,-0.138416,0.00458,-0.02878
AF22,-0.120655,-0.093416,1.0,0.128882,0.035739,-0.020388,-0.010517,-0.006526,0.026108,0.055879,...,0.056144,-0.087218,-0.026634,-0.114501,-0.09485,-0.009274,-0.074473,-0.115005,0.058365,-0.088594
AN3-CA,-0.098246,-0.006041,0.128882,1.0,-0.010268,-0.030105,-0.082899,-0.041662,-0.051659,-0.021667,...,-0.014404,-0.028329,-0.042906,-0.047654,-0.023439,-0.039035,-0.047286,-0.02398,0.001132,-0.034882
ASC TERT1,-0.086512,-0.086017,0.035739,-0.010268,1.0,-0.084819,0.266065,0.434378,0.060453,0.179448,...,-0.10361,0.145279,-0.113159,-0.063952,-0.123624,-0.196939,0.240947,-0.121041,0.053257,0.196099


## Save Attribute Similarity Matrix

In [52]:
# Tag the output file with the current year and month (YYYY_MM).
# NOTE(review): the file name ends in .zip but the file is written with gzip
# compression — confirm which format downstream readers expect.
filename = '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_rns-seq_celline_attribute_similarity_matix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [53]:
# Build the gene table for the matrix rows; the preview in the next cell shows
# GeneSym and GeneID columns. How the IDs are looked up is internal to
# untility_functions and not visible here.
gene_list = uf.createGeneList(normalized_matrix)

Progeres: 100%  17661 Out of 17661   

In [54]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,A1BG,1
1,A1CF,29974
2,A2M,2
3,A2ML1,144568
4,A3GALT2,127550


In [55]:
gene_list.shape

(17661, 2)

### Save Gene List

In [56]:
# Tag the output file with the current year and month (YYYY_MM).
# NOTE(review): the file name ends in .zip but the file is written with gzip
# compression — confirm which format downstream readers expect.
filename = '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_rns-seq_celline_gene_list_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [57]:
# Build the attribute table for the matrix columns; the preview in the next
# cell shows a single 'Attributes' column listing the sample names.
attribute_list = uf.createAttributeList(normalized_matrix)

In [58]:
attribute_list.head()

Unnamed: 0,Attributes
0,A-431
1,A549
2,AF22
3,AN3-CA
4,ASC TERT1


In [59]:
attribute_list.shape

(56, 1)

### Save Attribute List

In [60]:
# Tag the output file with the current year and month (YYYY_MM).
# NOTE(review): the file name ends in .zip but the file is written with gzip
# compression — confirm which format downstream readers expect.
filename = '~/./Documents/Harmonizome/The_Human_Protaein_Atlas/Output/thpa_rns-seq_celline_attribute_list_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [61]:
# Output directory handed to the edge-list helper. Resolved from the current
# user's home directory (the same location the `~`-based file names in this
# notebook point to) instead of a hard-coded /Users/<name> path, and expanded
# here because the helper may not expand `~` itself.
path = os.path.expanduser('~/Documents/Harmonizome/The_Human_Protaein_Atlas/Output/')

In [62]:
name = 'thpa_rns-seq_celline_gene_attribute_edge_list'

In [63]:
# Build the gene-attribute edge list from tertiary_matrix and gene_list. The
# helper prints the number of associations it found; presumably it writes the
# list under `path` with `name` as the file stem — TODO confirm in
# untility_functions.
uf.createGeneAttributeEdgeList(tertiary_matrix, gene_list, path, name)

Progeres: 100%  56 Out of 56   

 The number of statisticaly relevent gene-attribute associations is: 197848
