In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.spatial.distance as dist
import seaborn as sns

In [2]:
%matplotlib inline

In [3]:
cwd = os.getcwd()

meta_data_file_samples = r'GTEx_Data_V6_Annotations_SampleAttributesDS.txt'
meta_data_file_subjects = r'GTEx_Data_V6_Annotations_SubjectPhenotypesDS.txt'
sybol_mapping_file = r'Homo_sapiens.gene_info.gene_info'
data_file = r'GTEx_Analysis_v6p_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct'

### Creat Sample Meta Data Table

In [4]:
meta_df_SA = pd.read_csv(os.path.join(cwd, meta_data_file_samples), sep='\t', index_col=0)

In [5]:
meta_df_SP = pd.read_csv(os.path.join(cwd, meta_data_file_subjects), sep='\t')

In [6]:
# meta_df_SP.GENDER[meta_df_SP.GENDER == 1] = 'male' 
# meta_df_SP.GENDER[meta_df_SP.GENDER == 2] = 'female' 
meta_df_SP.GENDER = meta_df_SP.GENDER.where(meta_df_SP.GENDER == 1, other='female')
meta_df_SP.GENDER = meta_df_SP.GENDER.where(meta_df_SP.GENDER == 'female', other='male')

In [7]:
meta_df = pd.DataFrame(columns=['TISSUE', 'SUBJID'], index=meta_df_SA.index)
meta_df.TISSUE = meta_df_SA.SMTS
meta_df.SUBJID = [item[0] for item in meta_df.index.str.rsplit('-', 3)]

In [8]:
index = meta_df.index
meta_df = pd.merge(meta_df, meta_df_SP, how='left')
meta_df.index = index

In [9]:
meta_df.dropna(inplace=True)
meta_df.drop(['DTHHRDY','SUBJID'], axis=1, inplace=True)
meta_df.columns.name = 'ATTRIBUTES'
meta_df.index.name = 'TISSUE SAMPLE'

### Get Data

In [10]:
data = pd.read_csv(os.path.join(cwd, data_file), sep='\t' ,skiprows=2)

### Creat Gene Meta Data Table

In [11]:
sm = pd.read_csv(os.path.join(cwd, sybol_mapping_file), sep='\t')

In [12]:
sm.drop_duplicates(subset='Symbol', inplace=True)
sm = sm[['Symbol', 'GeneID']]

In [13]:
gene_df = data[['Description', 'Name']].copy()
gene_df.columns = [['Symbol', 'Ensemble Acc']]

In [14]:
gene_df = pd.merge(gene_df, sm, how='left')

In [15]:
gene_df.shape

(56238, 3)

In [16]:
gene_df.dropna(inplace=True)

In [17]:
gene_df.shape

(32219, 3)

### Creat Gene Attribute Matirx

In [18]:
data_matrix = data.copy()

In [19]:
data_matrix.shape

(56238, 8557)

In [20]:
data_matrix.rename(columns={'Description':'Symbol', 'Name': 'Ensembel Acc'}, inplace=True)

In [21]:
#drop genes for which there is no info
data_matrix = pd.merge(gene_df, data_matrix, how='left', on='Symbol')

In [22]:
data_matrix.shape

(38447, 8559)

In [23]:
#drop samples for which there is no meta data
lst = data_matrix.columns.intersection(meta_df.index)
data_matrix = data_matrix[lst]

In [24]:
data_matrix.shape

(38447, 7425)

In [25]:
data_matrix.insert(0, 'Symbol', gene_df['Symbol'])

In [26]:
data_matrix.dropna(inplace=True)

In [27]:
data_matrix.shape

(21752, 7426)

In [28]:
data_matrix.replace(to_replace=0.000000, value=np.nan, inplace=True)

In [29]:
data_matrix.dropna(axis=1, thresh=(0.05*data_matrix.index.size), inplace=True)#drop any missing at least 5% on columns
data_matrix.dropna(axis=0, thresh=(0.05*data_matrix.columns.size), inplace=True)#drop any missing at least 5% on rows

In [30]:
data_matrix.shape

(14823, 7426)

In [31]:
data_matrix.set_index('Symbol', inplace=True)

In [32]:
# impute remaining missing values
data_matrix.fillna(data_matrix.median(numeric_only=True), inplace=True);

In [33]:
# Log transform 
data_matrix = np.log2(data_matrix)

In [34]:
# functin for quantile normalization
def quantileNormalize(df_input):
    df = df_input.copy()
    #compute rank
    dic = {}
    for col in df:
        dic.update({col : sorted(df[col])})
    sorted_df = pd.DataFrame(dic)
    rank = sorted_df.mean(axis = 1).tolist()
    #sort
    for col in df:
        t = np.searchsorted(np.sort(df[col]), df[col])
        df[col] = [rank[i] for i in t]
    return df

In [35]:
data_matrix = quantileNormalize(data_matrix)

In [36]:
data_matrix = data_matrix.groupby(level=0).sum()

In [37]:
# z-score standardize the data
data_matrix = data_matrix.apply(lambda x: (x-x.mean())/x.std(ddof=0), axis=1)

In [38]:
data_matrix.shape

(14659, 7425)

### Creat Tertiary Matrix

In [39]:
vals = abs(data_matrix.values.flatten())
vals = np.sort(vals)

In [40]:
pos = abs(data_matrix) > vals[-int(0.1*data_matrix.values.size):][0] 
up = data_matrix > 0
down = data_matrix < 0

In [41]:
terup = pos & up
terup = terup.applymap(lambda x: 1 if x else np.nan)
terdown = pos & down
terdown = terdown.applymap(lambda x: -1 if x else np.nan)
terup.fillna(0, inplace=True)
terdown.fillna(0, inplace=True)

In [42]:
tertiary_df = terup + terdown

In [43]:
tertiary_df.replace(0.0, np.nan, inplace=True)
tertiary_df.dropna(axis=1, how='all', inplace=True)
tertiary_df.dropna(axis=0, how='all', inplace=True)
tertiary_df.replace(np.nan, 0, inplace=True)

### Clean Tables

In [44]:
data_matrix = data_matrix[tertiary_df.columns]
data_matrix =  data_matrix.ix[tertiary_df.index]

In [45]:
meta_df = meta_df.ix[data_matrix.columns]

In [46]:
gene_df.drop_duplicates(subset='Symbol', inplace=True)
gene_df.set_index('Symbol', inplace=True)
gene_df = gene_df.ix[data_matrix.index]

### Calculate The Cosine Attribute Matrix

In [47]:
# Check the cosine distance between the samples (attributes)
attribute_similarity_matrix = dist.pdist(data_matrix.T, 'cosine')
attribute_similarity_matrix = dist.squareform(attribute_similarity_matrix)
attribute_similarity_df = pd.DataFrame(data=attribute_similarity_matrix[0:,0:], index=data_matrix.columns, columns=data_matrix.columns)
attribute_similarity_df = attribute_similarity_df.applymap(lambda x: 1-x)

### Calculate The Cosine Gene Matrix

In [48]:
# Check the cosine distance between the genes
gene_similarity_matrix = dist.pdist(data_matrix, 'cosine')
gene_similarity_matrix = dist.squareform(gene_similarity_matrix)
gene_similarity_df = pd.DataFrame(data=gene_similarity_matrix[0:,0:], index=data_matrix.index, columns=data_matrix.index)
gene_similarity_df = gene_similarity_df.applymap(lambda x: 1-x)

### Add Relevent Data To Table

In [49]:
data_matrix.insert(0, 'Ensemble Acc', gene_df['Ensemble Acc'])
data_matrix.insert(1, 'GeneID', gene_df['GeneID'])

In [50]:
tertiary_df.insert(0, 'Ensemble Acc', gene_df['Ensemble Acc'])
tertiary_df.insert(1, 'GeneID', gene_df['GeneID'])

In [51]:
attribute_similarity_df.insert(0, 'Ensemble Acc', gene_df['Ensemble Acc'])
attribute_similarity_df.insert(1, 'GeneID', gene_df['GeneID'])

In [52]:
gene_similarity_df.insert(0, 'Ensemble Acc', gene_df['Ensemble Acc'])
gene_similarity_df.insert(1, 'GeneID', gene_df['GeneID'])

### Creat Attribute Set Libraries

In [53]:
attribute_up_lib = pd.DataFrame()

up1 = pos & up

i=1

for index in up1.index:
    lst = up1.ix[index].values
    
    temp_df = pd.DataFrame(index=[index], columns=range(0, len(up1.columns[lst].values)))
    temp_df.ix[index] = up1.columns[lst].values
    attribute_up_lib = attribute_up_lib.append(temp_df)
    
    print('\r'+str(i/up1.index.size*100), end='', flush=True)
    i+=1
    

attribute_up_lib.sort_index(inplace=True)

attribute_up_lib.dropna(axis=1, how='all')
attribute_up_lib = attribute_up_lib.replace(np.nan, r' ', regex=True)

100.0

In [54]:
attribute_down_lib = pd.DataFrame()

down1 = pos & down

i=1

for index in down1.index:
    lst = down1.ix[index].values
    
    temp_df = pd.DataFrame(index=[index], columns=range(0, len(down1.columns[lst].values)))
    temp_df.ix[index] = down1.columns[lst].values
    attribute_down_lib = attribute_down_lib.append(temp_df)
    
    print('\r'+str(i/down1.index.size*100), end='', flush=True)
    i+=1
    

attribute_down_lib.sort_index(inplace=True)

attribute_down_lib.dropna(axis=1, how='all')
attribute_down_lib = attribute_down_lib.replace(np.nan, r' ', regex=True)

100.0

### Creat Gene Set Libraries

In [55]:
gene_up_lib = pd.DataFrame()

up2 = pos & up

i=1

for index in up2.T.index:
    lst = up2.T.ix[index].values
    
    temp_df = pd.DataFrame(index=[index], columns=range(0, len(up2.T.columns[lst].values)))
    temp_df.ix[index] = up2.T.columns[lst].values
    gene_up_lib = gene_up_lib.append(temp_df)
    
    print('\r'+str(i/up2.T.index.size*100), end='', flush=True)
    i+=1
    

gene_up_lib.sort_index(inplace=True)

gene_up_lib.dropna(axis=1, how='all')
gene_up_lib = gene_up_lib.replace(np.nan, r' ', regex=True)

100.0

In [56]:
gene_down_lib = pd.DataFrame()

down2 = pos & down

i=1

for index in down2.T.index:
    lst = down2.T.ix[index].values
    
    temp_df = pd.DataFrame(index=[index], columns=range(0, len(down2.T.columns[lst].values)))
    temp_df.ix[index] = down2.T.columns[lst].values
    gene_up_lib = gene_up_lib.append(temp_df)
    
    print('\r'+str(i/down2.T.index.size*100), end='', flush=True)
    i+=1
    

gene_down_lib.sort_index(inplace=True)

gene_down_lib.dropna(axis=1, how='all')
gene_down_lib = gene_up_lib.replace(np.nan, r' ', regex=True)

100.0

### Creat Gene Attribute Edges

In [57]:
# used just to get a percent done while running Creat Gene Attribute Edges
m = 0
j = 1
for index in attribute_up_lib.index:
    m += attribute_up_lib.ix[index].unique().size
    print('\r'+str(j/(attribute_up_lib.index.size)*100), end='', flush=True)
    j += 1
n = 0    
j = 1
for index in attribute_down_lib.index:
    n += attribute_down_lib.ix[index].unique().size
    print('\r'+str(j/(attribute_down_lib.index.size)*100), end='', flush=True)
    j += 1 

size = m+n

100.0

In [None]:
gene_attribute_edges = pd.DataFrame(columns=['GeneSym','Ensemble Acc', 'GeneID','Tissue Sample', 'Tissue', 'Gender', 'Age', 'Weight'])
temp = pd.Series()

i=1

for index in attribute_up_lib.index:
    for col in attribute_up_lib.ix[index].unique():
        if col != ' ':
            temp['GeneSym'] = index 
            temp['Ensemble Acc'] = gene_df.at[index, 'Ensemble Acc']
            temp['GeneID'] = gene_df.at[index, 'GeneID']
            temp['Tissue Sample'] = col
            temp['Tissue'] = meta_df.at[col, 'TISSUE']
            temp['Age'] = meta_df.at[col, 'AGE']
            temp['Gender'] = meta_df.at[col, 'GENDER']
            temp['Weight'] = 1.0
            
            gene_attribute_edges = gene_attribute_edges.append(temp, ignore_index=True)
            
            print('\r'+str(i/(size)*100), end='', flush=True)
            i+=1
            
for index in attribute_down_lib.index:
    for col in attribute_down_lib.ix[index].unique():
        if col != ' ':
            temp['GeneSym'] = index 
            temp['Ensemble Acc'] = gene_df.at[index, 'Ensemble Acc']
            temp['GeneID'] = gene_df.at[index, 'GeneID']
            temp['Tissue Sample'] = col
            temp['Tissue'] = meta_df.at[col, 'TISSUE']
            temp['Age'] = meta_df.at[col, 'AGE']
            temp['Gender'] = meta_df.at[col, 'GENDER']
            temp['Weight'] = -1.0
            
            gene_attribute_edges = gene_attribute_edges.append(temp, ignore_index=True)
            
            print('\r'+str(i/(size)*100), end='', flush=True)
            i+=1
            

0.05868812926525486

In [None]:
gene_attribute_edges.set_index('Symbol', inplace=True)

### Plot Heatmaps

In [None]:
data_matrix_vis = data_matrix.copy()

data_matrix_vis.pop('Ensemble Acc')
data_matrix_vis.pop('GeneID')
plt.pcolormesh(data_matrix_vis, cmap='seismic')
plt.axis('tight')


In [None]:
plt.pcolormesh(attribute_similarity_df, cmap='seismic')
plt.axis('tight')


In [None]:
plt.pcolormesh(gene_similarity_df, cmap='seismic')
plt.axis('tight')


### Print Tables to Files

In [59]:
meta_df.to_csv('Output/attribute_list_entries.txt', sep='\t', quotechar=' ')

In [60]:
gene_df.to_csv('Output/gene_list_terms.txt', sep='\t', quotechar=' ')

In [61]:
data_matrix.to_csv('Output/gene_attribute_matrix_standardized.txt', sep='\t', quotechar=' ')

In [62]:
tertiary_df.to_csv('Output/gene_attribute_matrix.txt', sep='\t', quotechar=' ')

In [63]:
attribute_similarity_df.to_csv('Output/attribute_similarity_matrix_cosine.txt', sep='\t', quotechar=' ')

In [64]:
gene_similarity_df.to_csv('Output/gene_similarity_matrix_cosine.txt', sep='\t', quotechar=' ')

In [65]:
gene_attribute_edges.to_csv('Output/gene_attribute_edges.txt', sep='\t')

In [71]:
attribute_down_lib.to_csv('Output/attribute_set_library_dn_crisp.gmt', sep='\t')

In [72]:
attribute_up_lib.to_csv('Output/attribute_set_library_up_crisp.gmt', sep='\t')

In [73]:
gene_down_lib.to_csv('Output/gene_set_library_dn_crisp.gmt', sep='\t')

In [74]:
gene_up_lib.to_csv('Output/gene_set_library_up_crisp.gmt', sep='\t')

### Print Version Of Modules In Use

In [70]:
%load_ext version_information
%version_information numpy, scipy, matplotlib, pandas, seaborn, version_information

Software,Version
Python,3.5.2 64bit [GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]
IPython,5.1.0
OS,Darwin 14.5.0 x86_64 i386 64bit
numpy,1.11.2
scipy,0.18.1
matplotlib,1.5.3
pandas,0.19.1
seaborn,0.7.1
version_information,1.0.3
Tue Jan 03 11:09:13 2017 EST,Tue Jan 03 11:09:13 2017 EST
