### TG-GATES 
#### Source : https://datadryad.org/stash/dataset/doi:10.5061/dryad.pvmcvdngd
#### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import os
import glob as glob

import pandas as pd
import numpy as np

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [2]:
# Fetch the two lookup tables from the harmonizome helper module.
# symbol_lookup is later passed to uf.map_symbols and geneid_lookup to
# uf.gene_list; their exact structure is defined by the helper, not here.
symbol_lookup, geneid_lookup = lookup.get_lookups()

Gathering sources: 100%|██████████| 3/3 [00:09<00:00,  3.09s/it]


In [3]:
# Prefix shared by every output file name, and the directory they are saved in.
output_name = 'TG_GATES'
path = 'output'
# exist_ok=True makes the cell idempotent and removes the check-then-create
# race of testing os.path.exists() before calling os.makedirs().
os.makedirs(path, exist_ok=True)

### Import all Gene Level Data

In [4]:
# Cut-offs applied to every gene-level row before it is kept.
MAX_PREFILTER_P_VALUE = 0.05   # keep rows with 'Prefilter P-Value' strictly below this
MIN_ABS_FOLD_CHANGE = 1.5      # keep rows with 'Max Fold Change Absolute Value' at or above this

filepath = 'doi_10.5061_dryad.pvmcvdngd__v10/*GeneLevelData.txt'
filelist = sorted(glob.iglob(filepath))


def clean_analysis_labels(analysis):
    """Normalize a Series of raw 'Analysis' names into attribute labels.

    Everything from the first '_rma' onward is dropped and '_rat' is
    appended; then characters treated as invalid are substituted.

    Parameters
    ----------
    analysis : pandas.Series of str
        Raw values of the 'Analysis' column.

    Returns
    -------
    pandas.Series of str
        Cleaned labels, same index as the input.
    """
    labels = analysis.str.split('_rma').str[0] + '_rat'
    # Replace invalid characters. regex=False forces literal matching: '+' and
    # '?' are regex metacharacters and the default of `regex` differs between
    # pandas versions.
    # NOTE(review): '�' is presumably a mis-decoded alpha in the source files
    # -- confirm against the raw data.
    for old, new in ((' ', '_'), ('+', 'plus'), ('�', '_alpha'), ('?', '')):
        labels = labels.str.replace(old, new, regex=False)
    return labels


def load_gene_level_file(file):
    """Read one *GeneLevelData.txt file and return only its retained rows.

    A row is retained when its prefilter p-value is below
    MAX_PREFILTER_P_VALUE and its maximum absolute fold change is at least
    MIN_ABS_FOLD_CHANGE. The 'Analysis' column of the result is cleaned with
    clean_analysis_labels().
    """
    table = pd.read_table(file, engine='python')
    keep = (
        (table['Prefilter P-Value'] < MAX_PREFILTER_P_VALUE)
        & (table['Max Fold Change Absolute Value'] >= MIN_ABS_FOLD_CHANGE)
    )
    # .copy() so the column assignment below does not write into a slice of
    # `table` (avoids SettingWithCopyWarning).
    kept = table[keep].copy()
    kept['Analysis'] = clean_analysis_labels(kept['Analysis'])
    return kept


# Collect the per-file frames and concatenate once; concatenating inside the
# loop re-copies the accumulated frame on every iteration.
frames = [load_gene_level_file(file) for file in filelist]
# pd.concat raises on an empty list, so fall back to an empty frame when the
# glob pattern matched no files.
df_data = pd.concat(frames) if frames else pd.DataFrame()

### Map Gene Symbols to Approved Entrez Gene Symbols

In [5]:
# Keep only the attribute label and the gene symbol, indexed by gene symbol,
# then hand that table to the symbol mapper with duplicate removal enabled.
# Note: this cell rebinds df_data, so it cannot be re-run without first
# re-running the import cell above.
analysis_by_symbol = df_data[['Analysis', 'Genes Symbols']].set_index('Genes Symbols')
df_data = uf.map_symbols(
    analysis_by_symbol,
    symbol_lookup,
    remove_duplicates=True,
)
df_data.shape

  from pandas import Panel
100%|██████████| 810817/810817 [00:01<00:00, 517531.05it/s]


(534688, 1)

### Create outputs for Harmonizome and Enrichr
- Binary Matrix
- Gene & Attribute Lists
- Gene & Attribute Set Libraries
- Gene & Attribute Similarity Matrices
- Edge List

In [6]:
# Build the binary matrix from the mapped table and save it as a compressed
# npz file with unsigned 8-bit entries.
binary_matrix = uf.binary_matrix(df_data)
uf.save_data(
    binary_matrix,
    path,
    output_name + '_binary_matrix',
    dtype=np.uint8,
    compression='npz',
)

In [7]:
# Derive the gene list from the binary matrix using the gene-ID lookup, then
# write it as a gzip-compressed TSV without the index column.
gene_list = uf.gene_list(binary_matrix, geneid_lookup)
uf.save_data(
    gene_list,
    path,
    output_name + '_gene_list',
    index=False,
    ext='tsv',
    compression='gzip',
)

100%|██████████| 12138/12138 [00:00<00:00, 318933.90it/s]


In [8]:
# Derive the attribute list from the binary matrix and write it as a
# gzip-compressed TSV.
attribute_list = uf.attribute_list(binary_matrix)
uf.save_data(
    attribute_list,
    path,
    output_name + '_attribute_list',
    ext='tsv',
    compression='gzip',
)

In [9]:
# Write the 'up' set library twice: first keyed by gene, then by attribute.
# The file name becomes '<output_name>_gene_set' / '<output_name>_attribute_set'.
for set_kind in ('gene', 'attribute'):
    uf.save_setlib(
        binary_matrix,
        set_kind,
        'up',
        path,
        output_name + '_' + set_kind + '_set',
    )

100%|██████████| 1190/1190 [00:00<00:00, 3084.93it/s]
100%|██████████| 12138/12138 [00:00<00:00, 15908.01it/s]


In [10]:
# Jaccard similarity computed on the transposed binary matrix (sparse mode),
# saved as a symmetric float32 npz file.
attribute_similarity_matrix = uf.similarity_matrix(
    binary_matrix.T,
    'jaccard',
    sparse=True,
)
uf.save_data(
    attribute_similarity_matrix,
    path,
    output_name + '_attribute_similarity_matrix',
    dtype=np.float32,
    symmetric=True,
    compression='npz',
)

In [11]:
# Jaccard similarity computed on the binary matrix as-is (sparse mode),
# saved as a symmetric float32 npz file.
gene_similarity_matrix = uf.similarity_matrix(
    binary_matrix,
    'jaccard',
    sparse=True,
)
uf.save_data(
    gene_similarity_matrix,
    path,
    output_name + '_gene_similarity_matrix',
    dtype=np.float32,
    symmetric=True,
    compression='npz',
)

In [12]:
# Convert the binary matrix to an edge list and write it as a
# gzip-compressed TSV.
edge_list = uf.edge_list(binary_matrix)
uf.save_data(
    edge_list,
    path,
    output_name + '_edge_list',
    ext='tsv',
    compression='gzip',
)

The number of statisticaly relevent gene-attribute associations is: 534688
