In [1]:
import pandas as pd
import numpy as np
import h5py
from tqdm import trange

  from ._conv import register_converters as _register_converters


In [39]:
name = 'GO_BP'
file_name = "GO_Biological_Process_2018.txt"
new_gslibs = h5py.File("two_gslibs.hdf5", "r+")

# Preprocessing 

Converting the text file into a dictionary in which each phenotype is a key and its associated genes are in a list as a value.

In [40]:
d = {} 
with open(file_name) as file:
    for line in file:
        lst = line.strip().split("\t")
        ph = lst[0]
        lst = lst[2:]
        d[ph] = lst

In [41]:
d

{'positive regulation of posttranscriptional gene silencing (GO:0060148)': ['FXR1',
  'ZFP36',
  'DHX9',
  'XPO5',
  'FMR1',
  'STAT3',
  'WTIP',
  'PUM2',
  'AJUBA',
  'PUM1',
  'LIMD1'],
 'regulation of cell cycle process (GO:0010564)': ['UVRAG',
  'SH3GLB1',
  'RMI2',
  'CAV2',
  'HMGA2',
  'PRPF40A',
  'ZFYVE26',
  'KLHL21',
  'CDC25C',
  'APC',
  'CDK5',
  'CDC16',
  'CCP110',
  'KLHL9',
  'CCDC8',
  'SLC25A33',
  'KIF20A',
  'CDK13',
  'PIK3C3',
  'KIF20B',
  'OBSL1',
  'CALM2',
  'CALM3',
  'CALM1',
  '',
  'L3MBTL1',
  'AHCTF1',
  'SDCCAG3',
  'CETN2',
  'AURKA',
  'PIK3R4',
  'SFRP1',
  'AURKC',
  'PDXP',
  'AURKB',
  'FBXO43',
  'KNSTRN',
  'RACGAP1',
  'GPER1',
  'FSD1',
  'AKT2',
  'E4F1',
  'RPRD1B',
  'ECT2',
  'RAB11FIP3',
  'TIPIN',
  'RAB11FIP4',
  'LINC00598',
  'NEUROD1',
  'BBS4',
  'YTHDF2',
  'PLK1',
  'CSNK2A2',
  'BORA',
  'CSNK2A1',
  'PLK3',
  'PRMT5',
  'DAPK3',
  'SPAG5',
  'PRC1',
  'MYO19',
  'RAB11A',
  'CDC42',
  'BIN1',
  'BCL2L1',
  'BIRC6',
  'MAD1L1'

Creating the reverse of the above dictionary, in which each gene is a key and its value is a list of its associated phenotypes.

In [42]:
gene_dict = {}
for k,v in d.items(): 
    for gene in v: 
        if gene not in gene_dict: 
            gene_dict[gene] = []
        gene_dict[gene].append(k)

In [43]:
gene_dict

{'FXR1': ['positive regulation of posttranscriptional gene silencing (GO:0060148)',
  'positive regulation of macromolecule biosynthetic process (GO:0010557)',
  'regulation of dendritic spine development (GO:0060998)',
  'nervous system development (GO:0007399)',
  'negative regulation of cellular protein metabolic process (GO:0032269)',
  'positive regulation of cellular protein metabolic process (GO:0032270)',
  'positive regulation of translation (GO:0045727)',
  'negative regulation of translation (GO:0017148)',
  'regulation of dendrite development (GO:0050773)',
  'negative regulation of cellular amide metabolic process (GO:0034249)',
  'regulation of dendrite morphogenesis (GO:0048814)',
  'negative regulation of gene expression (GO:0010629)',
  'anterograde axonal transport (GO:0008089)',
  'positive regulation of gene silencing by miRNA (GO:2000637)',
  'regulation of cell morphogenesis involved in differentiation (GO:0010769)',
  'negative regulation of cellular macromolecul

Create a list of functions and genes in the correct index order for the gene set matrix, which is a binary matrix with genes as rows and functions or phenotypes as columns. If the gene is associated with a phenotype, the corresponding cells receives a value of 1. Otherwise, the cell receives a value of 0.
It is possible to use a list of keys since Python dictionaries are unordered such that their order is dictated by when they were entered rather than their alphanumeric order.

In [44]:
functions = list(d.keys())
gslib_genes = list(gene_dict.keys())
gene_set = np.zeros((len(gslib_genes), len(functions)))

In [45]:
for row in range(len(gene_set)): 
    gene = gslib_genes[row] 
    for col in range(len(functions)):
        f = functions[col]
        if f in gene_dict[gene]: 
            gene_set[row][col] = 1

In [46]:
pd.DataFrame(gene_set)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5093,5094,5095,5096,5097,5098,5099,5100,5101,5102
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
print(len(gslib_genes))
print(len(functions))

14433
5103


In [48]:
# new_gslibs.create_dataset(name + "genes", data=pd.DataFrame(gslib_genes).astype("S"))

Create dictionaries to easily get the index associated with a particular gene or function for later computations.

In [49]:
gene_to_idx = {}
f_to_idx = {}
# to get index associated with a particular gene
for i in range(len(gslib_genes)): 
    gene_to_idx[gslib_genes[i]] = i
# to get index associated with particular function
for j in range(len(functions)):
    f_to_idx[functions[j]] = j

# Mouse gene set library 

To compute this new matrix (which is not a Pearson correlation matrix), I will compare each gene to every other gene in a given set, get the average correlation and save that in a matrix that has genes has rows and phenotypes as columns. 

Start by getting the correlation matrix so that the average correlation for each gene for each set is easier to compute, by just taking the average of the rest of the correlations found associated with the given set.

In [50]:
cor = np.corrcoef(gene_set)

In [51]:
# Should be a square matrix M x M in which M = # of genes
cor.shape
cor

array([[ 1.00000000e+00,  1.07957247e-01,  1.33663593e-01, ...,
        -1.03989805e-03, -1.03989805e-03, -1.03989805e-03],
       [ 1.07957247e-01,  1.00000000e+00,  1.04497057e-01, ...,
        -1.61482072e-03, -1.61482072e-03, -1.61482072e-03],
       [ 1.33663593e-01,  1.04497057e-01,  1.00000000e+00, ...,
        -1.84378652e-03, -1.84378652e-03, -1.84378652e-03],
       ...,
       [-1.03989805e-03, -1.61482072e-03, -1.84378652e-03, ...,
         1.00000000e+00, -1.96001568e-04, -1.96001568e-04],
       [-1.03989805e-03, -1.61482072e-03, -1.84378652e-03, ...,
        -1.96001568e-04,  1.00000000e+00,  1.00000000e+00],
       [-1.03989805e-03, -1.61482072e-03, -1.84378652e-03, ...,
        -1.96001568e-04,  1.00000000e+00,  1.00000000e+00]])

In [52]:
# Initialize mouse gene set library 
sim = np.zeros((len(gslib_genes), len(functions)))

In [53]:
# Convert to Pandas DataFrame to easily use 
# .iloc function, which allows row selection
cor = pd.DataFrame(cor)

Create dictionary to convert the list of genes associated with each function to their corresponding indices in the list of genes above. This would help simplify the process of taking the average of the other correlations for each set by using the .iloc function in Pandas that allows row selection.

An alternative could be to use the binary matrix to get the indices, and use the np.where(row == 1) function.

In [54]:
gene_indices = {}
for func in d:
    gene_indices[func] = [] 
    for gene in d[func]:
        gene_indices[func].append(gene_to_idx[gene])

In [55]:
gene_indices

{'positive regulation of posttranscriptional gene silencing (GO:0060148)': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10],
 'regulation of cell cycle process (GO:0010564)': [11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101],
 'angiotensin-activated signaling pathway (GO:0038166)': [102,
  103,
  104,
  105,
  106,
  107,
  108],
 'DNA-templated transcription, termination (GO:0006353)': [109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  1

Loop through the indices of functions. From gene_indices,
we can get a list of the indices of genes that are associated
with that particular set/function. We can start by selecting
a sub-correlation matrix with only the rows corresponding to
the indices chosen.

Loop through these indices. For each row index, sum its column of correlations
and subtract 1 (for the self-correlation) and divide by the 
number of indices minus 1 to get the mean correlation of this
particular gene to every other gene in this set. Those genes that are not associated with the phenotype will maintain their value of 0.

In [56]:
for j in trange(len(functions)):
    f = functions[j]
    indices = gene_indices[f] 
    n = len(indices)-1
    temp = cor.iloc[indices]
    for i in range(len(indices)):
        gene_idx = indices[i]
        gene_cor = temp.iloc[:,i]
        sim[gene_idx][j] = (sum(gene_cor)-1)/n

100%|██████████| 5103/5103 [00:47<00:00, 107.97it/s]


Check if any row sums to 0. If there is, then there is probably something wrong, because each gene is associated with at least one function.

In [57]:
for row in sim:
    if sum(row) == 0:
        print(row)

Save the mouse gene set library to HDF5 file format since it took a nontrivial amount of time to run.

In [58]:
print(sim.shape) 

(14433, 5103)


# Making predictions
In this section, I will be making predictions about the TCGA dataset based on the following formula: 

![Screen%20Shot%202020-06-29%20at%203.58.20%20PM.png](attachment:Screen%20Shot%202020-06-29%20at%203.58.20%20PM.png)

In other words, G is the TCGA correlation matrix and GF is the matrix of genes and phenotypes that have the mean correlations of a given gene to every other gene in a given set (the mouse gene set library). The goal is to be able to get the new gene set library GF' to make predictions about the TCGA dataset.

In [59]:
tcga = h5py.File("tcga.hdf5", "r+")
list(tcga.keys())

['correlation_matrix', 'data', 'meta']

In [60]:
corr = tcga['correlation_matrix']
data = tcga['data']
meta = tcga['meta']
genes = meta['genes']
eids = meta['ensembl_id']
mouse_gene_names = gslib_genes
phenotypes = functions 
mgsl = sim

Check if there are any genes in common between the mouse genes and the TCGA genes or ensembl ids. Turns out that the mouse genes are not shown as ensembl ids so we will be using TCGA gene names for comparisons.

In [61]:
phenotype_list = []
for p in phenotypes:
    phenotype_list.append(p)

In [62]:
gene_names = []
for g in genes:
    gene_names.append(str(g[0])[2:-1])

# Filling in new gene set library
![Screen%20Shot%202020-06-29%20at%203.58.20%20PM.png](attachment:Screen%20Shot%202020-06-29%20at%203.58.20%20PM.png)

For each gene in TCGA genes for each phenotype, sum the correlation with every other gene multiplied by its correlation to the mouse gene set function, and divide the total by the correlations of the genes to the gene set function. This can be calculated by creating a new mouse gene set library that had all of the TCGA genes as rows in order, so that we may perform a dot product for the numerator and a summation in the denominator. Since self-correlations had been set to 0 in the correlation matrix, we can still dot each row of the correlation matrix with each col of the gene set library matrix to get the numerator, and the self-correlation would negate its product. For the denominator, we can sum the gene set library row but subtract the entry for the current gene.

In [63]:
# Initialize new gene set library to contain TCGA genes as rows and mouse phenotypes as columns.
gene_lib = np.zeros((len(gene_names), len(phenotype_list)))

In [64]:
"""
Expanded mouse gene set library with the same number of genes as the TCGA gene set.
We should ignore the ~3400 mouse genes not found in the TCGA gene set since they won't 
be included in the above calculations.
"""

ex_mgsl = np.zeros((len(gene_names), len(phenotype_list)))

In [65]:
# TCGA gene to index dictionary to help fill in expanded mouse gene set library 
tcga_to_idx = {} 
for g_idx in range(len(gene_names)): 
    g = gene_names[g_idx]
    tcga_to_idx[g] = g_idx

In [66]:
"""
Loop through the current mouse gene names. If the mouse gene name is found in the tcga_to_idx
dictionary, we find its index according to the TCGA gene list and replace the ex_mgsl row of 
zeros with the row found in the previous mouse gene set library. All of the genes found in the
TCGA library but not in the mouse gene set library will be left as zero for phenotype correlations.
"""
for m in range(len(mouse_gene_names)): 
    mouse_gene = mouse_gene_names[m]
    if mouse_gene in tcga_to_idx:
        idx = tcga_to_idx[mouse_gene]
        ex_mgsl[idx] = mgsl[m]  # replace expanded mgsl row with the prev mgsl row of correlations

In [67]:
# Check that our method worked and that none of the phenotype cols sum to 0. 
sums = sum(ex_mgsl)
for s in sums:
    if s == 0: 
        print(s)

In [68]:
"""
We can compute the numerator part of the matrix by multiplying matrices together.
Use Numpy rather than go through matrix manually b/c np probably has some speedy
magic we don't know about.
"""

gslib = np.matmul(corr, ex_mgsl)

In [69]:
# Check to get a new matrix with TCGA genes as rows and phenotypes of columns
gslib.shape

(38550, 5103)

In [70]:
"""
To finish computing the gene set library we have to go through each of the entries and divide
by the sum of the correlations in that phenotype's set (minus the current gene's correlation).
We can speed up computations by just taking the sums of each phenotype column. As we loop 
through the genes for each phenotype, we can just subtract the current gene's correlation
from the phenotype's sum.
"""

pheno_sums = []
for col in np.transpose(gslib):
    pheno_sums.append(sum(col))

for i in trange(len(gslib)):
    for j in range(len(phenotype_list)):
        sub = ex_mgsl[i][j]
        denom = pheno_sums[j]
        gslib[i][j] = gslib[i][j]/(denom-sub)

100%|██████████| 38550/38550 [09:19<00:00, 68.90it/s]


In [71]:
pd.DataFrame(gslib)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5093,5094,5095,5096,5097,5098,5099,5100,5101,5102
0,0.000846,0.000108,0.001481,0.001839,-0.001609,-0.000451,0.002233,0.002476,0.006937,0.000514,...,0.000957,0.000748,-0.001198,-0.001763,-0.007424,0.001050,1.224109e-03,0.003582,0.000445,-0.000751
1,-0.000335,-0.000501,0.000697,0.000076,-0.000659,-0.000671,0.001236,0.001144,-0.002860,-0.000165,...,-0.000112,0.000431,0.002027,-0.001064,-0.001631,-0.000146,2.905854e-04,0.000657,0.000216,0.003394
2,-0.000017,0.001014,-0.000578,-0.000064,0.000188,0.001752,-0.002467,0.000361,0.007648,0.001460,...,0.000055,0.000796,-0.000587,-0.001298,-0.006217,-0.000027,1.895790e-03,-0.000513,0.000692,-0.013840
3,0.001585,0.000275,-0.000448,-0.000861,0.001350,0.000863,-0.001881,-0.000713,-0.002522,0.001282,...,0.000722,-0.000320,-0.005950,-0.000282,-0.011941,-0.000761,7.237584e-04,0.000497,-0.000045,-0.004853
4,0.000789,0.002277,-0.001104,0.000461,0.001981,0.003009,-0.003354,-0.000628,-0.000355,0.001861,...,0.000119,0.001028,0.002024,-0.000510,-0.005176,-0.000093,2.332370e-03,-0.000856,0.000338,-0.016720
5,-0.001129,-0.000890,-0.000121,-0.000731,-0.000144,-0.000923,-0.000130,-0.000687,-0.005721,-0.000849,...,-0.000756,-0.001307,0.022894,0.001179,0.014365,-0.000492,-2.116440e-03,0.002953,0.000049,0.010583
6,0.000767,-0.000706,0.001769,0.000197,-0.001485,-0.001209,0.001970,0.001571,-0.010991,-0.000029,...,0.000665,-0.000294,0.016165,-0.001208,0.006115,0.000003,-9.570077e-04,0.008025,0.000296,0.010422
7,-0.001543,-0.000353,0.000331,0.000472,-0.001980,-0.000506,0.000437,0.000790,0.014474,-0.000505,...,-0.000629,0.000954,0.000126,-0.001252,-0.005190,0.000484,1.412612e-03,0.004012,0.000067,-0.002274
8,0.000714,0.000165,0.000116,0.000037,-0.000111,0.000912,-0.000961,0.000330,-0.003096,0.000808,...,0.000132,0.001009,-0.004519,-0.000787,-0.005416,-0.000048,6.650535e-04,0.002885,0.000631,-0.004677
9,0.002237,0.001402,-0.000854,-0.000452,0.002238,0.001653,-0.001794,-0.001144,-0.001222,0.001601,...,0.000604,0.000124,-0.000951,0.000955,-0.002782,-0.000404,1.135016e-03,-0.001289,0.000315,-0.009101


In [72]:
new_gslibs.create_dataset("tcga_" + name, data=gslib)

<HDF5 dataset "tcga_GO_BP": shape (38550, 5103), type "<f8">

In [73]:
new_gslibs.close()

In [35]:
def rank_genes(gene_idx, gslib, phenotypes, rank):
    tups = list(dict(zip(range(len(gslib[0])), gslib[gene_idx])).items())
    tups.sort(key = lambda t: t[1], reverse = True)
    top_rank = [ t[1] for t in tups[: rank] ]
    top_int = [ t[0] for t in tups[: rank] ]
    top_pheno = [ phenotypes[i] for i in top_int ]
    return top_rank, top_pheno

In [36]:
arr = np.zeros((len(gene_names), 50))
pheno_arr = []
for gene_idx in trange(len(gene_names)):  
    row, phenos = rank_genes(gene_idx, gslib, phenotype_list, 50)
    arr[gene_idx] = row 
    pheno_arr.append(phenos)

100%|██████████| 38550/38550 [00:12<00:00, 3149.02it/s]


In [37]:
arr

array([[ 0.29877131,  0.10541843,  0.08486299, ...,  0.00455943,
         0.00452437,  0.00445046],
       [ 0.22993448,  0.03429865,  0.01627328, ...,  0.0016877 ,
         0.00167302,  0.00166603],
       [ 0.12333791,  0.09353647,  0.0899232 , ...,  0.00250522,
         0.00241165,  0.00239534],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.        ]])

In [38]:
new_gslibs.create_dataset(name, data=arr)

<HDF5 dataset "kegg_human_tcga": shape (38550, 50), type "<f8">

In [39]:
pheno_arr = pd.DataFrame(pheno_arr).astype("S")
pheno_arr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,"b'Valine, leucine and isoleucine degradation'",b'Legionellosis',b'Steroid biosynthesis',b'Graft-versus-host disease',b'Pancreatic cancer',b'Arginine and proline metabolism',b'Viral myocarditis',b'Complement and coagulation cascades',b'Tuberculosis',b'Cortisol synthesis and secretion',...,b'Tight junction',b'Citrate cycle (TCA cycle)',b'SNARE interactions in vesicular transport',b'Dilated cardiomyopathy (DCM)',b'Systemic lupus erythematosus',b'Huntington disease',b'Asthma',b'Intestinal immune network for IgA production',b'Pentose phosphate pathway',b'Cardiac muscle contraction'
1,"b'Valine, leucine and isoleucine degradation'",b'Legionellosis',b'Pancreatic cancer',b'Graft-versus-host disease',b'Synthesis and degradation of ketone bodies',b'Arginine and proline metabolism',b'PPAR signaling pathway',b'Type II diabetes mellitus',b'Thiamine metabolism',b'Propanoate metabolism',...,b'Riboflavin metabolism',b'Starch and sucrose metabolism',b'Ubiquinone and other terpenoid-quinone biosy...,b'Folate biosynthesis',b'Tight junction',b'One carbon pool by folate',b'Phagosome',b'VEGF signaling pathway',b'Cocaine addiction',b'Fatty acid elongation'
2,b'Vascular smooth muscle contraction',b'Steroid biosynthesis',b'Homologous recombination',b'Graft-versus-host disease',b'Cholesterol metabolism',b'DNA replication',b'Viral myocarditis',b'Progesterone-mediated oocyte maturation',b'One carbon pool by folate',b'Base excision repair',...,b'Salmonella infection',b'RNA degradation',b'Tyrosine metabolism',b'mRNA surveillance pathway',b'Pentose phosphate pathway',b'Cysteine and methionine metabolism',b'Ferroptosis',b'Colorectal cancer',b'Cell cycle',b'Asthma'
3,"b'Phenylalanine, tyrosine and tryptophan biosy...",b'Graft-versus-host disease',"b'Valine, leucine and isoleucine degradation'",b'Cholesterol metabolism',b'Viral myocarditis',b'Glutathione metabolism',b'Vascular smooth muscle contraction',b'Steroid biosynthesis',b'Gastric acid secretion',b'Homologous recombination',...,b'Pancreatic secretion',b'Herpes simplex virus 1 infection',b'Asthma',b'Ribosome',b'Vitamin digestion and absorption',b'Salmonella infection',b'Fanconi anemia pathway',b'Phosphonate and phosphinate metabolism',b'Propanoate metabolism',b'Leishmaniasis'
4,"b'Phenylalanine, tyrosine and tryptophan biosy...",b'Vascular smooth muscle contraction',b'Homologous recombination',b'Cholesterol metabolism',b'Steroid biosynthesis',b'Graft-versus-host disease',b'Pyruvate metabolism',b'Progesterone-mediated oocyte maturation',b'DNA replication',b'One carbon pool by folate',...,b'NOD-like receptor signaling pathway',b'Colorectal cancer',b'Spliceosome',b'Ubiquitin mediated proteolysis',b'Fatty acid degradation',b'Basal transcription factors',b'GABAergic synapse',b'Aminoacyl-tRNA biosynthesis',b'Tuberculosis',b'Th17 cell differentiation'
5,"b'Phenylalanine, tyrosine and tryptophan biosy...",b'Pyruvate metabolism',b'Allograft rejection',b'Platelet activation',b'Acute myeloid leukemia',b'Thermogenesis',b'Th17 cell differentiation',b'C-type lectin receptor signaling pathway',b'Fc gamma R-mediated phagocytosis',b'Fatty acid biosynthesis',...,b'Toll-like receptor signaling pathway',b'Influenza A',b'Ribosome',b'Butanoate metabolism',b'Apoptosis',b'Human T-cell leukemia virus 1 infection',b'Toxoplasmosis',b'Th1 and Th2 cell differentiation',b'Caffeine metabolism',b'Autoimmune thyroid disease'
6,"b'Valine, leucine and isoleucine degradation'",b'Legionellosis',b'Steroid biosynthesis',b'Complement and coagulation cascades',b'Allograft rejection',b'Thermogenesis',b'Pertussis',b'PPAR signaling pathway',b'Gastric acid secretion',b'Cell adhesion molecules (CAMs)',...,b'Tight junction',b'Ubiquinone and other terpenoid-quinone biosy...,b'ABC transporters',b'Osteoclast differentiation',b'Sulfur metabolism',b'Riboflavin metabolism',"b'Alanine, aspartate and glutamate metabolism'",b'Type II diabetes mellitus',b'Adherens junction',b'Glycosphingolipid biosynthesis'
7,"b'Valine, leucine and isoleucine degradation'",b'Steroid biosynthesis',b'One carbon pool by folate',b'Legionellosis',b'Complement and coagulation cascades',b'Arginine and proline metabolism',b'Allograft rejection',b'Thiamine metabolism',b'Synthesis and degradation of ketone bodies',b'Vitamin digestion and absorption',...,b'Arrhythmogenic right ventricular cardiomyopa...,b'Inositol phosphate metabolism',b'Fructose and mannose metabolism',b'Peroxisome',b'TGF-beta signaling pathway',b'Starch and sucrose metabolism',b'Terpenoid backbone biosynthesis',b'Cardiac muscle contraction',b'Aldosterone synthesis and secretion',b'RIG-I-like receptor signaling pathway'
8,"b'Valine, leucine and isoleucine degradation'",b'Steroid biosynthesis',b'Graft-versus-host disease',b'Homologous recombination',b'Vascular smooth muscle contraction',b'One carbon pool by folate',b'Complement and coagulation cascades',b'alpha-Linolenic acid metabolism',b'Synthesis and degradation of ketone bodies',b'Gastric acid secretion',...,b'Terpenoid backbone biosynthesis',b'Citrate cycle (TCA cycle)',"b'Alanine, aspartate and glutamate metabolism'",b'Proteasome',b'Olfactory transduction',b'Peroxisome',b'Protein digestion and absorption',b'Salivary secretion',b'Viral myocarditis',b'AMPK signaling pathway'
9,"b'Phenylalanine, tyrosine and tryptophan biosy...",b'Cholesterol metabolism',b'Pyruvate metabolism',b'Vascular smooth muscle contraction',b'Homologous recombination',b'Graft-versus-host disease',b'Glutathione metabolism',b'Thermogenesis',b'Parkinson disease',b'Progesterone-mediated oocyte maturation',...,b'RNA polymerase',b'Linoleic acid metabolism',b'Gastric acid secretion',b'Herpes simplex virus 1 infection',"b'Glycine, serine and threonine metabolism'",b'Fatty acid degradation',b'T cell receptor signaling pathway',b'Autophagy',b'Primary bile acid biosynthesis',b'ErbB signaling pathway'


In [40]:
new_gslibs.create_dataset(name + "pheno", data=pheno_arr)

<HDF5 dataset "kegg_human_tcgapheno": shape (38550, 50), type "|S58">

In [41]:
new_gslibs.close()