# Extraction of gene set collections from Human-GEM

In [None]:
import cobra
import os
import numpy as np

In [None]:
# load the Human-GEM model from its YAML file, located in the sibling "models" directory
# (note: loading may take 2-3 minutes)
model_file = os.path.join('..', 'models', 'Human-GEM.yml')
model = cobra.io.load_yaml_model(model_file)

## Subsystem-gene associations

In [None]:
# retrieve gene associations for all subsystems
# Reactions are grouped by subsystem in a single pass, collecting the IDs of the
# genes attached to each reaction.
subsystem_genes = {}
for rxn in model.reactions:
    # `subsystem` may be a plain string or a list of names depending on how the
    # model file stores it; normalise to a list so that names are always matched
    # exactly. (A substring test such as `s in "some name"` would also match
    # subsystems whose name merely contains `s`, and an empty name would match
    # every reaction.)
    if isinstance(rxn.subsystem, str):
        rxn_subsystems = [rxn.subsystem]
    else:
        rxn_subsystems = list(rxn.subsystem)
    rxn_gene_ids = [g.id for g in rxn.genes]
    for name in rxn_subsystems:
        subsystem_genes.setdefault(name, set()).update(rxn_gene_ids)

# sorted, unique subsystem names (numpy array of strings)
subsystems = np.unique(list(subsystem_genes))
# gene IDs per subsystem, aligned with `subsystems`; sorted so that the output
# is identical from run to run
gene_assoc = [sorted(subsystem_genes[s]) for s in subsystems]

In [None]:
# remove subsystems with no gene associations
# Both lists are filtered with the same mask so that names and gene lists stay aligned.
has_genes = [len(ids) > 0 for ids in gene_assoc]
subsystems = [name for pos, name in enumerate(subsystems) if has_genes[pos]]
gene_assoc = [ids for ids, keep in zip(gene_assoc, has_genes) if keep]

In [None]:
# write subsystem-gene associations to .gmt file
# Each line is tab-separated: gene set name, description, then the gene IDs.
# The description column is not used here, so it is filled with the placeholder 'na'.
merged_list = ['\t'.join([name, 'na'] + list(genes)) + '\n' for name, genes in zip(subsystems, gene_assoc)]
gmt_path = os.path.join('..', 'data', 'gene_set_collections', 'HumanGEM_subsystem_GSC.gmt')
# explicit encoding so the written file does not depend on the platform's default locale
with open(gmt_path, 'w', encoding='utf-8') as f:
    f.writelines(merged_list)

## Metabolite-gene associations

### Option 1: Including compartment
Metabolites with identical names but different cellular locations (compartments) will be treated as different metabolites.

In [None]:
# combine metabolite names with their compartment abbreviation, e.g. "name[c]"
metabolites = []
for met in model.metabolites:
    metabolites.append(''.join([met.name, '[', met.compartment, ']']))

In [None]:
# retrieve gene associations for all metabolites
# For each metabolite (in model order), collect the IDs of every gene attached to
# any reaction the metabolite participates in. Building the set with a
# comprehension yields an empty list for a metabolite without reactions, whereas
# set.union(*[]) would raise a TypeError. IDs are sorted so that the output is
# identical from run to run.
gene_assoc = [
    sorted({g.id for r in met.reactions for g in r.genes})
    for met in model.metabolites
]

### Option 2: Excluding compartment
Metabolites that have the same name but different cellular locations (compartments) will be merged.

In [None]:
# ignore compartments: keep each metabolite name once (np.unique also sorts the names)
all_met_names = [met.name for met in model.metabolites]
metabolites = np.unique(all_met_names)

In [None]:
# retrieve gene associations for all metabolites
# Gene IDs are grouped by metabolite name in a single pass over the model, so
# metabolites sharing a name (i.e. differing only in compartment) are merged.
genes_by_name = {}
for met in model.metabolites:
    name_gene_ids = genes_by_name.setdefault(met.name, set())
    name_gene_ids.update(g.id for r in met.reactions for g in r.genes)

# One gene list per entry of `metabolites`, in the same order. A name with no
# associated reactions or genes yields an empty list instead of the TypeError
# that set.union(*[]) would raise. IDs are sorted so that the output is identical
# from run to run.
gene_assoc = [sorted(genes_by_name.get(met_name, ())) for met_name in metabolites]

### Process and write to file

In [None]:
# remove metabolites with no gene associations
# Both lists are filtered with the same mask so that names and gene lists stay aligned.
has_genes = [len(ids) > 0 for ids in gene_assoc]
metabolites = [name for pos, name in enumerate(metabolites) if has_genes[pos]]
gene_assoc = [ids for ids, keep in zip(gene_assoc, has_genes) if keep]

In [None]:
# some metabolite names contain an apostrophe, which can disrupt parsing by some
# packages, so every apostrophe is stripped from the names
cleaned_names = []
for met_name in metabolites:
    cleaned_names.append(met_name.replace("'", ""))
metabolites = cleaned_names

In [None]:
# write metabolite-gene associations to .gmt file
# Each line is tab-separated: gene set name, description, then the gene IDs.
# The description column is not used here, so it is filled with the placeholder 'na'.
merged_list = ['\t'.join([name, 'na'] + list(genes)) + '\n' for name, genes in zip(metabolites, gene_assoc)]
gmt_path = os.path.join('..', 'data', 'gene_set_collections', 'HumanGEM_metabolite_GSC.gmt')
# explicit encoding so the written file does not depend on the platform's default locale
with open(gmt_path, 'w', encoding='utf-8') as f:
    f.writelines(merged_list)