# Serializing Reference Gene Sets to JSON

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

## I. Serialize Gene Ontology to JSON

In [2]:
go = pd.read_csv('../GONN/GO/biological_process.csv')
go.head()

Unnamed: 0,GeneSymbol,GO_ID,GO_term,Evidence
0,A1BG,GO:0002576,platelet degranulation,TAS
1,A1BG,GO:0008150,biological_process,ND
2,A1BG,GO:0043312,neutrophil degranulation,TAS
3,A2M,GO:0001869,"negative regulation of complement activation, ...",IDA
4,A2M,GO:0002576,platelet degranulation,TAS


In [3]:
go = (go.groupby(['GO_term', 'GO_ID'])['GeneSymbol'].apply(list)).to_frame()
go.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,GeneSymbol
GO_term,GO_ID,Unnamed: 2_level_1
'de novo' AMP biosynthetic process,GO:0044208,"[ADSL, ADSL, ADSS, ADSS, ADSSL1, ADSSL1]"
'de novo' CTP biosynthetic process,GO:0044210,"[CTPS1, CTPS2]"
'de novo' GDP-L-fucose biosynthetic process,GO:0042351,"[GMDS, GMDS, GMDS, TSTA3, TSTA3, TSTA3]"
'de novo' IMP biosynthetic process,GO:0006189,"[ADSL, ATIC, GART, PFAS, PFAS, PPAT, PAICS]"
'de novo' L-methionine biosynthetic process,GO:0071266,[CTH]


In [4]:
go.index = go.reset_index()['GO_term'].map(str) + ' (' + go.reset_index()['GO_ID'] + ')'
go.head()

Unnamed: 0,GeneSymbol
'de novo' AMP biosynthetic process (GO:0044208),"[ADSL, ADSL, ADSS, ADSS, ADSSL1, ADSSL1]"
'de novo' CTP biosynthetic process (GO:0044210),"[CTPS1, CTPS2]"
'de novo' GDP-L-fucose biosynthetic process (GO:0042351),"[GMDS, GMDS, GMDS, TSTA3, TSTA3, TSTA3]"
'de novo' IMP biosynthetic process (GO:0006189),"[ADSL, ATIC, GART, PFAS, PFAS, PPAT, PAICS]"
'de novo' L-methionine biosynthetic process (GO:0071266),[CTH]


In [5]:
go.to_json('/Users/alex/Documents/abcd1/json/go_biological_process_genes.json')

## II. Serialize MSigDB Pathways to JSON

In [5]:
msigdb = pd.read_csv('./c2.cp.v6.1.symbols.gmt', header=None).rename(columns={0: 'GeneSymbol'})
msigdb.head()

Unnamed: 0,GeneSymbol
0,KEGG_GLYCOLYSIS_GLUCONEOGENESIS\thttp://www.br...
1,KEGG_CITRATE_CYCLE_TCA_CYCLE\thttp://www.broad...
2,KEGG_PENTOSE_PHOSPHATE_PATHWAY\thttp://www.bro...
3,KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS\...
4,KEGG_FRUCTOSE_AND_MANNOSE_METABOLISM\thttp://w...


In [6]:
msigdb.index = msigdb['GeneSymbol'].str.split('\t', 1).str[0]
msigdb.head()

Unnamed: 0_level_0,GeneSymbol
GeneSymbol,Unnamed: 1_level_1
KEGG_GLYCOLYSIS_GLUCONEOGENESIS,KEGG_GLYCOLYSIS_GLUCONEOGENESIS\thttp://www.br...
KEGG_CITRATE_CYCLE_TCA_CYCLE,KEGG_CITRATE_CYCLE_TCA_CYCLE\thttp://www.broad...
KEGG_PENTOSE_PHOSPHATE_PATHWAY,KEGG_PENTOSE_PHOSPHATE_PATHWAY\thttp://www.bro...
KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS,KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS\...
KEGG_FRUCTOSE_AND_MANNOSE_METABOLISM,KEGG_FRUCTOSE_AND_MANNOSE_METABOLISM\thttp://w...


In [7]:
msigdb = msigdb.rename_axis(None)
msigdb['GeneSymbol'] = msigdb['GeneSymbol'].str.split('\t', 1).str[1]
msigdb['GeneSymbol'] = msigdb['GeneSymbol'].str.split('\t', 1).str[1].str.split('\t')
msigdb.head()

Unnamed: 0,GeneSymbol
KEGG_GLYCOLYSIS_GLUCONEOGENESIS,"[ACSS2, GCK, PGK2, PGK1, PDHB, PDHA1, PDHA2, P..."
KEGG_CITRATE_CYCLE_TCA_CYCLE,"[IDH3B, DLST, PCK2, CS, PDHB, PCK1, PDHA1, LOC..."
KEGG_PENTOSE_PHOSPHATE_PATHWAY,"[RPE, RPIA, PGM2, PGLS, PRPS2, FBP2, PFKM, PFK..."
KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS,"[UGT1A10, UGT1A8, RPE, UGT1A7, UGT1A6, UGT2B28..."
KEGG_FRUCTOSE_AND_MANNOSE_METABOLISM,"[MPI, PMM2, PMM1, FBP2, PFKM, GMDS, PFKFB4, PF..."


In [8]:
msigdb.index = msigdb.index.str.split('_', 1).str[1].str.replace('_', ' ').str.title().map(str) + ' (' + msigdb.index.str.split('_', 1).str[0] + ')'
msigdb.head()

Unnamed: 0,GeneSymbol
Glycolysis Gluconeogenesis (KEGG),"[ACSS2, GCK, PGK2, PGK1, PDHB, PDHA1, PDHA2, P..."
Citrate Cycle Tca Cycle (KEGG),"[IDH3B, DLST, PCK2, CS, PDHB, PCK1, PDHA1, LOC..."
Pentose Phosphate Pathway (KEGG),"[RPE, RPIA, PGM2, PGLS, PRPS2, FBP2, PFKM, PFK..."
Pentose And Glucuronate Interconversions (KEGG),"[UGT1A10, UGT1A8, RPE, UGT1A7, UGT1A6, UGT2B28..."
Fructose And Mannose Metabolism (KEGG),"[MPI, PMM2, PMM1, FBP2, PFKM, GMDS, PFKFB4, PF..."


In [10]:
msigdb['GeneSymbol'].to_json('/Users/alex/Documents/abcd1/json/msigdb_pathway_genes.json')

In [10]:
pd.concat([go, msigdb])['GeneSymbol'].to_pickle('./human_gene_sets.pickle')

In [11]:
pd.concat([go, msigdb])['GeneSymbol'].to_json('/Users/alex/Documents/abcd1/json/all_gene_sets.json')

## III. Serialize Mouse Gene Ontology to JSON

In [15]:
go = pd.read_csv('./mouse_go/mouse2go.csv')
go.head()

Unnamed: 0,tax_id,GeneSymbol,GO_ID,Evidence,GO_term,Category
0,10090,0610005C13Rik,GO:0003674,ND,molecular_function,Function
1,10090,0610005C13Rik,GO:0005575,ND,cellular_component,Component
2,10090,0610005C13Rik,GO:0008150,ND,biological_process,Process
3,10090,0610006L08Rik,GO:0003674,ND,molecular_function,Function
4,10090,0610006L08Rik,GO:0005575,ND,cellular_component,Component


In [16]:
go = go[go.Category == 'Process']

In [17]:
go = (go.groupby(['GO_term', 'GO_ID'])['GeneSymbol'].apply(list)).to_frame()
go.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,GeneSymbol
GO_term,GO_ID,Unnamed: 2_level_1
'de novo' AMP biosynthetic process,GO:0044208,"[Adsl, Adss, Adssl1]"
'de novo' GDP-L-fucose biosynthetic process,GO:0042351,[Tsta3]
'de novo' IMP biosynthetic process,GO:0006189,"[Atic, Gart, Paics, Pfas]"
'de novo' L-methionine biosynthetic process,GO:0071266,[Cth]
'de novo' NAD biosynthetic process from aspartate,GO:0034628,"[Nmnat1, Nmnat2, Nmnat3]"


In [18]:
go.index = go.reset_index()['GO_term'].map(str) + ' (' + go.reset_index()['GO_ID'] + ')'
go.head()

Unnamed: 0,GeneSymbol
'de novo' AMP biosynthetic process (GO:0044208),"[Adsl, Adss, Adssl1]"
'de novo' GDP-L-fucose biosynthetic process (GO:0042351),[Tsta3]
'de novo' IMP biosynthetic process (GO:0006189),"[Atic, Gart, Paics, Pfas]"
'de novo' L-methionine biosynthetic process (GO:0071266),[Cth]
'de novo' NAD biosynthetic process from aspartate (GO:0034628),"[Nmnat1, Nmnat2, Nmnat3]"


In [19]:
go.to_json('/Users/alex/Documents/abcd1/json/mouse_biological_process_genes.json')