# Serializing Reference Gene Sets to JSON

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

In [2]:
def serialize(df, name, how='columns'):
    """Write a pandas object to ``docs/data/{name}.js`` as a JavaScript variable.

    The file contains a single statement, ``var {name} = <json>;`` followed by
    a newline, where ``<json>`` is the text returned by ``df.to_json(orient=how)``.

    Parameters
    ----------
    df : object exposing a pandas-style ``to_json(orient=...)`` method
        Data to serialize.
    name : str
        Used both as the output file stem and as the JavaScript variable name.
    how : str, default 'columns'
        Forwarded unchanged as ``orient`` to ``df.to_json``.

    Returns
    -------
    int
        Number of characters written, as returned by the file's ``write``.
    """
    # Build the text before opening the file so that a failing `to_json` call
    # cannot leave a truncated output file behind.
    payload = f"var {name} = {df.to_json(orient=how)};\n"
    # A context manager closes (and flushes) the handle deterministically
    # instead of relying on garbage collection of an anonymous file object.
    with open(f"docs/data/{name}.js", 'w') as js_file:
        return js_file.write(payload)

## I. Serialize Gene Ontology to JSON

In [3]:
# Load the biological-process GO table. As the preview below shows, the `genes`
# column arrives as the string repr of a Python set ("set()" or "{'LONP1', ...}").
go = pd.read_csv('../dingo/GO/biological_process.csv')
go.head()

Unnamed: 0,namespace,GO_ID,name,depth,genes
0,biological_process,GO:0000001,mitochondrion inheritance,5,set()
1,biological_process,GO:0000002,mitochondrial genome maintenance,5,"{'LONP1', 'PIF1', 'TWNK', 'TEFM', 'SLC25A33', ..."
2,biological_process,GO:0000003,reproduction,1,"{'GPX4', 'CRHBP', 'ANTXR1', 'SPIN2A', 'HSD17B4..."
3,biological_process,GO:0000011,vacuole inheritance,5,set()
4,biological_process,GO:0000012,single strand break repair,6,"{'APTX', 'TNP1', 'SIRT1', 'XRCC1', 'LIG4', 'TD..."


In [4]:
# Keep only the terms that have at least one gene; an empty gene set is stored
# as the literal string 'set()'.
go = go.loc[go['genes'] != 'set()']
go.head()

Unnamed: 0,namespace,GO_ID,name,depth,genes
1,biological_process,GO:0000002,mitochondrial genome maintenance,5,"{'LONP1', 'PIF1', 'TWNK', 'TEFM', 'SLC25A33', ..."
2,biological_process,GO:0000003,reproduction,1,"{'GPX4', 'CRHBP', 'ANTXR1', 'SPIN2A', 'HSD17B4..."
4,biological_process,GO:0000012,single strand break repair,6,"{'APTX', 'TNP1', 'SIRT1', 'XRCC1', 'LIG4', 'TD..."
6,biological_process,GO:0000018,regulation of DNA recombination,6,"{'FBH1', 'APLF', 'TBX21', 'HIST1H1C', 'SMARCAD..."
7,biological_process,GO:0000019,regulation of mitotic recombination,7,"{'MLH1', 'RAD50', 'ERCC3', 'BLM', 'TERF2', 'ZS..."


In [5]:
# Turn each stringified set (e.g. "{'LONP1', 'PIF1'}") into a list of symbols.
# `assign` returns a new frame, so nothing is written into the filtered slice
# produced by the previous cell (attribute-style assignment on that slice
# triggers pandas' SettingWithCopyWarning).
# NOTE(review): `eval` executes whatever text is stored in the CSV. Only run
# this on a trusted file, or switch to `ast.literal_eval` once `ast` is imported.
go = go.assign(genes=go['genes'].apply(lambda set_repr: list(eval(set_repr))))
go.head()

Unnamed: 0,namespace,GO_ID,name,depth,genes
1,biological_process,GO:0000002,mitochondrial genome maintenance,5,"[DNA2, LONP1, POLG2, SESN2, DNAJA3, OPA1, SLC2..."
2,biological_process,GO:0000003,reproduction,1,"[PRM2, PIWIL1, HSF2, TFCP2L1, HOXD9, CAD, TSPY..."
4,biological_process,GO:0000012,single strand break repair,6,"[LOC100133315, APTX, APLF, XRCC1, LIG4, TDP1, ..."
6,biological_process,GO:0000018,regulation of DNA recombination,6,"[TBX21, TNFSF4, FBH1, ATAD5, ERCC2, BCL6, NSD2..."
7,biological_process,GO:0000019,regulation of mitotic recombination,7,"[ZSCAN4, MLH1, BLM, ERCC3, TERF2, ERCC2, MRE11..."


In [6]:
# Label every row as "<name> (<GO_ID>)" and use that label as the index.
go.index = (
    go['name'].map(str)
    + (' (' + go['GO_ID'] + ')')
)
go.head()

Unnamed: 0,namespace,GO_ID,name,depth,genes
mitochondrial genome maintenance (GO:0000002),biological_process,GO:0000002,mitochondrial genome maintenance,5,"[DNA2, LONP1, POLG2, SESN2, DNAJA3, OPA1, SLC2..."
reproduction (GO:0000003),biological_process,GO:0000003,reproduction,1,"[PRM2, PIWIL1, HSF2, TFCP2L1, HOXD9, CAD, TSPY..."
single strand break repair (GO:0000012),biological_process,GO:0000012,single strand break repair,6,"[LOC100133315, APTX, APLF, XRCC1, LIG4, TDP1, ..."
regulation of DNA recombination (GO:0000018),biological_process,GO:0000018,regulation of DNA recombination,6,"[TBX21, TNFSF4, FBH1, ATAD5, ERCC2, BCL6, NSD2..."
regulation of mitotic recombination (GO:0000019),biological_process,GO:0000019,regulation of mitotic recombination,7,"[ZSCAN4, MLH1, BLM, ERCC3, TERF2, ERCC2, MRE11..."


In [7]:
# Drop every column except the gene lists; the result is still a DataFrame.
go = go[['genes']]
go.head()

Unnamed: 0,genes
mitochondrial genome maintenance (GO:0000002),"[DNA2, LONP1, POLG2, SESN2, DNAJA3, OPA1, SLC2..."
reproduction (GO:0000003),"[PRM2, PIWIL1, HSF2, TFCP2L1, HOXD9, CAD, TSPY..."
single strand break repair (GO:0000012),"[LOC100133315, APTX, APLF, XRCC1, LIG4, TDP1, ..."
regulation of DNA recombination (GO:0000018),"[TBX21, TNFSF4, FBH1, ATAD5, ERCC2, BCL6, NSD2..."
regulation of mitotic recombination (GO:0000019),"[ZSCAN4, MLH1, BLM, ERCC3, TERF2, ERCC2, MRE11..."


In [8]:
# Write the GO gene sets to docs/data/go_gene_sets.js.
serialize(df=go['genes'], name='go_gene_sets')

## II. Serialize MSigDB Pathways to JSON

In [9]:
# The preview below shows each line as "<set name>\t<URL>\t..." (tab-delimited),
# so read the file line by line rather than through the CSV parser:
# `pd.read_csv` with its default ',' separator only yields a single column as
# long as no line happens to contain a comma or a quote character.
with open('./c2.cp.v6.1.symbols.gmt', encoding='utf-8') as gmt_file:
    gmt_lines = [line.rstrip('\n') for line in gmt_file if line.strip()]
# One row per non-blank line; `genes` holds the raw line until the following
# cells split it into a set name and a gene list.
msigdb = pd.DataFrame({'genes': gmt_lines})
msigdb.head()

Unnamed: 0,genes
0,KEGG_GLYCOLYSIS_GLUCONEOGENESIS\thttp://www.br...
1,KEGG_CITRATE_CYCLE_TCA_CYCLE\thttp://www.broad...
2,KEGG_PENTOSE_PHOSPHATE_PATHWAY\thttp://www.bro...
3,KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS\...
4,KEGG_FRUCTOSE_AND_MANNOSE_METABOLISM\thttp://w...


In [10]:
# The gene-set name is everything before the first tab of each line.
# `n` is passed by keyword: since pandas 2.0 every argument of `str.split`
# after `pat` is keyword-only, so `split('\t', 1)` raises TypeError there.
msigdb.index = msigdb['genes'].str.split('\t', n=1).str[0]
msigdb.head()

Unnamed: 0_level_0,genes
genes,Unnamed: 1_level_1
KEGG_GLYCOLYSIS_GLUCONEOGENESIS,KEGG_GLYCOLYSIS_GLUCONEOGENESIS\thttp://www.br...
KEGG_CITRATE_CYCLE_TCA_CYCLE,KEGG_CITRATE_CYCLE_TCA_CYCLE\thttp://www.broad...
KEGG_PENTOSE_PHOSPHATE_PATHWAY,KEGG_PENTOSE_PHOSPHATE_PATHWAY\thttp://www.bro...
KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS,KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS\...
KEGG_FRUCTOSE_AND_MANNOSE_METABOLISM,KEGG_FRUCTOSE_AND_MANNOSE_METABOLISM\thttp://w...


In [11]:
# Remove the index name ('genes') inherited from the column the index was built from.
msigdb = msigdb.rename_axis(None)
# Strip the first two tab-separated fields (the set name, then the URL shown in
# the earlier preview) and split what remains into a list of gene symbols.
# `n` is passed by keyword: since pandas 2.0 every argument of `str.split`
# after `pat` is keyword-only, so `split('\t', 1)` raises TypeError there.
# NOTE: `genes` is overwritten in place, so re-running this cell without first
# re-running the load cell will not reproduce this result.
msigdb['genes'] = msigdb['genes'].str.split('\t', n=1).str[1]
msigdb['genes'] = msigdb['genes'].str.split('\t', n=1).str[1].str.split('\t')
msigdb.head()

Unnamed: 0,genes
KEGG_GLYCOLYSIS_GLUCONEOGENESIS,"[ACSS2, GCK, PGK2, PGK1, PDHB, PDHA1, PDHA2, P..."
KEGG_CITRATE_CYCLE_TCA_CYCLE,"[IDH3B, DLST, PCK2, CS, PDHB, PCK1, PDHA1, LOC..."
KEGG_PENTOSE_PHOSPHATE_PATHWAY,"[RPE, RPIA, PGM2, PGLS, PRPS2, FBP2, PFKM, PFK..."
KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS,"[UGT1A10, UGT1A8, RPE, UGT1A7, UGT1A6, UGT2B28..."
KEGG_FRUCTOSE_AND_MANNOSE_METABOLISM,"[MPI, PMM2, PMM1, FBP2, PFKM, GMDS, PFKFB4, PF..."


In [12]:
# Rewrite "SOURCE_SOME_SET_NAME" as "Some Set Name (SOURCE)": split once on the
# first underscore, replace the remaining underscores with spaces, title-case
# the result, and append the prefix in parentheses.
# `n` is passed by keyword: since pandas 2.0 every argument of `str.split`
# after `pat` is keyword-only, so `split('_', 1)` raises TypeError there.
# NOTE: title-casing also lowercases acronyms (e.g. "TCA" becomes "Tca").
msigdb.index = (
    msigdb.index.str.split('_', n=1).str[1].str.replace('_', ' ').str.title().map(str)
    + ' (' + msigdb.index.str.split('_', n=1).str[0] + ')'
)
msigdb.head()

Unnamed: 0,genes
Glycolysis Gluconeogenesis (KEGG),"[ACSS2, GCK, PGK2, PGK1, PDHB, PDHA1, PDHA2, P..."
Citrate Cycle Tca Cycle (KEGG),"[IDH3B, DLST, PCK2, CS, PDHB, PCK1, PDHA1, LOC..."
Pentose Phosphate Pathway (KEGG),"[RPE, RPIA, PGM2, PGLS, PRPS2, FBP2, PFKM, PFK..."
Pentose And Glucuronate Interconversions (KEGG),"[UGT1A10, UGT1A8, RPE, UGT1A7, UGT1A6, UGT2B28..."
Fructose And Mannose Metabolism (KEGG),"[MPI, PMM2, PMM1, FBP2, PFKM, GMDS, PFKFB4, PF..."


In [13]:
# Write the MSigDB gene sets to docs/data/msigdb_gene_sets.js.
serialize(df=msigdb['genes'], name='msigdb_gene_sets')

In [14]:
# Stack the GO and MSigDB tables and write them out as one combined file,
# docs/data/human_gene_sets.js.
# `go` must still hold the human GO frame at this point: the mouse section
# further down rebinds the same name.
serialize(df=pd.concat([go, msigdb])['genes'], name='human_gene_sets')

## III. Serialize Mouse Gene Ontology to JSON

In [20]:
# Load the mouse biological-process GO table. This rebinds `go`, so the human
# GO frame built in section I is no longer reachable once this cell has run.
go = pd.read_csv('../dingo/GO/mouse_biological_process.csv')
go.head()

Unnamed: 0,namespace,GO_ID,name,depth,genes
0,biological_process,GO:0000001,mitochondrion inheritance,5,set()
1,biological_process,GO:0000002,mitochondrial genome maintenance,5,"{'Gimap3', 'Slc25a33', 'Rrm2b', 'Lig3', 'Mpv17..."
2,biological_process,GO:0000003,reproduction,1,"{'Ccnf', 'Syt8', 'Taf4', 'Gm960', 'Spata22', '..."
3,biological_process,GO:0000011,vacuole inheritance,5,{'Rbsn'}
4,biological_process,GO:0000012,single strand break repair,6,"{'Aplf', 'Lig4', 'Tdp1', 'Smc2', 'Sirt1', 'Xrc..."


In [21]:
# Keep only the terms that have at least one gene; an empty gene set is stored
# as the literal string 'set()'.
go = go.loc[go['genes'] != 'set()']
go.head()

Unnamed: 0,namespace,GO_ID,name,depth,genes
1,biological_process,GO:0000002,mitochondrial genome maintenance,5,"{'Gimap3', 'Slc25a33', 'Rrm2b', 'Lig3', 'Mpv17..."
2,biological_process,GO:0000003,reproduction,1,"{'Ccnf', 'Syt8', 'Taf4', 'Gm960', 'Spata22', '..."
3,biological_process,GO:0000011,vacuole inheritance,5,{'Rbsn'}
4,biological_process,GO:0000012,single strand break repair,6,"{'Aplf', 'Lig4', 'Tdp1', 'Smc2', 'Sirt1', 'Xrc..."
6,biological_process,GO:0000018,regulation of DNA recombination,6,"{'Il4', 'Stat6', 'Helb', 'Ptprc', 'Tbx21', 'Pm..."


In [22]:
# Turn each stringified set (e.g. "{'Rbsn'}") into a list of symbols.
# `assign` returns a new frame, so nothing is written into the filtered slice
# produced by the previous cell (attribute-style assignment on that slice
# triggers pandas' SettingWithCopyWarning).
# NOTE(review): `eval` executes whatever text is stored in the CSV. Only run
# this on a trusted file, or switch to `ast.literal_eval` once `ast` is imported.
go = go.assign(genes=go['genes'].apply(lambda set_repr: list(eval(set_repr))))
go.head()

Unnamed: 0,namespace,GO_ID,name,depth,genes
1,biological_process,GO:0000002,mitochondrial genome maintenance,5,"[Slc25a33, Parp1, Dnaja3, Tk2, Lig3, Pif1, Mrp..."
2,biological_process,GO:0000003,reproduction,1,"[Hoxa9, Meig1, Bik, Mir742, Ccnyl1, Fignl1, Ts..."
3,biological_process,GO:0000011,vacuole inheritance,5,[Rbsn]
4,biological_process,GO:0000012,single strand break repair,6,"[Trpc2, Lig4, Smc2, Tdp1, Aplf, Xrcc1, Sirt1, ..."
6,biological_process,GO:0000018,regulation of DNA recombination,6,"[Ung, Rad50, Atad5, Lig3, Ube2b, Fignl1, Nsd2,..."


In [23]:
# Label every row as "<name> (<GO_ID>)" and use that label as the index.
go.index = (
    go['name'].map(str)
    + (' (' + go['GO_ID'] + ')')
)
go.head()

Unnamed: 0,namespace,GO_ID,name,depth,genes
mitochondrial genome maintenance (GO:0000002),biological_process,GO:0000002,mitochondrial genome maintenance,5,"[Slc25a33, Parp1, Dnaja3, Tk2, Lig3, Pif1, Mrp..."
reproduction (GO:0000003),biological_process,GO:0000003,reproduction,1,"[Hoxa9, Meig1, Bik, Mir742, Ccnyl1, Fignl1, Ts..."
vacuole inheritance (GO:0000011),biological_process,GO:0000011,vacuole inheritance,5,[Rbsn]
single strand break repair (GO:0000012),biological_process,GO:0000012,single strand break repair,6,"[Trpc2, Lig4, Smc2, Tdp1, Aplf, Xrcc1, Sirt1, ..."
regulation of DNA recombination (GO:0000018),biological_process,GO:0000018,regulation of DNA recombination,6,"[Ung, Rad50, Atad5, Lig3, Ube2b, Fignl1, Nsd2,..."


In [24]:
# Drop every column except the gene lists; the result is still a DataFrame.
go = go[['genes']]
go.head()

Unnamed: 0,genes
mitochondrial genome maintenance (GO:0000002),"[Slc25a33, Parp1, Dnaja3, Tk2, Lig3, Pif1, Mrp..."
reproduction (GO:0000003),"[Hoxa9, Meig1, Bik, Mir742, Ccnyl1, Fignl1, Ts..."
vacuole inheritance (GO:0000011),[Rbsn]
single strand break repair (GO:0000012),"[Trpc2, Lig4, Smc2, Tdp1, Aplf, Xrcc1, Sirt1, ..."
regulation of DNA recombination (GO:0000018),"[Ung, Rad50, Atad5, Lig3, Ube2b, Fignl1, Nsd2,..."


In [25]:
# Write the mouse GO gene sets to docs/data/mouse_gene_sets.js.
serialize(df=go['genes'], name='mouse_gene_sets')