In [1]:
import requests
import json
import re
from tqdm import tqdm
import os
import pandas as pd
import time
import math
from glob import glob
import uuid

import pyarrow.feather as feather
from tqdm import tqdm
from scipy.stats import rankdata, zscore
import csv

## Human

In [2]:
# Human gene_info source (NCBI FTP) and the local path where it is cached.
sn = "Homo_sapiens"
organism = "../Serialization/Mammalia/%s"%sn
# `organism` is a local path relative to this notebook; the remote file lives
# under GENE_INFO/Mammalia/, so the local "../Serialization/" prefix must be
# stripped before building the URL (same as the mouse cell does).
url = 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/{}.gene_info.gz'.format(organism.replace("../Serialization/", ""))
file = '{}.gene_info.tsv'.format(organism)


In [3]:
def fetch_save_read(url, file, reader=pd.read_csv, sep='\t', **kwargs):
  ''' Return the table cached at {file}, downloading it from {url} first if needed.

  When {file} does not exist yet, {url} is loaded with {reader} (called with
  `sep=sep, index_col=None`) and the result is written to {file} without its
  index; the parent directory of {file} is created when it has one.
  The cached copy is then always loaded with `pd.read_csv`, which is the only
  call that receives {**kwargs}.
  '''
  already_cached = os.path.exists(file)
  if not already_cached:
    parent_dir = os.path.dirname(file)
    if parent_dir:
      os.makedirs(parent_dir, exist_ok=True)
    # Download once and persist as a delimited text file for later runs.
    downloaded = reader(url, sep=sep, index_col=None)
    downloaded.to_csv(file, sep=sep, index=False)
  cached = pd.read_csv(file, sep=sep, **kwargs)
  return cached


In [4]:
# Load the gene_info table (from the local cache when present) and key it by
# GeneID so that rows can be looked up directly by gene id.
ncbi_gene = fetch_save_read(url, file).set_index("GeneID")
ncbi_gene.head()

Unnamed: 0_level_0,#tax_id,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,9606,A1BG,-,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410...,19,19q13.43,alpha-1-B glycoprotein,protein-coding,A1BG,alpha-1-B glycoprotein,O,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...,20220805,-
2,9606,A2M,-,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899...,12,12p13.31,alpha-2-macroglobulin,protein-coding,A2M,alpha-2-macroglobulin,O,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...,20221009,-
3,9606,A2MP1,-,A2MP,HGNC:HGNC:8|Ensembl:ENSG00000291190|AllianceGe...,12,12p13.31,alpha-2-macroglobulin pseudogene 1,pseudo,A2MP1,alpha-2-macroglobulin pseudogene 1,O,pregnancy-zone protein pseudogene,20221025,-
9,9606,NAT1,-,AAC1|MNAT|NAT-1|NATI,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171...,8,8p22,N-acetyltransferase 1,protein-coding,NAT1,N-acetyltransferase 1,O,arylamine N-acetyltransferase 1|N-acetyltransf...,20220925,-
10,9606,NAT2,-,AAC2|NAT-2|PNAT,MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156...,8,8p22,N-acetyltransferase 2,protein-coding,NAT2,N-acetyltransferase 2,O,arylamine N-acetyltransferase 2|N-acetyltransf...,20221009,-


In [5]:
# Translate the human PFOCR gene sets from NCBI GeneIDs to gene symbols using
# `ncbi_gene` (expected to hold the human gene_info table at this point), and
# keep only the gene sets that retain enough mapped genes.
MIN_GENES_PER_SET = 5  # gene sets with fewer mapped genes are dropped
missing = set()    # GeneIDs that are absent from the ncbi_gene index
removed = {}       # label -> number of mapped genes, for each dropped gene set
human_ids = set()  # labels of the gene sets that were kept
merged_gmt = {}    # label -> set of symbols for each kept gene set
# A fresh checkout may not have the output directory yet.
os.makedirs("output", exist_ok=True)
with open("data/PFOCR_Pathways_Human_2023.gmt") as f:
    # newline="" lets the csv module control line endings itself; without it,
    # platforms that translate "\n" on write end every row with "\r\r\n".
    with open("output/PFOCR_Pathways_Human_2023.gmt", "w", newline="") as o:
        csv_writer = csv.writer(o, delimiter="\t")
        for line in f:
            if not line.strip():
                # A blank line cannot be unpacked into label/description.
                continue
            label, description, *genes = line.strip().split("\t")
            row = [label, description]
            for i in genes:
                gene_id = int(i)
                if gene_id in ncbi_gene.index:
                    row.append(ncbi_gene.at[gene_id, "Symbol"])
                else:
                    missing.add(gene_id)
            n_genes = len(row) - 2  # row starts with label and description
            if n_genes >= MIN_GENES_PER_SET:
                human_ids.add(label)
                csv_writer.writerow(row)
                merged_gmt[label] = set(row[2:])
            else:
                removed[label] = n_genes
len(missing), len(removed)

(6, 13770)

## Mouse

In [6]:
# Mouse gene_info source (NCBI FTP) and the local path where it is cached.
sn = "Mus_musculus"
organism = f"../Serialization/Mammalia/{sn}"
# The remote path is the local one without the "../Serialization/" prefix.
url = f"ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/{organism.replace('../Serialization/', '')}.gene_info.gz"
file = f"{organism}.gene_info.tsv"


In [7]:
# Load the gene_info table (from the local cache when present) and key it by
# GeneID. NOTE: this rebinds `ncbi_gene`, replacing whatever table it held.
ncbi_gene = fetch_save_read(url, file).set_index("GeneID")
ncbi_gene.head()

Unnamed: 0_level_0,#tax_id,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
11287,10090,Pzp,-,A1m|A2m|MAM,MGI:MGI:87854|Ensembl:ENSMUSG00000030359|Allia...,6,6 F3|6 63.02 cM,"PZP, alpha-2-macroglobulin like",protein-coding,Pzp,"PZP, alpha-2-macroglobulin like",O,pregnancy zone protein|alpha 1 macroglobulin|a...,20230412,-
11298,10090,Aanat,-,AA-NAT|Nat-2|Nat4|Snat,MGI:MGI:1328365|Ensembl:ENSMUSG00000020804|All...,11,11 81.43 cM|11 E2,arylalkylamine N-acetyltransferase,protein-coding,Aanat,arylalkylamine N-acetyltransferase,O,serotonin N-acetyltransferase|aralkylamine N-a...,20230518,-
11302,10090,Aatk,-,AATYK|aatyk1|mKIAA0641,MGI:MGI:1197518|Ensembl:ENSMUSG00000025375|All...,11,11|11 E2,apoptosis-associated tyrosine kinase,protein-coding,Aatk,apoptosis-associated tyrosine kinase,O,serine/threonine-protein kinase LMTK1|apoptosi...,20230412,-
11303,10090,Abca1,-,ABC-1|Abc1,MGI:MGI:99607|Ensembl:ENSMUSG00000015243|Allia...,4,4 B2|4 28.57 cM,"ATP-binding cassette, sub-family A (ABC1), mem...",protein-coding,Abca1,"ATP-binding cassette, sub-family A (ABC1), mem...",O,phospholipid-transporting ATPase ABCA1|ATP-bin...,20230518,-
11304,10090,Abca4,-,Abc10|Abcr|D430003I15Rik|RmP,MGI:MGI:109424|Ensembl:ENSMUSG00000028125|Alli...,3,3 G1|3 52.94 cM,"ATP-binding cassette, sub-family A (ABC1), mem...",protein-coding,Abca4,"ATP-binding cassette, sub-family A (ABC1), mem...",O,retinal-specific phospholipid-transporting ATP...,20230412,-


In [8]:
# Translate the mouse PFOCR gene sets from NCBI GeneIDs to upper-cased gene
# symbols using `ncbi_gene` (expected to hold the mouse gene_info table at this
# point), keep only gene sets that retain enough mapped genes, and fold the
# kept sets into `merged_gmt`.
MIN_GENES_PER_SET = 5  # gene sets with fewer mapped genes are dropped
missing = set()    # GeneIDs that are absent from the ncbi_gene index
removed = {}       # label -> number of mapped genes, for each dropped gene set
mouse_ids = set()  # labels of the gene sets that were kept
# A fresh checkout may not have the output directory yet.
os.makedirs("output", exist_ok=True)
with open("data/PFOCR_Pathways_Mouse_2023.gmt") as f:
    # newline="" lets the csv module control line endings itself; without it,
    # platforms that translate "\n" on write end every row with "\r\r\n".
    with open("output/PFOCR_Pathways_Mouse_2023.gmt", "w", newline="") as o:
        csv_writer = csv.writer(o, delimiter="\t")
        for line in f:
            if not line.strip():
                # A blank line cannot be unpacked into label/description.
                continue
            label, description, *genes = line.strip().split("\t")
            row = [label, description]
            for i in genes:
                gene_id = int(i)
                if gene_id in ncbi_gene.index:
                    gene = ncbi_gene.at[gene_id, "Symbol"]
                    # Upper-cased before being merged with the symbols already
                    # stored in merged_gmt.
                    row.append(gene.upper())
                else:
                    missing.add(gene_id)
            n_genes = len(row) - 2  # row starts with label and description
            if n_genes >= MIN_GENES_PER_SET:
                mouse_ids.add(label)
                csv_writer.writerow(row)
                # Union with any symbols already recorded under this label.
                merged_gmt[label] = merged_gmt.get(label, set()) | set(row[2:])
            else:
                removed[label] = n_genes
len(missing), len(removed)

(21, 13468)

In [9]:
# Number of kept gene sets: human, mouse, and distinct labels after merging.
tuple(map(len, (human_ids, mouse_ids, merged_gmt)))

(21694, 21359, 21845)

In [10]:
# Write the merged gene sets as a GMT file: label, empty description, then the
# gene symbols.
os.makedirs("output", exist_ok=True)  # a fresh checkout may lack this directory
# newline="" lets the csv module control line endings itself; without it,
# platforms that translate "\n" on write end every row with "\r\r\n".
with open("output/PFOCR_Pathways_2023.gmt", "w", newline="") as o:
    csv_writer = csv.writer(o, delimiter="\t")
    for k, v in merged_gmt.items():
        # Set iteration order for strings can differ between interpreter
        # sessions; sorting makes the output file reproducible. key=str keeps
        # the sort well-defined even if a non-string value slipped into a set.
        row = [k, ''] + sorted(v, key=str)
        csv_writer.writerow(row)