# Filter out pathways (gene sets) from [GSEA](https://www.gsea-msigdb.org/gsea/msigdb/human/collections.jsp) that we are not interested in

In [1]:
# Load packages
import numpy as np
from collections import OrderedDict
import json


In [2]:
# Import gene sets
path = '../../../data/raw/pathway_information/'
files = [
    'c2.cp.reactome.v2023.1.Hs.symbols.gmt',
    'c3.all.v2023.1.Hs.symbols.gmt',
    'c5.go.bp.v2023.1.Hs.symbols.gmt',
    'c7.all.v2023.1.Hs.symbols.gmt',
]

# Keep only gene sets whose size lies in [min_num_genes, max_num_genes],
# i.e. filter out gene sets with more than 300 genes.
min_num_genes = 0
max_num_genes = 300

# Maps pathway name -> list of genes, in the order the pathways are read.
# NOTE: a name that is accepted more than once (within or across files)
# keeps only the gene list of its last occurrence.
pathways_dict = OrderedDict()
for filename in files:
    file = path + filename
    counter = 0     # number of pathways accepted from this file
    num_genes = []  # gene count of every accepted pathway in this file
    with open(file, encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising all lines first.
        for line in f:
            line = line.strip()
            if not line:
                # A blank line has no pathway name; with min_num_genes = 0 it
                # would otherwise be stored as pathway '' with an empty gene list.
                continue
            val = line.split('\t')
            genes = val[2:]  # val[0] is the pathway name, val[2:] are its genes
            if min_num_genes <= len(genes) <= max_num_genes:
                pathways_dict[val[0]] = genes
                counter += 1
                num_genes.append(len(genes))

    # np.mean / np.std warn on an empty list; report nan directly in that case
    # (the printed text is the same, only the RuntimeWarning is avoided).
    mean_genes = np.mean(num_genes) if num_genes else float('nan')
    std_genes = np.std(num_genes) if num_genes else float('nan')
    print(f"{filename} resulted in {counter} pathways with a average number of {mean_genes:.2f} genes and std of {std_genes:.2f}")
    print()

print("Number of total pathways: ", len(pathways_dict))

c2.cp.reactome.v2023.1.Hs.symbols.gmt resulted in 1612 pathways with a average number of 41.94 genes and std of 49.63

c3.all.v2023.1.Hs.symbols.gmt resulted in 3041 pathways with a average number of 119.94 genes and std of 80.52

c5.go.bp.v2023.1.Hs.symbols.gmt resulted in 7263 pathways with a average number of 40.52 genes and std of 53.76

c7.all.v2023.1.Hs.symbols.gmt resulted in 5178 pathways with a average number of 185.78 genes and std of 39.76

Number of total pathways:  17094


In [3]:
# Persist the filtered pathway dictionary (pathway name -> gene list) as JSON
output_path = '../../../data/processed/pathway_information/all_pathways.json'
with open(output_path, 'w') as out_handle:
    json.dump(pathways_dict, out_handle)