In [23]:
import numpy as np
import pandas as pd
import models.cv
from collections import defaultdict
import os

# Analyze sGO prevalence

In [13]:
# Load the gene-by-term sGO membership matrix (cast to bool so a row can be
# used as a mask) together with the label of each term column.
d = np.load("../generated-data/features/ppc_yeast_common_sgo.npz")
F = d['F'].astype(bool)
terms = d['feature_labels']

# Load the SMF task table: `bin` is the class index of each gene and `id`
# is used below as the gene's row index into F.
task_df = pd.read_csv("../generated-data/task_yeast_smf_30")
bins = np.array(task_df['bin'])
ids = task_df['id']

# term -> [count in bin 0, count in bin 1, count in bin 2]
counts_by_term = defaultdict(lambda: [0,0,0])

for gene_id, gene_bin in zip(ids, bins):
    # Masking the label array with the gene's row yields its annotated terms.
    # Genes without any annotated term add nothing to the tally.
    for gt in terms[F[gene_id, :]]:
        counts_by_term[gt][int(gene_bin)] += 1


def _term_row(term, tally):
    """Turn one term's per-bin tally into a summary record.

    Bins 0, 1 and 2 are reported as lethal, sick and healthy respectively,
    both as raw counts and as fractions of the term's total count; `diff`
    is the spread between the largest and smallest fraction.
    """
    cnts = np.array(tally)
    total = np.sum(cnts)
    normed_cnts = cnts / total
    return {
        "term" : term,
        "lethal" : cnts[0],
        "sick" : cnts[1],
        "healthy" : cnts[2],
        "lethal_p" : normed_cnts[0],
        "sick_p" : normed_cnts[1],
        "healthy_p" : normed_cnts[2],
        "total" : total,
        "diff" : (normed_cnts.max() - normed_cnts.min())
    }


# One record per term, in the order the terms were first encountered.
rows = [_term_row(term, tally) for term, tally in counts_by_term.items()]

df = pd.DataFrame(rows).set_index('term')

Unnamed: 0_level_0,lethal,sick,healthy,lethal_p,sick_p,healthy_p,total,diff
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GO:0016798,0,1,33,0.0,0.029412,0.970588,34,0.970588
GO:0016791,5,13,60,0.064103,0.166667,0.769231,78,0.705128
GO:0005773,9,66,217,0.030822,0.226027,0.743151,292,0.712329
GO:0005886,30,43,198,0.110701,0.158672,0.730627,271,0.619926
GO:0005975,15,32,122,0.088757,0.189349,0.721893,169,0.633136
GO:0016491,19,44,155,0.087156,0.201835,0.711009,218,0.623853
GO:0022857,11,49,141,0.054726,0.243781,0.701493,201,0.646766
GO:0005783,92,60,292,0.207207,0.135135,0.657658,444,0.522523
GO:0016757,16,8,44,0.235294,0.117647,0.647059,68,0.529412
GO:0008168,10,20,52,0.121951,0.243902,0.634146,82,0.512195


In [35]:

# Hide all CUDA devices from this process.
# NOTE(review): this runs after `models.cv` has already been imported;
# presumably the backend reads the variable lazily -- confirm it takes effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Fixed seed so the random-order baseline is reproducible across
# Restart Kernel -> Run All.
RANDOM_ORDER_SEED = 0

# Candidate elimination orders for the GO terms:
#   - most prevalent in the healthy class first
#   - most prevalent in the lethal class first
#   - a random sample of the terms
df_healthy = df.sort_values('healthy_p', ascending=False)
df_lethal = df.sort_values('lethal_p', ascending=False)
# NOTE(review): frac=0.1 keeps only 10% of the terms instead of shuffling all
# of them (frac=1.0) -- confirm this is the intended random baseline.
df_random = df.sample(frac=0.1, random_state=RANDOM_ORDER_SEED)

# Specify the target ordering and the matching output location
# (%d is filled with the number of eliminated terms).
target_df = df_healthy
output_path = "../results/yeast_smf_go_exploration/mn_healthy_first_%d"

# Fractions of the ordered terms to eliminate, one cross-validation run each.
top_p = [0.2, 0.4, 0.6, 0.8, 1.0]


def _build_spec(go_terms):
    """Return a fresh feature spec for one cross-validation run.

    The topology and redundancy feature sets are always included. The GO
    feature set is included only when `go_terms` is non-empty, so a run in
    which every term has been eliminated carries no GO entry at all.
    """
    spec = [
        {
            "name" : "topology",
            "path" : "../generated-data/features/ppc_yeast_topology.npz",
            "selected_features" : ["lid"]
        },
        {
            "name" : "redundancy",
            "selected_features" : ["pident"],
            "path" : "../generated-data/features/ppc_yeast_redundancy.npz"
        },
    ]
    if go_terms:
        spec.append({
            "name" : "go",
            "path" : "../generated-data/features/ppc_yeast_common_sgo.npz",
            "selected_features" : go_terms
        })
    return spec


print(output_path)

n_terms = target_df.shape[0]
for p in top_p:
    # Number of leading terms (in target order) removed for this run.
    n = int(p * n_terms)
    # Use a dedicated name: rebinding `terms` would clobber the feature-label
    # array loaded by the earlier analysis cell.
    kept_terms = list(target_df.index[n:])
    print("Eliminating top %d" % n)

    models.cv.main("models.smf_ordinal",
                   "cfgs/models/yeast_smf_orm.json",
                   output_path % n,
                   type="mn",
                   spec=_build_spec(kept_terms),
                   num_processes=20)

../results/yeast_smf_go_exploration/mn_healthy_first_%d
Eliminating top 9
{'spec': [{'name': 'topology', 'path': '../generated-data/features/ppc_yeast_topology.npz', 'selected_features': ['lid']}, {'name': 'redundancy', 'selected_features': ['pident'], 'path': '../generated-data/features/ppc_yeast_redundancy.npz'}, {'name': 'go', 'path': '../generated-data/features/ppc_yeast_common_sgo.npz', 'selected_features': ['GO:0008168', 'GO:0008233', 'GO:0016829', 'GO:0016301', 'GO:0000902', 'GO:0006457', 'GO:0006520', 'GO:0006091', 'GO:0005777', 'GO:0004518', 'GO:0006629', 'GO:0005794', 'GO:0030234', 'GO:0005739', 'GO:0003677', 'GO:0051186', 'GO:0008289', 'GO:0005856', 'GO:0016853', 'GO:0005634', 'GO:0042393', 'GO:0019899', 'GO:0051604', 'GO:0008092', 'GO:0005694', 'GO:0051082', 'GO:0006605', 'GO:0016887', 'GO:0016874', 'GO:0007005', 'GO:0003924', 'GO:0003723', 'GO:0008134', 'GO:0016779', 'GO:0005730', 'GO:0003735']}], 'scramble': False, 'optimizer': 'nadam', 'epochs': 10000, 'verbose': True, '