In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import obonet
import BOCC
import pandas as pd
import numpy as np
import math
from webweb import Web
import os
import seaborn as sns
from BOCC import BOCC, load_clusters
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
import typing
import matplotlib.patches as mpatches
from upsetplot import from_memberships
from upsetplot import plot
import random
import pickle


# Font size passed as `fontsize=` to the axis labels of the scatter/density figure below.
std_fontsize = 12

In [None]:
# Read the four yearly String/HPO phenotypic-branch edge lists (2019-2022),
# one networkx graph per file, in chronological order.
G19, G20, G21, G22 = (
    nx.read_edgelist(edgelist_path)
    for edgelist_path in (
        '../Edgelists/String_HPO_2019.phenotypic_branch.edgelist.txt',
        '../Edgelists/String_HPO_2020.phenotypic_branch.edgelist.txt',
        '../Edgelists/String_HPO_2021.phenotypic_branch.edgelist.txt',
        '../Edgelists/String_HPO_2022.phenotypic_branch.edgelist.txt',
    )
)

def remove_trivial_coms(coms):
    """Return the communities from ``coms`` that are not trivial, in input order.

    A community is kept only when all of the following hold:
      * it has more than two members,
      * ``get_genes()`` returns at least one gene,
      * not every member is a gene (gene count differs from member count).

    Parameters
    ----------
    coms : iterable
        Objects exposing a ``members`` collection and a ``get_genes()`` method.

    Returns
    -------
    list
        The communities that passed every check.
    """
    def _is_informative(com):
        # Guard clauses mirror a short-circuiting `and` chain.
        if len(com.members) <= 2:
            return False
        if len(com.get_genes()) == 0:
            return False
        return len(com.get_genes()) != len(com.members)

    return [com for com in coms if _is_informative(com)]

# Load the 2021 sub-community files, one per clustering method named in the
# file path (greedy, walktrap, infomap, cesna).
g_subcoms, w_subcoms, i_subcoms, c_subcoms = (
    load_clusters(coms_path)
    for coms_path in (
        '../SubComs/2021/paris.greedy.2021.coms.txt',
        '../SubComs/2021/paris.walktrap.2021.coms.txt',
        '../SubComs/2021/paris.infomap.2021.coms.txt',
        '../SubComs/2021/paris.cesna.2021.coms.txt',
    )
)

In [None]:
def get_possible_pairs_from_com(coms,G):
    """Enumerate, per community, the gene/HPO pairings that are not yet edges in ``G``.

    Members whose name contains ``'HP:'`` are treated as HPO terms and all other
    members as genes. Every gene x HPO combination inside a community is a
    candidate; combinations for which ``G.has_edge(gene, hpo)`` is true are
    skipped.

    Parameters
    ----------
    coms : iterable
        Objects exposing a ``members`` collection of node names.
    G : graph-like
        Object exposing ``has_edge(u, v)``.

    Returns
    -------
    list of set of str
        One set per community (same order as ``coms``). Each element is
        ``str()`` of the alphabetically sorted two-element ``[gene, hpo]`` list.
    """
    per_community = []
    for community in coms:
        gene_nodes = [member for member in community.members if 'HP:' not in member]
        hpo_nodes = [member for member in community.members if 'HP:' in member]
        # Sorting before stringifying gives each pair a single canonical key.
        candidates = {
            str(sorted([gene, hpo]))
            for gene in gene_nodes
            for hpo in hpo_nodes
            if not G.has_edge(gene, hpo)
        }
        per_community.append(candidates)
    return per_community

def get_possible_pairs_from_com_non_bocc(com,G):
    """Enumerate the gene/HPO pairings within one node collection that are not edges in ``G``.

    Variant of the per-community helper that takes a plain collection of node
    names instead of objects with a ``members`` attribute. Names containing
    ``'HP:'`` are treated as HPO terms and everything else as genes.

    Parameters
    ----------
    com : collection of str
        Node names; it is iterated twice, so it must be re-iterable.
    G : graph-like
        Object exposing ``has_edge(u, v)``.

    Returns
    -------
    set of str
        ``str()`` of the alphabetically sorted ``[gene, hpo]`` list for every
        combination where ``G.has_edge(gene, hpo)`` is false.
    """
    gene_nodes = [node for node in com if 'HP:' not in node]
    hpo_nodes = [node for node in com if 'HP:' in node]
    return {
        str(sorted([gene, hpo]))
        for gene in gene_nodes
        for hpo in hpo_nodes
        if not G.has_edge(gene, hpo)
    }

def load_new_edges(el,G):
    """Read a tab-separated edge list and return the edges that are not already in ``G``.

    Each line is stripped, split on tabs and sorted alphabetically; the sorted
    row is then stringified so it can be compared with the keys produced by the
    ``get_possible_pairs_*`` helpers.

    Parameters
    ----------
    el : str
        Path to the tab-separated edge-list file.
    G : graph-like
        Object exposing ``has_edge(u, v)``; rows whose first two (sorted)
        fields are already connected in ``G`` are skipped.

    Returns
    -------
    set of str
        ``str()`` of each sorted row that is not an edge of ``G``.
    """
    pairs = set()
    # Context manager guarantees the file handle is closed, even on error.
    with open(el,'r') as handle:
        for line in handle:
            row = line.strip().split('\t')
            # Blank lines (e.g. a trailing newline) or one-column lines carry no
            # edge; skip them instead of failing on row[1].
            if len(row) < 2:
                continue
            row.sort()
            if G.has_edge(row[0],row[1]):
                continue
            pairs.add(str(row))
    return pairs

# load new edges but shuffle them
def load_new_edges_shuffled(el, seed=None):
    """Read a tab-separated gene/HPO edge list and return randomly re-paired edges.

    The gene column and the HPO column are collected separately (whichever
    field of a row contains ``'HP:'`` first is taken as the HPO term), each
    list is shuffled independently, and the shuffled lists are zipped back
    together. Because the result is a set, duplicate pairings created by the
    shuffle collapse, so the result may hold fewer pairs than the file has rows.

    Parameters
    ----------
    el : str
        Path to the tab-separated edge-list file.
    seed : int or None, optional
        Passed to ``np.random.seed``. Note that this reseeds NumPy's *global*
        random state, which is kept so that results for a given seed stay the
        same as before.

    Returns
    -------
    set of str
        ``str()`` of each alphabetically sorted ``[gene, hpo]`` pairing.
    """
    genes = []
    hpos = []
    # Context manager guarantees the file handle is closed, even on error.
    with open(el,'r') as handle:
        for line in handle:
            row = line.strip().split('\t')
            # Blank or one-column lines carry no edge; skip them instead of
            # failing on row[1].
            if len(row) < 2:
                continue
            # Put each field into the right column regardless of file order.
            if 'HP:' in row[0]:
                hpos.append(row[0])
                genes.append(row[1])
            else:
                genes.append(row[0])
                hpos.append(row[1])
    # Shuffle both columns independently to break the gene-HPO association.
    np.random.seed(seed)
    np.random.shuffle(genes)
    np.random.shuffle(hpos)
    # Canonical key: sorted pair rendered as a string.
    return {str(sorted(pair)) for pair in zip(genes, hpos)}
    

def rediscover(pairs, el_pairs):
    """Count, for each community, how many of its candidate pairs appear in ``el_pairs``.

    Parameters
    ----------
    pairs : iterable of set
        One set of pair keys per community.
    el_pairs : iterable
        Pair keys to look for (passed to ``set.intersection``).

    Returns
    -------
    list of int
        Size of the intersection for each community, in input order.
    """
    return [len(com_pairs.intersection(el_pairs)) for com_pairs in pairs]

In [None]:
# Two shuffled versions of the 2020 g2p edge list, using seeds 0 and 1.
s0, s1 = (
    load_new_edges_shuffled(el='../g2p_Edgelists/String_HPO_2020.phenotypic_branch.g2p_edgelist.txt',seed=shuffle_seed)
    for shuffle_seed in (0, 1)
)

In [None]:
# Candidate gene-HPO pairs per greedy sub-community, excluding pairs that are
# already edges in the 2019 graph.
g_pairs = get_possible_pairs_from_com(g_subcoms,G19)
print('Greedy',len(g_pairs))

# Edges in the 2020 g2p edge list that are not already edges in the 2019 graph.
g2p_2020_edgelist = '../g2p_Edgelists/String_HPO_2020.phenotypic_branch.g2p_edgelist.txt'
el_pairs = load_new_edges(el=g2p_2020_edgelist, G=G19)
print('2020',len(el_pairs))

# Observed score: number of candidate pairs of each cluster found among the new edges.
rediscoveries = rediscover(g_pairs, el_pairs)
print('Greedy',len(rediscoveries))

# Null distribution: re-score every cluster against n_shuffles shuffled edge
# lists. The shuffle index doubles as the seed so the run is reproducible.
n_shuffles = 1000
shuffled_rediscoveries = []
for i in range(n_shuffles):
    s = load_new_edges_shuffled(el=g2p_2020_edgelist,seed=i)
    shuffled_rediscoveries.append(rediscover(g_pairs, s))

# One-sided empirical p-value per cluster: the fraction of shuffles whose count
# is at least as large as the observed count. Ties must count towards the
# p-value; the previous form `1 - frac(observed >= shuffled)` excluded them and
# therefore reported p = 0 for a cluster with zero rediscoveries whenever every
# shuffle also produced zero.
p_values = []
for i in range(len(rediscoveries)):
    this_coms_shuffled_rediscoveries = [x[i] for x in shuffled_rediscoveries]
    p = sum(1 for x in this_coms_shuffled_rediscoveries if x >= rediscoveries[i])/len(this_coms_shuffled_rediscoveries)
    p_values.append(p)

In [None]:
# Histogram of the per-cluster empirical p-values (100 bins) for the greedy
# sub-communities. All calls below act on pyplot's implicit current figure.
plt.hist(p_values,bins=100)
plt.xlabel('p-value')
plt.ylabel('Count')
plt.title('Greedy')
# Log-scaled counts so that sparsely populated bins remain visible.
plt.yscale('log')
# Written to the notebook's working directory.
plt.savefig('Greedy.png',dpi=300)


In [None]:
# Map cluster name -> 1 - (value in the second CSV column).
# NOTE(review): presumably the second column is a model score/probability, so
# that 1 - value inverts it; confirm against whatever writes the CSV.
cluster_scores = {}
with open('../XGBoost2021ClusterRankings.csv','r') as f:
    # The first line is a header row; skip it (no error if the file is empty).
    next(f, None)
    for line in f:
        row = line.strip().split(',')
        # Ignore blank or one-column lines rather than failing on row[1].
        if len(row) < 2:
            continue
        cluster_scores[row[0]] = 1 - float(row[1])
# Show one key so its format can be compared with the names built below
# (prints None instead of raising when no rows were read).
print(next(iter(cluster_scores), None))

# Names for the greedy sub-communities, prefixed to match the keys above.
sub_com_names = ['paris.greedy.2021:{}'.format(x.name) for x in g_subcoms]
# Preview only; printing the full list floods the cell output.
print(sub_com_names[:5])

In [None]:
# Restrict the p-values and sub-community names to the clusters that have an
# entry in cluster_scores, keeping the original order.
kept_positions = [pos for pos in range(len(p_values)) if sub_com_names[pos] in cluster_scores]
p_values_filtered = [p_values[pos] for pos in kept_positions]
sub_com_names_filtered = [sub_com_names[pos] for pos in kept_positions]

# One row per retained cluster: empirical p-value, cluster score and name.
df = pd.DataFrame({
    'p-value': p_values_filtered,
    'cluster_score': [cluster_scores[name] for name in sub_com_names_filtered],
    'cluster_name': sub_com_names_filtered,
})

# 2x2 grid: scatter of score vs p-value (bottom-left) with marginal histograms
# above it (cluster score) and to its right (p-value). The top-right cell is
# unused and hidden.
fig, ax = plt.subplots(
    2, 2,
    gridspec_kw={'width_ratios':[3,1],'height_ratios':[1,3]},
)
fig.set_size_inches(10,10)

ax[1,0].scatter(df['cluster_score'],df['p-value'],s=4,alpha=.2)
ax[0,0].hist(df['cluster_score'],bins=100)
ax[1,1].hist(df['p-value'],bins=100,orientation='horizontal')

ax[1,0].set_xlabel('Cluster Score',fontsize=std_fontsize)
ax[1,0].set_ylabel('p-value',fontsize=std_fontsize)
ax[0,0].set_ylabel('Count',fontsize=std_fontsize)
ax[1,1].set_xlabel('Count',fontsize=std_fontsize)
# Counts along the p-value margin are shown on a log scale.
ax[1,1].set_xscale('log')
ax[0,1].set_visible(False)

# Hide the top and right spines on every panel.
for row_idx, col_idx in ((0, 0), (1, 0), (1, 1), (0, 1)):
    for side in ('top', 'right'):
        ax[row_idx, col_idx].spines[side].set_visible(False)

plt.tight_layout()
plt.savefig('../Figures/greedy.snowball_v_edge_shuffle.scatter.density.png',dpi=300)



# Random Clusters


In [None]:
# Null model: for every greedy sub-community, draw random node sets of the same
# size from the 2019 graph and count how many of their candidate gene-HPO pairs
# appear among the new 2020 edges.
random.seed(0)
n_random_draws = 1000
# Reuse the cached result when present. Delete the pickle to force a
# recomputation (e.g. after changing the sampling code below).
if os.path.exists('cluster_rediscoveries.pickle'):
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that was written by this notebook.
    with open('cluster_rediscoveries.pickle','rb') as f:
        cluster_rediscoveries = pickle.load(f)
else:
    # Build the node list once instead of on every sampling attempt.
    all_nodes = list(G19.nodes)
    cluster_rediscoveries = []
    for j in range(n_random_draws):
        if j % 100 == 0:
            print(j)
        draw_counts = []
        for i in range(len(g_subcoms)):
            size = len(g_subcoms[i].members)
            if size < 3:
                # Clusters this small are not resampled; store the cluster's own
                # observed count as a placeholder so every row keeps one entry
                # per cluster.
                # NOTE(review): the original appended the whole `rediscoveries`
                # object here; the per-cluster value is assumed to be the
                # intent -- confirm.
                draw_counts.append(rediscoveries[i])
                continue
            # Redraw until the sample holds at least one HPO term and at least
            # one gene; otherwise it has no gene-HPO pair to score.
            heterozygous = False
            while not heterozygous:
                # random.sample draws WITHOUT replacement, so the random
                # "cluster" has `size` distinct members (random.choices, used
                # previously, could repeat nodes).
                nodes = random.sample(all_nodes,k=size)
                has_hpo = any('HP:' in x for x in nodes)
                has_gene = any('HP:' not in x for x in nodes)
                heterozygous = has_hpo and has_gene
            random_pairs = get_possible_pairs_from_com_non_bocc(nodes, G19)
            # Use a local name: rebinding `rediscoveries` here would overwrite
            # the list of observed per-cluster counts that other cells rely on.
            n_rediscovered = len(random_pairs.intersection(el_pairs))
            draw_counts.append(n_rediscovered)
        cluster_rediscoveries.append(draw_counts)

    # Sanity check: every draw must hold exactly one count per cluster.
    assert all(len(draw) == len(g_subcoms) for draw in cluster_rediscoveries)
    with open('cluster_rediscoveries.pickle','wb') as f:
        pickle.dump(cluster_rediscoveries,f)

In [None]:
# Load the cached random-cluster rediscovery counts from disk.
# NOTE: pickle.load can execute arbitrary code; only load a cache file that was
# written by this notebook.
with open('cluster_rediscoveries.pickle','rb') as f:
    cluster_rediscoveries = pickle.load(f)

# One-sided empirical p-value per cluster: the fraction of random clusters
# whose count is at least as large as the observed count. Ties must count
# towards the p-value; the previous form `1 - frac(observed >= random)`
# excluded them and therefore reported p = 0 for a cluster with zero
# rediscoveries whenever every random cluster also had zero.
rediscovery_p_values = []
for i in range(len(rediscoveries)):
    this_coms_cluster_rediscoveries = [x[i] for x in cluster_rediscoveries]
    p = sum(1 for x in this_coms_cluster_rediscoveries if x >= rediscoveries[i])/len(this_coms_cluster_rediscoveries)
    rediscovery_p_values.append(p)

