In [1]:
import pandas as pd
import networkx as nx
import numpy as np 
import glob
import natsort
import math
import copy
import os

In [2]:
### FUNCTIONS
# Directed Jaccard index
def dir_Jaccard(li1, li2):
    '''Directed (asymmetric) Jaccard index of li1 with respect to li2.

    Computes the fraction of the unique items of ``li1`` that are also
    present in ``li2``: ``|set(li1) & set(li2)| / |set(li1)|``.

    Parameters
    ----------
    li1 : iterable of hashable items (e.g. list of edge tuples, set of node IDs)
        Reference collection; its unique item count is the denominator.
    li2 : iterable of hashable items
        Collection compared against the reference.

    Returns
    -------
    float
        Value in [0, 1]; 0.0 when ``li1`` is empty.
    '''
    reference = set(li1)
    # An empty reference shares nothing; return 0.0 instead of dividing by zero
    if not reference:
        return 0.0
    # Divide by the number of *unique* items: the numerator is set-based, so
    # using len(li1) would let duplicates push a self-comparison below 1
    return len(reference & set(li2)) / len(reference)

### Import files

In [3]:
# Locate the working directory and the sibling folder that holds the input networks
current_dir = os.getcwd()
# os.path.split returns (head, tail); the head is the parent of the working directory
parent_dir = os.path.split(current_dir)[0]
input_path = os.path.join(parent_dir, "_ModelsAndNetworks")

In [4]:
# Collect every edge-list file of the input folder in natural sort order
Edge_filenames = natsort.natsorted(glob.glob(input_path + "/*Edgelist.txt"))

# Species name = file name without directory, without everything after the first '.',
# and without the last '_'-separated token.
# os.path.basename strips the directory with the platform's own separator rules;
# splitting on '\\' only worked on Windows and left the full path in the name elsewhere.
Species = ['_'.join(os.path.basename(path).split('.')[0].split('_')[:-1]) for path in Edge_filenames]

# For each file keep both the raw edge table and the adjacency matrix of the directed graph
Met_adjs = []
Met_edges = []
for path in Edge_filenames:
    # Files have no header row; the two columns are read as source -> target
    M_edge = pd.read_table(path, header = None)
    M_edge.columns = ['source', 'target']
    Met_edges.append(M_edge)
    M_network = nx.from_pandas_edgelist(M_edge, create_using=nx.DiGraph)
    M_ad = nx.to_pandas_adjacency(M_network)
    Met_adjs.append(M_ad)

Species

['Actinomyces_odontolyticus_ATCC_17982',
 'Alistipes_putredinis_DSM_17216',
 'Anaerococcus_hydrogenalis_DSM_7454',
 'Anaerofustis_stercorihominis_DSM_17244',
 'Anaerostipes_caccae_DSM_14662',
 'Anaerotruncus_colihominis_DSM_17241',
 'Bacteroides_caccae_ATCC_43185',
 'Bacteroides_cellulosilyticus_DSM_14838',
 'Bacteroides_coprophilus_DSM_18228',
 'Bacteroides_dorei_DSM_17855']

### Filter the networks

In [5]:
# Drop glycan nodes (IDs containing 'MGlcn') from every network
glc_Met_adjs = []
glc_Met_IDs = []
for adj in Met_adjs:
    # Rebuild the directed graph from its adjacency matrix
    G = nx.from_pandas_adjacency(adj, create_using= nx.DiGraph)

    # Remove the glycans from a copy so G itself is left unchanged
    glc_N = copy.deepcopy(G)
    glc_N.remove_nodes_from([node for node in adj.columns if 'MGlcn' in node])

    # Store the reduced adjacency matrix together with its remaining node IDs
    glc_adj = nx.to_pandas_adjacency(glc_N)
    glc_Met_adjs.append(glc_adj)
    glc_Met_IDs.append(list(glc_adj.columns))
    
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Drop currency metabolites: the top `perc` fraction of nodes ranked by degree
perc = 0.03

curr_Met_adjs = []
curr_Met_IDs = []
for adj0 in glc_Met_adjs:
    # Number of highest-degree nodes to discard, rounded up
    top_n_nodes = math.ceil(perc*len(adj0.columns))

    glc_G = nx.from_pandas_adjacency(adj0, create_using= nx.DiGraph)

    # Rank node IDs by degree, largest first (sorted() is stable, so ties keep graph order)
    degree_of = dict(glc_G.degree(glc_G.nodes()))
    SortedNodes = sorted(degree_of, key=degree_of.get, reverse=True)

    # Remove the top-ranked nodes from a copy of the graph
    curr_N = copy.deepcopy(glc_G)
    curr_N.remove_nodes_from(SortedNodes[:top_n_nodes])

    # Store the filtered adjacency matrix together with its remaining node IDs
    curr_adj = nx.to_pandas_adjacency(curr_N)
    curr_Met_adjs.append(curr_adj)
    curr_Met_IDs.append(list(curr_adj.columns))

### Export filtered adjacency networks

In [6]:
# Write one CSV per species with its filtered adjacency matrix
os.makedirs(f'{current_dir}/0_FilteredAdjacency', exist_ok=True)
for m in range(len(curr_Met_adjs)):
    # Species and curr_Met_adjs are index-aligned, so position m names the file
    csv_path = f'{current_dir}/0_FilteredAdjacency/{Species[m]}_FilteredNetwork.csv'
    curr_Met_adjs[m].to_csv(csv_path)

### Find all edges and nodes (degree > 0) of the filtered networks

In [7]:
def _edge_tuples(adj):
    '''Return the (row label, column label) pair of every cell of `adj` equal to 1.

    Pairs are listed in row-major order. The cells are located explicitly with
    numpy rather than through `adj[adj == 1].stack()`, which only works while
    `stack()` silently drops the NaN cells produced by the mask; the newer
    pandas stack implementation keeps them, which would turn every cell into
    an "edge".
    '''
    rows, cols = np.nonzero(adj.to_numpy() == 1)
    return list(zip(adj.index[rows], adj.columns[cols]))

# Species name -> list of (source, target) edges of its filtered network
e_tups = {Species[k]: _edge_tuples(item) for k, item in enumerate(curr_Met_adjs)}
# Species name -> set of nodes that appear in at least one edge
n_tups = {k: {node for edge in v for node in edge} for k, v in e_tups.items()}

In [8]:
# Create the output folders for the layer tables and the similarity indices if missing
for out_dir in (f'{current_dir}/1_SpeciesMetaLayers', f'{current_dir}/2_Indices'):
    os.makedirs(out_dir, exist_ok=True)

### Export the Edge layer and indices

In [9]:
# Edge layer: one row per species, then transposed so each species becomes a column
layer_Edge = pd.DataFrame(list(e_tups.values()), index = Species).T
layer_Edge.to_csv(f'{current_dir}/1_SpeciesMetaLayers/Layer_Edge.csv')

# Pairwise directed Jaccard index on edges: row = reference species, column = compared species
edge_similarity_df = pd.DataFrame(columns=Species, index=Species)
for ref_species, ref_edges in e_tups.items():
    for other_species in e_tups:
        score = dir_Jaccard(ref_edges, e_tups[other_species])
        edge_similarity_df.loc[ref_species, other_species] = score
edge_similarity_df.to_csv(f'{current_dir}/2_Indices/Edge_CI_Edge_Curr{perc}.csv')

### Export the N layer and indices

In [10]:
# Node layer: one row per species, then transposed so each species becomes a column.
# The node sets are sorted first: set iteration order for strings can change between
# kernel sessions (hash randomisation), which made Layer_N.csv differ from run to run.
layer_N = pd.DataFrame([sorted(nodes) for nodes in n_tups.values()], index = Species).T
layer_N.to_csv(f'{current_dir}/1_SpeciesMetaLayers/Layer_N.csv')

# Pairwise directed Jaccard index on nodes: row = reference species, column = compared species
node_similarity_df = pd.DataFrame(columns=Species, index=Species)
for s1, n1 in n_tups.items():
    for s2, n2 in n_tups.items():
        node_similarity_df.loc[s1, s2] = dir_Jaccard(n1, n2)
node_similarity_df.to_csv(f'{current_dir}/2_Indices/Network_CI_NN_Curr{perc}.csv')