# Project


#### For this project we will start by importing the necessary libraries 

In [None]:
#!pip install pyvis

In [None]:
import pandas as pd
import numpy as np

import networkx as nx
from networkx.algorithms import bipartite 

import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
from matplotlib.colors import ListedColormap
import seaborn as sns
from pyvis.network import Network 

In [None]:
#pip install decorator==4.3

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/networkx deprecation and copy warnings
# that may point at real problems — consider narrowing the filter.
warnings.filterwarnings('ignore')

##### Now we will import the csv files to start working with the data

In [None]:
# Load the node attribute table and the edge list (both ';'-separated CSV files).
attributes = pd.read_csv('Data/nodeattribute.csv', delimiter=';')
edgelist = pd.read_csv('Data/edgelist.csv', delimiter=';')

# Column '0' distinguishes 'gene' rows from 'disease' rows. Take explicit copies
# so that later cells can modify these subsets without writing into a slice of
# `attributes` (which triggers pandas' SettingWithCopyWarning).
g_attributes = attributes[attributes['0'] == 'gene'].copy()
d_attributes = attributes[attributes['0'] == 'disease'].copy()


# nx.set_node_attributes expects a {node_id: {attribute: value}} mapping
nodes_attr = attributes.set_index('Id').to_dict(orient='index')
g_nodes_attr = g_attributes.set_index('Id').to_dict(orient='index')
d_nodes_attr = d_attributes.set_index('Id').to_dict(orient='index')


# Plain Python lists of node ids and of edge rows
g_nodes = g_attributes['Id'].to_list()
d_nodes = d_attributes['Id'].to_list()
edges = edgelist.values.tolist()


# Build the graph: genes form node set 0, diseases node set 1
G = nx.Graph()
G.add_nodes_from(g_nodes, bipartite=0)
G.add_nodes_from(d_nodes, bipartite=1)
nx.set_node_attributes(G, nodes_attr)
G.add_edges_from(edges)

# Spot check: node kind stored on node 1285
print(G.nodes[1285]['0'])

#bottom_nodes, top_nodes = bipartite.sets(G)

In [None]:
# Checking that the code worked properly.
# Only the last expression of the cell (the 'bipartite' flag of node 117) is
# displayed; the bare G.nodes() call below produces no visible output.
G.nodes()
#G.edges()
#G.nodes[55]["Label"]
#nodes_attr

G.nodes[117]['bipartite']

In order to start working with the graph we need to make sure it is connected 

In [None]:
# Is the graph connected, and is it bipartite?
print(nx.is_connected(G))
print(nx.is_bipartite(G))


### Project description
1. Basic network description of your data (what type of network it is, what does it represent, is it real or synthetically generated, etc). In practice, the result of project phase #1 (finding data).

It is a bipartite, undirected and unweighted network (one node set holds the genes, the other the diseases) that describes the associations between human diseases and human genes, as extracted from the Morbid Map (MM) of the Online Mendelian Inheritance in Man (OMIM) in 2005.

Source : https://github.com/gephi/gephi/wiki/Datasets


2. Basic network statistics of your data (number of nodes, edges, clustering, degree distribution, etc). In practice, the result of project phase #2 (exploratory data analysis)

In [None]:
# Graph properties that can be queried with networkx:
#   nx.is_directed(G)
#   nx.is_bipartite(G)
#   nx.is_weighted(G)

# Node counts per type, total node count and edge count
num_genes = len(g_nodes)
num_diseases = len(d_nodes)
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

# Report the sizes
print(f'The number of nodes that are genes is: {num_genes}\nThe number of nodes that are diseases is: {num_diseases}\n---\nThe total number of nodes is: {num_nodes} ')
print(f'\n---\nThe number of edges is: {num_edges}')



In [None]:
def make_df(column2, function, graph):
    """Turn a per-node metric into a one-column DataFrame indexed by 'Id'.

    Parameters
    ----------
    column2 : str
        Name given to the value column.
    function : callable
        Called as ``function(graph)``; must return a mapping of node -> value.
    graph : object
        Passed through unchanged to ``function``.

    Returns
    -------
    pandas.DataFrame
        One row per mapping entry, index named 'Id', single column ``column2``.
    """
    scores = function(graph)
    records = list(scores.items())
    return pd.DataFrame(records, columns=['Id', column2]).set_index('Id')

# Per-node centrality tables, each indexed by node 'Id'
closeness = make_df('closeness', nx.closeness_centrality, G)
degree_centrality = make_df('degree', nx.degree_centrality, G)

# Combine both measures into a single table keyed on 'Id'
df= degree_centrality.join(closeness, on='Id')
# Graph density (stored in `density`; this cell does not display it)
density = nx.density(G)



#### Clustering

In [None]:
# Global clustering coefficient (transitivity) of G
print(nx.transitivity(G))

# Average of the per-node clustering coefficients of G
print(nx.average_clustering(G))

# NOTE(review): a bipartite graph contains no triangles, so if G is bipartite
# (checked in an earlier cell) both values above are expected to be 0;
# bipartite.clustering may be the more informative measure — confirm.

# Per-node (local) clustering coefficients — left disabled
#print(nx.clustering(G))

#### Degree Distribution

In [None]:
# nx.degree_histogram returns a list whose i-th entry is the number of nodes
# with degree i.
degree_freq = nx.degree_histogram(G)

# Apply the seaborn theme before the figure is created.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots()

# Plot the node count for each degree. Calling ax.hist(degree_freq) would bin
# the *counts* themselves, so the x axis would not be the degree.
ax.bar(range(len(degree_freq)), degree_freq);
ax.set_yscale('log')
ax.set_xlabel('Degree');
ax.set_ylabel('Frequency');
ax.set_title('Degree Distribution');

In [None]:
# Normalised degree frequency: fraction of nodes having each degree.
# The total is computed once instead of once per list element.
total_count = sum(degree_freq)
degree_freq = [count / total_count for count in degree_freq]
degree_range = range(len(degree_freq))

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Degree Distribution');
ax.set_xlabel('Degree');
ax.set_ylabel('Fraction of Nodes with Degree >= k');

# Complementary cumulative distribution: deg[k] is the fraction of nodes with
# degree >= k. A reversed cumulative sum yields every tail sum in a single pass
# instead of re-adding the whole tail for each k.
sum_of_all = sum(degree_freq)
deg = (np.cumsum(degree_freq[::-1])[::-1] / sum_of_all).tolist()

# Log-log axes; the k = 0 point cannot be placed on a logarithmic x axis.
ax.loglog(degree_range, deg);



### Graph representation

In [None]:
# Basic graph type checks
print(nx.is_directed(G))
print(nx.is_bipartite(G))
print(nx.is_weighted(G))
# {node: value} lookups for the 'Label', '0' and '1' node attributes
label= nx.get_node_attributes(G,'Label')
classification = nx.get_node_attributes(G,'0')
typeofclassification= nx.get_node_attributes(G,'1')

#classification[1329]
#typeofclassification[1329]

#G.nodes()
#nodes_attr

In [None]:
def draw_graph(G):
    """Draw G on a very large canvas, colouring nodes by their 'bipartite' attribute.

    Nodes whose 'bipartite' attribute equals 0 are drawn in '#7c0a02'; every
    other node is drawn in '#f4c2c2'. Each node must carry a 'bipartite'
    attribute, otherwise the lookup raises KeyError.
    """
    fig, ax = plt.subplots(figsize=(150, 100))

    # One colour per node, in the graph's iteration order
    color_map = []
    for node in G:
        in_first_set = G.nodes[node]['bipartite'] == 0
        color_map.append('#7c0a02' if in_first_set else '#f4c2c2')

    nx.draw(G, node_color=color_map, with_labels=True)

    plt.title('Graph representation', fontsize=100);

# Visualizing the network using Pyvis

In [None]:
def make_interactive(attributes, edges, output_path='jk.html'):
    """Render the network as an interactive pyvis HTML page.

    Parameters
    ----------
    attributes : pandas.DataFrame
        Node table with columns 'Id', 'Label' and '0' (node kind). It is only
        read; no column is added to the caller's frame.
    edges : list
        Edge list handed to ``Network.add_edges``.
    output_path : str, optional
        File the HTML page is written to (default 'jk.html').
    """
    node_ids = attributes['Id'].to_list()
    labels = attributes['Label'].to_list()
    kinds = attributes['0'].to_list()

    # Diseases are dark red, everything else light pink.
    # NOTE(review): draw_graph uses dark red for bipartite == 0 nodes (the genes),
    # i.e. the opposite assignment — confirm which colouring is intended.
    node_colors = np.where(attributes['0'] == 'disease', '#7c0a02', '#f4c2c2').tolist()

    net = Network('100vh', '100vw')

    # Hover title shows the node kind, the visible label shows 'Label'
    net.add_nodes(node_ids, title=kinds, label=labels, color=node_colors)
    net.add_edges(edges)
    net.show(output_path)

In [None]:
# Static drawing of the full gene-disease graph
draw_graph(G)

In [None]:
# Project the bipartite graph onto the disease nodes.
# NOTE(review): with the default ratio=False the weight of a projected edge
# should be the number of neighbours (genes) the two diseases share — confirm
# against the networkx documentation.
Gp = bipartite.weighted_projected_graph(G, d_nodes)
draw_graph(Gp)

In [None]:

# Edge table of the projection; the third column initially holds each edge's attribute dict
t= pd.DataFrame(list(Gp.edges(data=True)), columns=['src', 'trg', 'Weight'])
#list(Gp.nodes(data=True))
# Replace each attribute dict by its integer 'weight' entry
t['Weight'] = [int(k['weight']) for k in t['Weight']]

# Tab-separated export with a header row; `index` is left at its default, so the
# DataFrame index is written as the first column too
t.to_csv(sep='\t', path_or_buf='Data/new_file',header=True)



In [None]:
# the input needs to be a file separated by tabs, can be generated from pd dataframe
# NOTE(review): `backboning` is imported here rather than in the import cell at
# the top; it is presumably a local or third-party module that must be
# importable from the working directory — confirm.
import backboning
# Read the edge table at Data/new_file, using its 'Weight' column
table, nnodes, nnedges = backboning.read("Data/new_file", "Weight", triangular_input=True)
# Score the edges, then filter with threshold 2 (the exact threshold semantics
# are defined inside backboning.thresholding, which is not visible here)
nc_table = backboning.noise_corrected(table)
nc_backbone = backboning.thresholding(nc_table, 2)
nc_backbone

In [None]:
# Show every node of the projection together with its attribute dict
list(Gp.nodes(data=True))

In [None]:
# Work on a copy so the categorical conversion below does not modify d_attributes.
temp = d_attributes.copy()
categories = np.unique(d_attributes['1'])
# Pass the categories explicitly so the integer codes match `colorlegend` below.
temp['1'] = pd.Categorical(temp['1'], categories=categories)

# Fixed seed so the layout is reproducible between runs.
pos = nx.spring_layout(Gp, seed=42)
colorlegend = {a: b for b, a in enumerate(categories)}

# Keep the palette under its own names: rebinding `colors` would shadow the
# matplotlib.colors module that provides Normalize below.
palette = ['#800000', '#FF0000', '#FF7F50', '#FF8C00', '#FFD700',
 '#EEE8AA', '#BDB76B', '#808000', '#7CFC00', '#006400',
 '#98FB98', '#20B2AA', '#2F4F4F', '#00FFFF', '#AFEEEE',
 '#6495ED','#191970','#8A2BE2','#4B0082','#DDA0DD',
 '#A0522D','#DEB887']
cmap = ListedColormap(palette)

# Map node Id -> category code (keyed by 'Id', not by DataFrame row position),
# then list the codes in the node order of Gp so colours line up with nodes.
val_map = temp.set_index('Id')['1'].cat.codes.to_dict()
values = [val_map.get(node, 0) for node in Gp.nodes()]
cNorm = colors.Normalize(vmin=0, vmax=max(values))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cmap)

f = plt.figure(1, figsize=(30, 30))
ax = f.add_subplot(1,1,1)
# One dummy line per category, drawn only to create the legend entries.
for category in colorlegend:
    ax.plot([0],[0],color=scalarMap.to_rgba(colorlegend[category]),label=category)

nx.draw(Gp, pos, ax=ax, with_labels=True, node_color=values, cmap=cmap, vmin=0, vmax=max(values))

ax.legend();


## Assortativity of the projected network
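We use the formula $(trace(M)-sum(M^2))/(1-sum(M^2))$, where M is the joint probability distribution (mixing matrix) of the specified attribute,
or, written out with $e_{ij}$ the fraction of edges joining group $i$ to group $j$, $a_i = \sum_j e_{ij}$ and $b_i = \sum_j e_{ji}$: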
$$r = \dfrac{\sum_ie_{ii} - \sum_ia_ib_i}{1-\sum_ia_ib_i}$$

In [None]:
# Assortativity of the projection with respect to node attribute "1"
# (presumably the disease class — confirm against the attribute file)
nx.attribute_assortativity_coefficient(Gp, "1")

A score of 0.44 means we can say it is moderately assortative since it is above 0 on a [-1;1] scale. 

## Clustering

In [None]:
# Global clustering coefficient (transitivity) of the projection
print(nx.transitivity(Gp))

# Average of the per-node clustering coefficients of the projection
print(nx.average_clustering(Gp))

# Per-node (local) clustering coefficients; prints one entry per node of Gp
print(nx.clustering(Gp))

In [None]:
from collections import defaultdict

def homophily_per_group(graph):
    """Average, per group, the fraction of a node's neighbours that share its group.

    A node's group is its '1' attribute.

    Parameters
    ----------
    graph : graph-like
        Must provide ``nodes()``, ``nodes[n]`` (an attribute mapping containing
        the key '1') and ``neighbors(n)``.

    Returns
    -------
    dict
        Maps each group to the mean same-group neighbour fraction over its
        nodes that have at least one neighbour. Isolated nodes are skipped
        (their fraction is undefined), so a group consisting only of isolated
        nodes does not appear in the result.
    """
    fractions_by_group = defaultdict(list)
    for node in graph.nodes():
        node_group = graph.nodes[node]['1']
        neighbors = list(graph.neighbors(node))
        if not neighbors:
            # No neighbours: the ratio would be 0/0, so leave this node out
            # instead of raising ZeroDivisionError.
            continue

        same_group = sum(
            1 for neighbor in neighbors
            if graph.nodes[neighbor]['1'] == node_group
        )
        fractions_by_group[node_group].append(same_group / len(neighbors))

    # Collapse each group's list of fractions into its mean (plain dict, so a
    # lookup of an unknown group does not silently create an entry).
    return {
        group_name: sum(fractions) / len(fractions)
        for group_name, fractions in fractions_by_group.items()
    }


#homophily_per_group(bipar_weighted).items()



def list_degrees(graph):
    """Return the (node, degree) pairs of `graph`, highest degree first.

    Pairs with equal degree keep the order in which ``graph.degree()`` yields them.
    """
    ranked = list(graph.degree())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

# NOTE(review): `bipar_weighted` is not assigned in any earlier cell of this
# notebook, so this cell fails on a fresh kernel. It presumably refers to the
# weighted disease projection `Gp` — confirm. The alias keeps this cell and the
# next one runnable after Restart & Run All.
bipar_weighted = Gp

# (node, degree) pairs sorted by decreasing degree
degrees = pd.DataFrame(list_degrees(bipar_weighted), columns=['NodeID', 'Degree'])

plt.figure(1, figsize=(30, 30))
sns.histplot(data=degrees,  x='Degree')
plt.title('Degree Distribution', fontsize=50)
# The histogram bins the 'Degree' column: x shows degree values, y counts nodes.
plt.xlabel('Degree', size=20)
plt.ylabel('Number of nodes', size=20);

In [None]:
# Scratch inspections. Only the final expression of a cell is displayed, so
# these bare expressions produce no visible output.
# `bipar_weighted` must already exist in the kernel when this cell runs.
degrees.head(200)
bipar_weighted.nodes[45]['1']
homophily_per_group(bipar_weighted)
list(bipar_weighted.neighbors(68))
def group(graph):
    """Bucket the nodes of `graph` by their '1' attribute.

    Parameters
    ----------
    graph : graph-like
        Must provide ``nodes()`` and ``nodes[n]`` (an attribute mapping
        containing the key '1').

    Returns
    -------
    tuple
        ``(dict_nodes, num_nodes)`` where ``dict_nodes`` maps each group to the
        list of its nodes and ``num_nodes`` maps each group to its node count.
    """
    dict_nodes = defaultdict(list)
    num_nodes = defaultdict(int)
    for node in graph.nodes():
        node_group = graph.nodes[node]['1']
        dict_nodes[node_group].append(node)
        num_nodes[node_group] += 1

    # Return the per-group counts built above, not the unrelated module-level
    # `num_diseases` total.
    return dict_nodes, num_nodes


# Result is discarded: this is not the last expression of the cell
group(bipar_weighted)

#barplot of nodes above th
#th at least 5% of the data -> 26
#68, 187, 902, 1307

# Attribute dict of node 1307 (the only value this cell displays)
bipar_weighted.nodes[1307]