In [1]:
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 100)

# To remove pandas copy warnings:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import community
import networkx as nx
import igraph as ig
from sklearn.metrics.cluster import normalized_mutual_info_score
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
%matplotlib inline
from IPython.display import Image

#### Give features to rProteins by calculating:
- ResidueDepth 
- SecStructure
- amino acid + biophys prop
- atomic interactions of aa
- interaction type
- modularity determined by community detection
- aa centrality
- RNA Interactions
- partition/domain?

In [2]:
# Load the tab-separated edge table (one row per residue-residue interaction) from the
# Ring_Analysis directory.
SaCe_rPro_edges = pd.read_csv('../../Ring_Analysis/SC_rPro/SC_LSU_Protein_sup_AES_edges.txt', sep='\t')
# Bare expression so the notebook renders the frame (truncated by display.max_rows set above).
SaCe_rPro_edges

Unnamed: 0,NodeId1,Interaction,NodeId2,Distance,Angle,Energy,Atom1,Atom2,Donor,Positive,Cation,Orientation
0,A:3:_:ARG,HBOND:MC_MC,A:207:_:VAL,2.883,28.068,17.0,N,O,A:3:_:ARG,,,
1,A:3:_:ARG,VDW:SC_MC,A:207:_:VAL,4.021,-999.900,6.0,CB,C,,,,
2,A:3:_:ARG,HBOND:SC_SC,A:208:_:ASP,2.734,21.088,17.0,NE,OD2,A:3:_:ARG,,,
3,A:4:_:VAL,VDW:SC_SC,A:8:_:GLN,3.573,-999.900,6.0,CG1,CB,,,,
4,A:4:_:VAL,VDW:SC_SC,A:9:_:ARG,3.456,-999.900,6.0,CG1,CG,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
10061,q:187:_:VAL,VDW:SC_SC,q:190:_:VAL,3.731,-999.900,6.0,CG1,CG2,,,,
10062,q:189:_:GLN,VDW:SC_MC,q:196:_:VAL,3.317,-999.900,6.0,OE1,C,,,,
10063,q:189:_:GLN,VDW:SC_MC,q:197:_:PHE,3.639,-999.900,6.0,OE1,C,,,,
10064,q:190:_:VAL,HBOND:MC_MC,q:197:_:PHE,3.079,34.952,17.0,O,N,q:197:_:PHE,,,


In [3]:
# Chain-letter -> rProtein-name lookup; the CSV has no header row, so the columns are named here.
name_chains = pd.read_csv(
    '../../Ring_Analysis/SC_rPro/PDB_SaCe_LSU_rPro_chain_names.csv',
    names=['rProtein', 'Chain'],
)

# Per-residue node table: read it, discard the PDB file-name column, then attach the
# rProtein name of each residue by matching on its chain letter.
SaCe_rPro_nodes = (
    pd.read_csv('../../Ring_Analysis/SC_rPro/SC_LSU_Protein_sup_AES_nodes.txt', sep='\t')
    .drop(['pdbFileName'], axis=1)
    .merge(name_chains, on='Chain')
)
SaCe_rPro_nodes

Unnamed: 0,NodeId,Chain,Position,Residue,Dssp,Degree,Bfactor_CA,x,y,z,Rapdf,Tap,Accessibility,rProtein
0,A:3:_:ARG,A,3,ARG,,3,44.53,-23.242,145.781,140.369,-58.079,-0.052,0.536,uL02
1,A:4:_:VAL,A,4,VAL,,2,36.89,-20.489,147.173,138.130,34.829,0.037,0.293,uL02
2,A:5:_:ILE,A,5,ILE,,5,33.84,-20.039,144.590,135.428,0.543,-1.155,0.041,uL02
3,A:6:_:ARG,A,6,ARG,H,2,39.48,-16.807,142.987,134.443,21.172,0.259,0.274,uL02
4,A:7:_:ASN,A,7,ASN,H,1,50.43,-16.438,144.840,131.244,-14.251,0.035,0.328,uL02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5554,q:190:_:VAL,q,190,VAL,E,4,108.51,-146.683,63.539,185.996,-142.626,0.394,-999.900,uL10
5555,q:191:_:TYR,q,191,TYR,E,6,111.78,-148.854,66.621,185.608,-117.467,0.044,-999.900,uL10
5556,q:192:_:ASP,q,192,ASP,E,1,231.62,-152.572,66.436,184.809,-16.067,0.492,-999.900,uL10
5557,q:196:_:VAL,q,196,VAL,E,1,133.47,-148.457,66.168,190.487,0.837,-0.690,-999.900,uL10


In [4]:
# Show the rows where x, y or z is below -900, i.e. rows carrying the -999.9 placeholder
# instead of a real coordinate.
SaCe_rPro_nodes[(SaCe_rPro_nodes.x < -900) | (SaCe_rPro_nodes.y < -900)  | (SaCe_rPro_nodes.z < -900)]

Unnamed: 0,NodeId,Chain,Position,Residue,Dssp,Degree,Bfactor_CA,x,y,z,Rapdf,Tap,Accessibility,rProtein
5109,j:501:_:ZN,j,501,ZN,,17,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,aL37
5264,m:500:_:ZN,m,500,ZN,,20,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,aL40
5366,o:501:_:ZN,o,501,ZN,,18,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,uL33
5451,p:501:_:ZN,p,501,ZN,,16,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,aL43


Remove these 4 zincs from just the node list

In [5]:
# Drop the rows that have a -999.9 placeholder in any coordinate (the four zinc ions listed
# above). The keep-mask is the exact negation of the inspection mask: the previous
# `(x > -900) | (y > -900) | (z > -900)` form would have kept a row as soon as a single
# coordinate was valid, which is not the complement of "any coordinate is a placeholder".
missing_coords = (
    (SaCe_rPro_nodes.x < -900)
    | (SaCe_rPro_nodes.y < -900)
    | (SaCe_rPro_nodes.z < -900)
)
SaCe_rPro_nodes = SaCe_rPro_nodes[~missing_coords]

In [6]:
SaCe_rPro_nodes

Unnamed: 0,NodeId,Chain,Position,Residue,Dssp,Degree,Bfactor_CA,x,y,z,Rapdf,Tap,Accessibility,rProtein
0,A:3:_:ARG,A,3,ARG,,3,44.53,-23.242,145.781,140.369,-58.079,-0.052,0.536,uL02
1,A:4:_:VAL,A,4,VAL,,2,36.89,-20.489,147.173,138.130,34.829,0.037,0.293,uL02
2,A:5:_:ILE,A,5,ILE,,5,33.84,-20.039,144.590,135.428,0.543,-1.155,0.041,uL02
3,A:6:_:ARG,A,6,ARG,H,2,39.48,-16.807,142.987,134.443,21.172,0.259,0.274,uL02
4,A:7:_:ASN,A,7,ASN,H,1,50.43,-16.438,144.840,131.244,-14.251,0.035,0.328,uL02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5554,q:190:_:VAL,q,190,VAL,E,4,108.51,-146.683,63.539,185.996,-142.626,0.394,-999.900,uL10
5555,q:191:_:TYR,q,191,TYR,E,6,111.78,-148.854,66.621,185.608,-117.467,0.044,-999.900,uL10
5556,q:192:_:ASP,q,192,ASP,E,1,231.62,-152.572,66.436,184.809,-16.067,0.492,-999.900,uL10
5557,q:196:_:VAL,q,196,VAL,E,1,133.47,-148.457,66.168,190.487,0.837,-0.690,-999.900,uL10


In [7]:
# Select a single example rProtein to inspect.
# The earlier version looped over set(SaCe_rPro_nodes['rProtein']) and kept whichever
# protein happened to come last; set order is arbitrary, so the result was not
# reproducible. 'aL41' is the protein shown in the saved output of the next cell.
rPro = 'aL41'
rPro_df = SaCe_rPro_nodes[SaCe_rPro_nodes['rProtein'] == rPro]

In [8]:
rPro_df

Unnamed: 0,NodeId,Chain,Position,Residue,Dssp,Degree,Bfactor_CA,x,y,z,Rapdf,Tap,Accessibility,rProtein
5265,n:1:_:MET,n,1,MET,,3,47.85,-35.516,104.338,87.619,8.235,-0.000,0.585,aL41
5266,n:2:_:ARG,n,2,ARG,,3,46.56,-32.192,105.450,86.056,-6.241,0.008,0.547,aL41
5267,n:3:_:ALA,n,3,ALA,H,2,40.87,-29.490,103.065,87.252,-71.282,-0.211,0.597,aL41
5268,n:4:_:LYS,n,4,LYS,H,2,51.80,-27.253,105.787,88.616,0.000,0.579,0.500,aL41
5269,n:5:_:TRP,n,5,TRP,H,5,48.81,-30.064,106.801,90.899,-76.107,-0.229,0.379,aL41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5285,n:21:_:ARG,n,21,ARG,H,2,45.48,-27.752,98.171,113.439,-53.301,-0.136,0.748,aL41
5286,n:22:_:ALA,n,22,ALA,H,4,61.82,-25.468,100.591,115.423,-65.234,0.092,0.411,aL41
5287,n:23:_:ARG,n,23,ARG,H,2,65.79,-28.257,101.393,117.855,-32.751,-0.114,0.763,aL41
5288,n:24:_:SER,n,24,SER,E,1,69.15,-27.344,98.048,119.412,-38.356,-0.497,0.600,aL41


In [9]:
def plot_nodes(df):
    """Render an interactive 3D scatter of the nodes, one trace per rProtein.

    Parameters
    ----------
    df : pandas.DataFrame
        Node table. The columns read here are 'rProtein', 'x', 'y', 'z',
        'Residue' and 'Dssp'.

    Returns
    -------
    None
        The figure is displayed with ``iplot``; nothing is returned.
    """
    data = []

    # groupby(sort=True) yields the proteins in sorted order, so the legend order and the
    # colour given to each protein are the same on every run (iterating a set is not).
    for rPro, rPro_df in df.groupby('rProtein', sort=True):
        # A missing Dssp value would turn the whole concatenated hover string into NaN,
        # so it is replaced by an empty string first.
        hover_text = rPro_df['Residue'].astype(str) + ' ' + rPro_df['Dssp'].fillna('').astype(str)

        data.append(
            go.Scatter3d(
                x=rPro_df['x'],
                y=rPro_df['y'],
                z=rPro_df['z'],
                text=hover_text,
                mode='markers',
                name=rPro,
            )
        )

    # Each node is one residue (see the 'Residue' column), hence "Residues" in the title.
    layout = go.Layout(
        title='Saccharomyces cerevisiae Nodes (Residues) Colored by rProtein',
        showlegend=True,
    )

    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [10]:
plot_nodes(SaCe_rPro_nodes)

In [12]:
G_SaCe_rPro = nx.read_graphml('../../Ring_Analysis/SC_rPro/SC_LSU_Protein_sup_AES_network.xml')

In [13]:
G_SaCe_rPro.nodes['n0']

{'Accessibility': 0.536,
 'Bfactor_CA': 44.53,
 'Chain': 'A',
 'Degree': 3.0,
 'Dssp': ' ',
 'NodeId': 'A:3:_:ARG',
 'Position': 3.0,
 'Rapdf': -58.079,
 'Residue': 'ARG',
 'Tap': -0.052,
 'name': 'A:3:_:ARG',
 'pdbFileName': 'SC_LSU_Protein_sup_AES.pdb#3.A',
 'x': -23.242,
 'y': 145.781,
 'z': 140.369}

In [14]:
G_SaCe_rPro.edges[('n0', 'n186', 0)]

{'Angle': 28.068,
 'Atom1': 'N',
 'Atom2': 'O',
 'Cation': 'None',
 'Distance': 2.883,
 'Donor': 'A:3:_:ARG',
 'Energy': 17.0,
 'Interaction': 'HBOND:MC_MC',
 'NodeId1': 'A:3:_:ARG',
 'NodeId2': 'A:207:_:VAL',
 'Orientation': 'None',
 'Positive': 'None'}

In [15]:
def plot_nodes_partitions(df):
    """Render an interactive 3D scatter of the nodes, one trace per partition.

    Parameters
    ----------
    df : pandas.DataFrame
        Node table with a 'partition' column. The other columns read here are
        'x', 'y', 'z', 'Residue', 'Dssp', 'Chain' and 'rProtein'.

    Returns
    -------
    None
        The figure is displayed with ``iplot``; nothing is returned.
    """
    data = []

    # Iterate over the partition ids that are actually present. The previous
    # range(df['partition'].max()) stopped one short, so the highest-numbered partition was
    # never drawn, and it raised if max() was a float (e.g. when some rows have no partition).
    partition_ids = sorted(df['partition'].dropna().unique())

    for partition_id in partition_ids:
        partition_df = df[df['partition'] == partition_id]

        # Missing Dssp values are blanked so they do not turn the hover string into NaN.
        hover_text = (
            partition_df['Residue'].astype(str)
            + ' ' + partition_df['Dssp'].fillna('').astype(str)
            + ' ' + partition_df['Chain'].astype(str)
            + ' ' + partition_df['rProtein'].astype(str)
        )

        data.append(
            go.Scatter3d(
                x=partition_df['x'],
                y=partition_df['y'],
                z=partition_df['z'],
                text=hover_text,
                mode='markers',
                # int() keeps the trace name as 'partition3' even if the column is float.
                name='partition' + str(int(partition_id)),
            )
        )

    layout = go.Layout(
        title='Coloring SaCe rProteins by Community',
        showlegend=True,
    )

    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [16]:
def plot_louvain(res, G, make_plot=True):
    """Run Louvain community detection on ``G`` and attach the result to the node table.

    Parameters
    ----------
    res : float
        Value passed as ``resolution`` to ``community.best_partition``.
    G : networkx graph
        Graph to partition; edges are weighted by their 'Energy' attribute.
    make_plot : bool, default True
        When truthy, draw the partitioned nodes with ``plot_nodes_partitions``.

    Returns
    -------
    (dict, pandas.DataFrame)
        The node -> community mapping returned by ``best_partition`` and a copy of the
        notebook-level ``SaCe_rPro_nodes`` table with an added 'partition' column.
    """
    partition = community.best_partition(G, resolution=res, weight='Energy')

    # Match communities to table rows by residue NodeId rather than by row position.
    # The previous positional index join only gave correct labels if the graph's node order
    # happened to equal the table's row order. Graph nodes without a 'NodeId' attribute
    # fall back to their graph key.
    partition_by_node_id = {
        G.nodes[node].get('NodeId', node): community_id
        for node, community_id in partition.items()
    }
    SaCe_rPro_partition = SaCe_rPro_nodes.assign(
        partition=SaCe_rPro_nodes['NodeId'].map(partition_by_node_id)
    )

    print('Resolution:', res)
    print('Number of partitions:', len(set(partition.values())))
    # NOTE(review): modularity is evaluated without weight='Energy', i.e. with the library's
    # default weight key - confirm that this is the intended score.
    print('Modularity:', community.modularity(partition, G))

    if make_plot:
        plot_nodes_partitions(SaCe_rPro_partition)
    return (partition, SaCe_rPro_partition)

### """The resolution limit of modularity is a well studied phenomenon that imposes a limit on the size of the smallest community one can obtain by modularity optimisation."""

Lambiotte, R., Delvenne, J.-C., & Barahona, M. (2008). Laplacian Dynamics and Multiscale Modular Structure in Networks, 1–29. https://doi.org/10.1109/TNSE.2015.2391998

In [17]:
louvain5, df = plot_louvain(5, G_SaCe_rPro)

Resolution: 5
Number of partitions: 194
Modularity: 0.8045209633506075


In [18]:
# 50 evenly spaced Louvain resolution values from 0.1 to 5 (both ends included).
resolution = np.linspace(start=0.1, stop=5, num=50)

In [19]:
def make_prtn_mod_res_df(resolution_list, G):
    """Tabulate partition count and modularity of Louvain runs over several resolutions.

    Parameters
    ----------
    resolution_list : iterable of float
        Resolution values; one ``community.best_partition`` run is made per value.
    G : networkx graph
        Graph to partition; edges are weighted by their 'Energy' attribute.

    Returns
    -------
    pandas.DataFrame
        One row per resolution with columns 'Resolution', 'Num_Partitions' and 'Modularity'.
    """
    # Materialise once so the same values drive the loop and fill the 'Resolution' column.
    resolution_values = list(resolution_list)
    modularity_list = []
    partition_list = []

    # Iterate over the argument. The previous version looped over the notebook-level
    # `resolution` variable, so the parameter was ignored (and the column lengths could
    # disagree with the loop results).
    for res in resolution_values:
        partition = community.best_partition(G, resolution=res, weight='Energy')
        partition_list.append(len(set(partition.values())))
        modularity_list.append(community.modularity(partition, G))

    df = pd.DataFrame(
        {'Resolution': resolution_values,
         'Num_Partitions': partition_list,
         'Modularity': modularity_list})

    return df

In [20]:
prtn_mod_res_df = make_prtn_mod_res_df(resolution, G_SaCe_rPro)

In [21]:
# Dual-axis line plot: number of partitions (left axis) and modularity (right axis)
# against the Louvain resolution, both taken from prtn_mod_res_df.
trace0 = go.Scatter(
    x = prtn_mod_res_df['Resolution'],
    y = prtn_mod_res_df['Num_Partitions'],
    mode = 'lines',
    name = 'Partitions'
)
trace1 = go.Scatter(
    x = prtn_mod_res_df['Resolution'],
    y = prtn_mod_res_df['Modularity'],
    mode = 'lines',
    name = 'Modularity',
    # Bind this trace to the secondary y axis declared in the layout below.
    yaxis='y2'
)


layout = go.Layout(
    # Title spelling corrected ("Partition", "Resolution").
    title='Modularity and Partition Number vs. Louvain Resolution',
    xaxis=dict(
        title='Louvain Resolution'
    ),
    yaxis=dict(
        title='Number of Partitions'
    ),
    yaxis2=dict(
        title='Modularity',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        # Draw the second axis on top of the first, on the right-hand side.
        overlaying='y',
        side='right'
    )
)

data = [trace0, trace1]
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [None]:
# Build the Louvain dendrogram, starting from the louvain5 partition.
# The graph variable in this notebook is G_SaCe_rPro; the bare name `G` only exists as a
# function parameter and is undefined at cell level.
dendrogram5 = community.generate_dendrogram(G_SaCe_rPro, part_init=louvain5, weight='Energy')

In [None]:
for level in range(len(dendrogram5) - 1) :
    print(level)

I spent so much time playing with the data generated by the Louvain method that I ran out of time to really explore the walktrap method. Here are my quick results from the walktrap method.

In [None]:
# Load the same GraphML network into igraph. The path now matches the one used for
# nx.read_graphml earlier in the notebook instead of pointing at the working directory.
ig_G = ig.Graph.Read_GraphML('../../Ring_Analysis/SC_rPro/SC_LSU_Protein_sup_AES_network.xml')

In [None]:
def walktrap_output(stps, return_membership=False):
    """Run walktrap community detection on ``ig_G`` and print a short summary.

    Parameters
    ----------
    stps : int
        Random-walk length passed as ``steps`` to ``community_walktrap``.
    return_membership : bool, default False
        When False (the default, unchanged behaviour) the flattened merge list of the
        dendrogram is returned. When True the per-vertex community membership of the
        dendrogram's default clustering is returned instead; that list has one entry per
        vertex, which is the form clustering-comparison scores take.

    Returns
    -------
    list of int
        Flattened ``walktrap.merges`` or, if requested, the membership vector.
    """
    # Local import: `time` is not imported in the notebook's import cell.
    import time

    # The previous `%timeit walktrap = ...` line never bound `walktrap` (names assigned
    # inside %timeit are discarded), so every later line raised NameError. Run the
    # algorithm once and time it directly instead.
    start = time.perf_counter()
    walktrap = ig_G.community_walktrap(weights='Energy', steps=stps)
    elapsed = time.perf_counter() - start

    clustering = walktrap.as_clustering()

    print('Steps:', stps)
    print('Run time (s):', round(elapsed, 3))
    print('Optimal count:', walktrap.optimal_count)
    print('Modularity:', ig_G.modularity(membership=clustering))

    if return_membership:
        return list(clustering.membership)
    return [e for l in walktrap.merges for e in l]

In [None]:
walktrap2 = walktrap_output(2)

In [None]:
walktrap4 = walktrap_output(4)

In [None]:
walktrap6 = walktrap_output(6)

In [None]:
walktrap8 = walktrap_output(8)

In [None]:
walktrap10 = walktrap_output(10)

In [None]:
normalized_mutual_info_score(walktrap2, walktrap4)

In [None]:
# Compare two Louvain partitions by normalized mutual information of their community labels.
# NOTE(review): `louvain25` is not defined in any cell visible in this notebook; presumably
# it is the partition from another plot_louvain run (resolution unclear) - define it first.
normalized_mutual_info_score(list(louvain5.values()), list(louvain25.values()))

The number of large subunit rProteins in Saccharomyces cerevisiae is 43, but the Louvain community algorithm is partitioning them into a larger number of communities. It's hard to see unless interacting with the 3D plots, but nodes (residues) that are in the center of the rProteins are often being split across multiple partitions, sometimes together with residues from other rProteins. I do not fully understand the scientific importance of this finding, but I think this is a very interesting observation which I plan to examine much further in the future.

The modularity within each of the two algorithms does not change when the number of steps or the resolution is changed, but the modularity differs between the two algorithms: Louvain has a modularity of 0.81 while walktrap has a modularity of 0.91. The walktrap algorithm takes between 500 ms and about 1 sec to run, while the Louvain algorithm takes closer to 2 sec.

I chose the normalized mutual information score to compare the algorithms and their different metrics. I could not compare the two algorithms with each other because the lengths of the lists differed, and I could not figure out how to resolve this. Within each algorithm, changing the parameters did not move the metric below 1, which is odd because altering the parameters gives different partitions in the Louvain algorithm, so I expected the normalized metric to change. I need to explore this metric further because the output does not make sense.

# 3 Problem 3 (30 points)

###### Consider a bipartite network with two types of nodes, Male and Female (M and F). Suppose that a certain Sexually Transmitted Disease (STD) can be transmitted only from M to F (with transmission rate $\beta_{M \to F}$) and from F to M (with transmission rate $\beta_{F \to M}$). Write the equations of the corresponding SI model, assuming the degree-block approximation and that there is no correlation between the node degrees.

Assume distribution of degree k in M and F is the same

For a male of degree k:

$ \frac{di_{Mk}}{dt} = \beta_{F \to M}\,(1 - i_{Mk})\,k\,\theta_{Fk} $

For a female of degree k:

$ \frac{di_{Fk}}{dt} = \beta_{M \to F}\,(1 - i_{Fk})\,k\,\theta_{Mk} $