In [2]:
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 100)

# To remove pandas copy warnings (may need to turn on if writing new functions):
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from Bio.PDB import *
import community
import networkx as nx
import pygraphviz as pgv
import igraph as ig
from sklearn.metrics.cluster import normalized_mutual_info_score
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
%matplotlib inline
from IPython.display import Image

In [5]:
# Edge table, read as tab-separated text; each row links NodeId1 to NodeId2.
ThTh_edges = pd.read_csv('../Ring_Analysis/1VY4_rRNA_Phases/1VY4_rRNA_phases_rProtein_edges.txt', sep='\t')
ThTh_edges

Unnamed: 0,NodeId1,Interaction,NodeId2,Distance,Angle,Energy,Atom1,Atom2,Donor,Positive,Cation,Orientation
0,0:3001:_:MG,IAC:LIG_MC,0:21:_:LEU,5.220,-999.9,0.0,MG,O,,,,
1,0:3001:_:MG,IAC:LIG_SC,0:22:_:GLY,3.071,-999.9,0.0,MG,HA2,,,,
2,0:3001:_:MG,IAC:LIG_SC,0:23:_:VAL,1.977,-999.9,0.0,MG,H,,,,
3,0:3001:_:MG,IAC:LIG_MC,0:24:_:LYS,6.141,-999.9,0.0,MG,N,,,,
4,0:3001:_:MG,IAC:LIG_SC,0:26:_:TYR,4.767,-999.9,0.0,MG,HE1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
31410,f:2866:_:U,IAC:LIG_LIG,f:2869:_:G,6.756,-999.9,0.0,C5,O4',,,,
31411,f:2877:_:G,IAC:LIG_LIG,f:2880:_:C,6.711,-999.9,0.0,O6,N4,,,,
31412,f:2878:_:U,IAC:LIG_LIG,f:2881:_:C,6.821,-999.9,0.0,N3,N4,,,,
31413,f:2879:_:C,IAC:LIG_LIG,f:2882:_:A,3.487,-999.9,0.0,N3,N6,,,,


In [6]:
# Node table with per-node attributes and x, y, z coordinates.
ThTh_nodes = pd.read_csv('../Ring_Analysis/1VY4_rRNA_Phases/1VY4_rRNA_phases_rProtein_nodes_xyz_modified.txt')
# Chain -> Object lookup; `names=` supplies the column names, so the first
# line of the file is read as data rather than as a header.
name_chains = pd.read_csv('../standards/1VY4_name_chains_rRNA_phases.csv', names=['Object', 'Chain'])
# Attach the Object label to every node through its Chain.
# pd.merge defaults to an inner join, so any node whose Chain is missing from
# name_chains is dropped here and the result gets a fresh 0..N-1 index.
ThTh_nodes = pd.merge(ThTh_nodes, name_chains, on='Chain')
ThTh_nodes

Unnamed: 0,NodeId,Chain,Position,Residue,Dssp,Degree,Bfactor_CA,Rapdf,Tap,Accessibility,x,y,z,Object
0,a:2061:_:G,a,2061,G,,20,-999.90,-999.900,-999.900,-999.900,-41.375000,132.966003,167.074005,Phase1
1,a:2062:_:A,a,2062,A,,14,-999.90,-999.900,-999.900,-999.900,-39.689999,134.860992,160.151993,Phase1
2,a:2063:_:C,a,2063,C,,9,-999.90,-999.900,-999.900,-999.900,-41.956001,130.835007,155.792007,Phase1
3,a:2064:_:C,a,2064,C,,12,-999.90,-999.900,-999.900,-999.900,-47.741001,131.516006,153.901993,Phase1
4,a:2065:_:C,a,2065,C,,14,-999.90,-999.900,-999.900,-999.900,-51.830002,136.326996,153.990005,Phase1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6329,8:63:_:PRO,8,63,PRO,T,7,38.36,-11.299,0.091,0.296,-56.248001,181.615005,167.673004,bL35
6330,8:64:_:TYR,8,64,TYR,T,11,42.67,-33.017,-0.126,0.464,-55.664001,185.250000,166.755997,bL35
6331,8:65:_:GLU,8,65,GLU,,5,65.21,48.469,0.000,0.655,-58.987000,186.423996,165.337997,bL35
6332,8:5001:_:MG,8,5001,MG,,15,-999.90,-999.900,-999.900,-999.900,-64.934998,185.035004,155.401993,bL35


In [7]:
# Re-read the same node file (without the name_chains merge) and show only
# the rows of chain 'a'.
# NOTE(review): the name `test` is rebound by `def test()` further down this
# notebook, so this DataFrame is not reachable under that name afterwards.
test = pd.read_csv('../Ring_Analysis/1VY4_rRNA_Phases/1VY4_rRNA_phases_rProtein_nodes_xyz_modified.txt')
test[test.Chain == 'a']

Unnamed: 0,NodeId,Chain,Position,Residue,Dssp,Degree,Bfactor_CA,Rapdf,Tap,Accessibility,x,y,z
0,a:2061:_:G,a,2061,G,,20,-999.9,-999.9,-999.9,-999.9,-41.375000,132.966003,167.074005
1,a:2062:_:A,a,2062,A,,14,-999.9,-999.9,-999.9,-999.9,-39.689999,134.860992,160.151993
2,a:2063:_:C,a,2063,C,,9,-999.9,-999.9,-999.9,-999.9,-41.956001,130.835007,155.792007
3,a:2064:_:C,a,2064,C,,12,-999.9,-999.9,-999.9,-999.9,-47.741001,131.516006,153.901993
4,a:2065:_:C,a,2065,C,,14,-999.9,-999.9,-999.9,-999.9,-51.830002,136.326996,153.990005
...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,a:2497:_:A,a,2497,A,,20,-999.9,-999.9,-999.9,-999.9,-64.786003,130.789993,163.917007
114,a:2498:_:C,a,2498,C,,17,-999.9,-999.9,-999.9,-999.9,-63.306999,131.401001,169.535004
115,a:2499:_:C,a,2499,C,,19,-999.9,-999.9,-999.9,-999.9,-58.441002,130.048996,173.419998
116,a:2500:_:U,a,2500,U,,21,-999.9,-999.9,-999.9,-999.9,-52.285000,130.130005,173.384995


In [8]:
name_chains

Unnamed: 0,Object,Chain
0,Phase1,a
1,Phase2,b
2,Phase3,c
3,Phase4,d
4,Phase5,e
...,...,...
30,bL36,9
31,bL25,Z
32,bL31,4
33,bL32,5


In [9]:
def plot_nodes(df):
    """Render an interactive 3D scatter of nodes, one trace per ``Object``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Object``, ``x``, ``y``, ``z``, ``Residue``
        and ``Dssp``.

    Returns
    -------
    None
        The figure is displayed inline through ``iplot``.
    """
    data = []

    # ``unique()`` keeps first-appearance order, so trace order (and therefore
    # legend order and colour assignment) is the same on every run; iterating
    # a ``set`` of strings is not.
    for rPro in df['Object'].dropna().unique():
        rPro_df = df[df['Object'] == rPro]

        # A missing Dssp value would turn the whole concatenated label into
        # NaN, so blank it out before building the hover text.
        hover_text = rPro_df['Residue'] + ' ' + rPro_df['Dssp'].fillna('')

        data.append(
            go.Scatter3d(
                x=rPro_df['x'],
                y=rPro_df['y'],
                z=rPro_df['z'],
                text=hover_text,
                mode='markers',
                name=rPro
            )
        )

    layout = go.Layout(
        title='Thermus thermophilus Nodes (Atoms) Colored by rProtein and rRNA Phase',
        showlegend=True
    )

    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [10]:
plot_nodes(ThTh_nodes)

Unfortunately, I only modified the txt files that I use to make the plots; the xml file still does not have updated x, y, z values. However, I still try to use the dataframe as often as possible.

In [13]:
# Load the GraphML network into networkx. Node attributes (including NodeId)
# and edge attributes (including Energy) are read from the file.
G_ThTh = nx.read_graphml('../Ring_Analysis/1VY4_rRNA_Phases/1VY4_rRNA_phases_rProtein_network.xml')

In [14]:
G_ThTh.nodes['n0']

{'Accessibility': -999.9,
 'Bfactor_CA': -999.9,
 'Degree': 20.0,
 'NodeId': 'a:2061:_:G',
 'Position': 2061.0,
 'Rapdf': -999.9,
 'Residue': 'G',
 'Tap': -999.9,
 'name': 'a:2061:_:G',
 'pdbFileName': '1VY4_rRNA_phases_rProtein.pdb#2061.a',
 'x': -999.9,
 'y': -999.9,
 'z': -999.9}

In [15]:
G_ThTh.edges[('n0', 'n3637', 0)]

{'Angle': -999.9,
 'Atom1': 'OP1',
 'Atom2': 'MG',
 'Cation': 'None',
 'Distance': 6.705,
 'Donor': 'None',
 'Energy': 0.0,
 'Interaction': 'IAC:LIG_LIG',
 'NodeId1': 'a:2061:_:G',
 'NodeId2': 'F:303:_:MG',
 'Orientation': 'None',
 'Positive': 'None'}

In [16]:
def print_top_bottom_5(metric):
    """Print the five largest and the five smallest entries of ``metric``.

    ``metric`` is a dict mapping a key (e.g. a node name) to a sortable
    score. Each entry is printed as ``key <tab> score``; ties keep the
    dict's own ordering because the sort is stable.
    """
    # Both rankings are computed before anything is printed.
    highest = sorted(metric, key=metric.get, reverse=True)[:5]
    lowest = sorted(metric, key=metric.get)[:5]

    for heading, keys in (('top5:', highest), ('bottom5:', lowest)):
        print(heading)
        for key in keys:
            print(key, '\t', metric[key])

In [17]:
def print_centrality(graph):
    """Print the top/bottom five nodes by degree and eigenvector centrality.

    Both centralities are computed with networkx before anything is printed,
    then each one is reported through ``print_top_bottom_5``.
    """
    # Closeness, harmonic, betweenness and PageRank (alpha=0.85) reports were
    # switched off by the author because they take a long time on this graph;
    # Katz centrality does not work on a multigraph.
    reports = (
        ('degree:', nx.degree_centrality(graph)),
        ('\neigenvector:', nx.eigenvector_centrality_numpy(graph)),
    )
    for heading, scores in reports:
        print(heading)
        print_top_bottom_5(scores)

### Takes a while to run

In [18]:
print_centrality(G_ThTh)

degree:
top5:
n525 	 0.005842412758566241
n1883 	 0.0055266066635086064
n1254 	 0.005368703615979789
n2122 	 0.004894994473393336
n309 	 0.0047370914258645196
bottom5:
n646 	 0.00015790304752881732
n1107 	 0.00015790304752881732
n1591 	 0.00015790304752881732
n1643 	 0.00015790304752881732
n1644 	 0.00015790304752881732

eigenvector:
top5:
n448 	 0.133990981781
n530 	 0.128074683532
n529 	 0.128001265693
n309 	 0.120215516205
n449 	 0.120199632251
bottom5:
n5350 	 -4.35276418087e-18
n5152 	 -3.00898465991e-18
n5184 	 -2.95522561028e-18
n3831 	 -2.69253026602e-18
n6109 	 -2.26415560521e-18


In [34]:
def plot_nodes_partitions(df):
    """Render an interactive 3D scatter of nodes, one trace per community.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``partition``, ``x``, ``y``, ``z``,
        ``Residue``, ``Dssp``, ``Chain`` and ``Object``.

    Returns
    -------
    None
        The figure is displayed inline through ``iplot``.
    """
    data = []

    # Iterate over the labels that actually occur. The previous
    # ``range(df['partition'].max())`` stopped one short and never drew the
    # highest-numbered community. Rows without a label (NaN) are skipped.
    partition_labels = sorted(df['partition'].dropna().unique())

    for partition_count in partition_labels:
        partition_df = df[df['partition'] == partition_count]

        # A missing Dssp value would turn the whole concatenated label into
        # NaN, so blank it out; Chain is cast to str so numeric-looking chain
        # ids concatenate too.
        hover_text = (
            partition_df['Residue']
            + ' ' + partition_df['Dssp'].fillna('')
            + ' ' + partition_df['Chain'].astype(str)
            + ' ' + partition_df['Object']
        )

        data.append(
            go.Scatter3d(
                x=partition_df['x'],
                y=partition_df['y'],
                z=partition_df['z'],
                text=hover_text,
                mode='markers',
                # int() keeps the legend as 'partition3' even if the column
                # was promoted to float by missing values.
                name='partition' + str(int(partition_count))
            )
        )

    # One extra trace joining every node in DataFrame row order with lines.
    data.append(
        go.Scatter3d(
            x=df['x'],
            y=df['y'],
            z=df['z'],
            mode='lines',
            hoverinfo='none'
        )
    )

    layout = go.Layout(
        title='Coloring ThTh rProteins and rRNA Phases by Community',
        showlegend=True
    )

    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [35]:
def plot_louvain(res, G, make_plot=True, nodes_df=None):
    """Run Louvain community detection on ``G`` and label the node table.

    Parameters
    ----------
    res : float
        Resolution passed to ``community.best_partition``.
    G : networkx graph
        Graph to partition; edges are weighted by their ``Energy`` attribute.
        Nodes are expected to carry a ``NodeId`` attribute matching the
        ``NodeId`` column of the node table.
    make_plot : bool, default True
        When truthy, the labelled table is passed to
        ``plot_nodes_partitions``.
    nodes_df : pandas.DataFrame, optional
        Node table with a ``NodeId`` column. Defaults to the notebook-level
        ``ThTh_nodes``.

    Returns
    -------
    (dict, pandas.DataFrame)
        The partition (graph node key -> community label) and a copy of the
        node table with an added ``partition`` column. Rows whose ``NodeId``
        is not present in the graph get NaN in that column.
    """
    if nodes_df is None:
        nodes_df = ThTh_nodes

    partition = community.best_partition(G, resolution=res, weight='Energy')

    # The graph is keyed by its own node names while the table is keyed by
    # NodeId, so translate through the NodeId node attribute instead of
    # trusting that table row i and graph node i are the same residue (a
    # positional join silently mislabels rows if the two orders ever differ).
    partition_by_node_id = {
        (node_id if node_id is not None else node): partition[node]
        for node, node_id in G.nodes(data='NodeId')
    }
    ThTh_partition = nodes_df.assign(
        partition=nodes_df['NodeId'].map(partition_by_node_id)
    )

    print('Resolution:', res)
    print('Number of partitions:', len(set(partition.values())))
    # NOTE(review): the partition is optimised with weight='Energy' but the
    # modularity printed here uses python-louvain's default weight key.
    # Confirm that is the intended score.
    print('Modularity:', community.modularity(partition, G))
    if make_plot:
        plot_nodes_partitions(ThTh_partition)
    return (partition, ThTh_partition)

In [36]:
louvain5, lv5_df = plot_louvain(5, G_ThTh)

Resolution: 5
Number of partitions: 45
Modularity: 0.7737841228395241


In [22]:
lv5_df

Unnamed: 0,NodeId,Chain,Position,Residue,Dssp,Degree,Bfactor_CA,Rapdf,Tap,Accessibility,x,y,z,Object,partition
0,a:2061:_:G,a,2061,G,,20,-999.90,-999.900,-999.900,-999.900,-41.375000,132.966003,167.074005,Phase1,0
1,a:2062:_:A,a,2062,A,,14,-999.90,-999.900,-999.900,-999.900,-39.689999,134.860992,160.151993,Phase1,0
2,a:2063:_:C,a,2063,C,,9,-999.90,-999.900,-999.900,-999.900,-41.956001,130.835007,155.792007,Phase1,0
3,a:2064:_:C,a,2064,C,,12,-999.90,-999.900,-999.900,-999.900,-47.741001,131.516006,153.901993,Phase1,1
4,a:2065:_:C,a,2065,C,,14,-999.90,-999.900,-999.900,-999.900,-51.830002,136.326996,153.990005,Phase1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6329,8:63:_:PRO,8,63,PRO,T,7,38.36,-11.299,0.091,0.296,-56.248001,181.615005,167.673004,bL35,0
6330,8:64:_:TYR,8,64,TYR,T,11,42.67,-33.017,-0.126,0.464,-55.664001,185.250000,166.755997,bL35,0
6331,8:65:_:GLU,8,65,GLU,,5,65.21,48.469,0.000,0.655,-58.987000,186.423996,165.337997,bL35,14
6332,8:5001:_:MG,8,5001,MG,,15,-999.90,-999.900,-999.900,-999.900,-64.934998,185.035004,155.401993,bL35,14


In [23]:
plot_nodes(lv5_df.loc[lv5_df.partition == 2])

In [24]:
lv5_df.loc[lv5_df.partition == 1]

Unnamed: 0,NodeId,Chain,Position,Residue,Dssp,Degree,Bfactor_CA,Rapdf,Tap,Accessibility,x,y,z,Object,partition
3,a:2064:_:C,a,2064,C,,12,-999.90,-999.900,-999.900,-999.900,-47.741001,131.516006,153.901993,Phase1,1
63,a:2447:_:G,a,2447,G,,21,-999.90,-999.900,-999.900,-999.900,-52.375999,134.701004,169.470001,Phase1,1
64,a:2448:_:A,a,2448,A,,18,-999.90,-999.900,-999.900,-999.900,-56.721001,135.740997,166.720993,Phase1,1
65,a:2449:_:U,a,2449,U,,14,-999.90,-999.900,-999.900,-999.900,-59.466999,134.725006,161.389008,Phase1,1
66,a:2450:_:A,a,2450,A,,17,-999.90,-999.900,-999.900,-999.900,-57.667000,130.154007,158.809998,Phase1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5665,0:4:_:LYS,0,4,LYS,,12,77.31,-33.298,0.507,0.631,-59.555000,124.487999,149.201996,bL27,1
6217,5:13:_:LYS,5,13,LYS,H,17,27.06,-33.943,0.229,0.496,-34.617001,124.774002,200.593994,bL32,1
6218,5:14:_:ALA,5,14,ALA,H,12,24.52,-64.997,-0.211,0.411,-36.792000,122.024002,202.039993,bL32,1
6221,5:17:_:ASP,5,17,ASP,H,15,25.49,-120.910,0.164,0.425,-31.934000,119.652000,202.363998,bL32,1


In [None]:
# spring_layout needs the graph itself. `louvain5` is the node -> community
# dict, which networkx would treat as a bare node list without any edges.
spring = nx.spring_layout(G_ThTh)

In [None]:
# Convert the networkx graph into a pygraphviz AGraph so Graphviz can lay it out.
pgv_test = nx.nx_agraph.to_agraph(G_ThTh)

In [None]:
# Position the nodes with Graphviz's `dot` program, then render the result to
# test.png in the current working directory.
pgv_test.layout(prog='dot')
pgv_test.draw('test.png')

In [None]:
def community_layout(g, partition):
    """
    Compute node positions for a graph with community structure.

    Every node is placed at the position of its community (laid out with
    scale 3) plus its own position inside that community (laid out with
    scale 1).


    Arguments:
    ----------
    g -- networkx.Graph or networkx.DiGraph instance
        graph to plot

    partition -- dict mapping node -> community label
        graph partitions; must contain every node of g


    Returns:
    --------
    pos -- dict mapping node -> position
        sum of the community position and the within-community position

    """
    community_offsets = _position_communities(g, partition, scale=3.)
    local_offsets = _position_nodes(g, partition, scale=1.)

    # Coarse community placement plus fine placement inside the community.
    return {
        node: community_offsets[node] + local_offsets[node]
        for node in g.nodes()
    }

def _position_communities(g, partition, **kwargs):

    # create a weighted graph, in which each node corresponds to a community,
    # and each edge weight to the number of edges between communities
    between_community_edges = _find_between_community_edges(g, partition)

    communities = set(partition.values())
    hypergraph = nx.DiGraph()
    hypergraph.add_nodes_from(communities)
    for (ci, cj), edges in between_community_edges.items():
        hypergraph.add_edge(ci, cj, weight=len(edges))

    # find layout for communities
    pos_communities = nx.spring_layout(hypergraph, **kwargs)

    # set node positions to position of community
    pos = dict()
    for node, community in partition.items():
        pos[node] = pos_communities[community]

    return pos

def _find_between_community_edges(g, partition):

    edges = dict()

    for (ni, nj) in g.edges():
        ci = partition[ni]
        cj = partition[nj]

        if ci != cj:
            try:
                edges[(ci, cj)] += [(ni, nj)]
            except KeyError:
                edges[(ci, cj)] = [(ni, nj)]

    return edges

def _position_nodes(g, partition, **kwargs):
    """
    Positions nodes within communities.
    """

    communities = dict()
    for node, community in partition.items():
        try:
            communities[community] += [node]
        except KeyError:
            communities[community] = [node]

    pos = dict()
    for ci, nodes in communities.items():
        subgraph = g.subgraph(nodes)
        pos_subgraph = nx.spring_layout(subgraph, **kwargs)
        pos.update(pos_subgraph)

    return pos

def test():
    """Smoke-test ``community_layout`` on the karate-club graph.

    Partitions the graph with Louvain, computes the community layout and
    draws the graph with nodes coloured by community. Returns None.
    """
    # to install networkx 2.0 compatible version of python-louvain use:
    # pip install -U git+https://github.com/taynaud/python-louvain.git@networkx2
    from community import community_louvain
    # Imported here because the notebook's import cell does not bring in
    # pyplot; without it the plt.show() call below raises NameError.
    import matplotlib.pyplot as plt

    g = nx.karate_club_graph()
    partition = community_louvain.best_partition(g)
    pos = community_layout(g, partition)

    # Build the colours in g.nodes() order as a real list. A dict view is not
    # a sequence and is not guaranteed to follow the drawing order.
    node_colours = [partition[node] for node in g.nodes()]
    nx.draw(g, pos, node_color=node_colours)
    plt.show()

In [None]:
SO = community_layout(G_ThTh, louvain5)

In [None]:
SO

In [None]:
# `SO` is the node -> position dict returned by community_layout, so it is
# passed as `pos`; the graph to draw is G_ThTh. Nodes are coloured by their
# Louvain community.
nx.draw(G_ThTh, pos=SO, node_color=[louvain5[node] for node in G_ThTh.nodes()])

In [None]:
louvain6, lv6_df = plot_louvain(6, G_ThTh, False)

In [None]:
# Compare the two partitions node by node. Labels from louvain6 are looked up
# by louvain5's keys so both lists are guaranteed to be in the same node order.
normalized_mutual_info_score(
    list(louvain5.values()),
    [louvain6[node] for node in louvain5],
)

In [None]:
# Louvain resolutions to sweep: 39 evenly spaced values from 1 to 20 inclusive
# (a step of 0.5).
resolution = np.linspace(1, 20, num=39)
resolution

In [None]:
def make_prtn_mod_res_df(resolution_list, G):
    """Sweep Louvain resolutions and tabulate community count and modularity.

    Parameters
    ----------
    resolution_list : iterable of float
        Resolutions to pass to ``community.best_partition``.
    G : networkx graph
        Graph to partition; edges are weighted by their ``Energy`` attribute.

    Returns
    -------
    pandas.DataFrame
        One row per resolution with the columns ``Resolution``,
        ``Num_Partitions`` and ``Modularity``.
    """
    rows = []

    # Iterate over the argument. The previous loop read the notebook-level
    # `resolution` variable, so the parameter was ignored and the columns
    # could end up with different lengths.
    for res in resolution_list:
        partition = community.best_partition(G, resolution=res, weight='Energy')
        num_partitions = len(set(partition.values()))
        modularity = community.modularity(partition, G)
        rows.append((res, num_partitions, modularity))

    # Explicit column list keeps the column order independent of pandas/Python
    # dict-ordering behaviour and also works for an empty sweep.
    df = pd.DataFrame(rows, columns=['Resolution', 'Num_Partitions', 'Modularity'])

    return df

### This next cell takes a crazy long time to run, graph below may be an old version

In [None]:
# Run the Louvain sweep over every value in `resolution` (slow), then plot the
# number of communities and the modularity against resolution on two y-axes.
prtn_mod_res_df = make_prtn_mod_res_df(resolution, G_ThTh)

# Create traces
# Left axis: number of communities found at each resolution.
trace0 = go.Scatter(
    x = prtn_mod_res_df['Resolution'],
    y = prtn_mod_res_df['Num_Partitions'],
    mode = 'lines',
    name = 'Partitions'
)
# Right axis ('y2'): modularity of the partition at each resolution.
trace1 = go.Scatter(
    x = prtn_mod_res_df['Resolution'],
    y = prtn_mod_res_df['Modularity'],
    mode = 'lines',
    name = 'Modularity',
    yaxis='y2'
)


layout = go.Layout(
    # Title spelling corrected (was "Parition" / "Resoution").
    title='Modularity and Partition Number vs. Louvain Resolution',
    xaxis=dict(
        title='Louvain Resolution'
    ),
    yaxis=dict(
        title='Number of Partitions'
    ),
    # Second y-axis drawn over the first one, on the right-hand side.
    yaxis2=dict(
        title='Modularity',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    )
)

data = [trace0, trace1]
fig = go.Figure(data=data, layout=layout)
iplot(fig)

### Output makes no sense, all nodes are communities

In [None]:
# Load a GraphML network into igraph for the walktrap analysis.
# NOTE(review): this path (Ring_output/1VY4/..._LSU_rRNA+rProtein_network.xml)
# is a different file from the one loaded into G_ThTh with networkx. Confirm
# that the two analyses are meant to run on different networks.
ig_G = ig.Graph.Read_GraphML('../Ring_output/1VY4/1VY4_LSU_rRNA+rProtein_network.xml')

In [None]:
def walktrap_output(stps, graph=None, return_membership=False):
    """Run walktrap community detection and print a short summary.

    Parameters
    ----------
    stps : int
        Random-walk length passed to ``community_walktrap`` as ``steps``.
    graph : igraph.Graph, optional
        Graph to analyse; edges are weighted by their ``Energy`` attribute.
        Defaults to the notebook-level ``ig_G``.
    return_membership : bool, default False
        When False (the original behaviour) the dendrogram's merge pairs are
        returned flattened into one list. When True the per-vertex community
        labels of ``as_clustering()`` are returned instead. Only those labels
        are comparable between runs with a clustering score such as
        normalized mutual information.

    Returns
    -------
    list of int
        Flattened merges, or one community label per vertex (see above).
    """
    if graph is None:
        graph = ig_G

    walktrap = graph.community_walktrap(weights='Energy', steps=stps)
    print('Steps:', stps)
    print('Optimal count:', walktrap.optimal_count)
    # Flat clustering cut from the dendrogram.
    clustering = walktrap.as_clustering()
    print('Modularity:', graph.modularity(membership=clustering))

    if return_membership:
        return list(clustering.membership)
    return [vertex for merge in walktrap.merges for vertex in merge]

In [None]:
walktrap2 = walktrap_output(2)

In [None]:
walktrap4 = walktrap_output(4)

In [None]:
walktrap6 = walktrap_output(6)

In [None]:
walktrap8 = walktrap_output(8)

In [None]:
walktrap10 = walktrap_output(10)

In [None]:
# NOTE(review): walktrap2 and walktrap4 hold the flattened dendrogram merge
# pairs returned by walktrap_output, not one community label per vertex, so
# this score does not compare community assignments. TODO: confirm the intent.
normalized_mutual_info_score(walktrap2, walktrap4)