# Data gathering

In this part, we gather as much proteomics data as possible from the published literature and from databases. The main sources are:

1. CyanoMapDB: a database of cyanobacterial protein–protein interactions (PPIs) with experimental evidence, comprising 52,304 PPIs among 6,789 proteins from 23 cyanobacterial species. It collects the available data from UniProt, STRING, and IntAct, and additionally mines numerous PPIs from co-fractionation MS (CF-MS) data in cyanobacteria.
2. Two papers, "Native Protein Complexes in Synechocystis sp. PCC 6803" and "Comparative Network Biology Discovers Protein Complexes That Underline Cellular Differentiation in Anabaena sp.", describe how to construct protein complexes from interactome data with a clustering method. We follow the same approach and apply the same clustering method to the CyanoMapDB interactome data of S. elongatus PCC 7942.
3. Co-fractionation proteomics data for S. elongatus PCC 7942 were reported in the paper "Monitoring light/dark association dynamics of multi-protein complexes in cyanobacteria using size exclusion chromatography-based proteomics". CyanoMapDB includes this dataset and used it to predict the interactome.

In [1]:
import pandas as pd
import os, sys, re
from pathlib import Path
# Absolute path of the current user's home directory, as a plain string.
home = os.fspath(Path.home())

In [2]:
# Root folder of the S. elongatus modelling project, located under the user's home directory.
work_dir = f"{home}/Dropbox/PNNL/PredPheno/SystemModeling/Modeling/S_elongatus"

In [3]:
# Both tables live in the same Excel workbook, so open and parse it once and
# pull the two sheets out of the returned {sheet name: DataFrame} mapping.
dataset_path = work_dir + "/data/interactome/Synechococcus_PCC_7942_Dataset.xlsx"
dataset_sheets = pd.read_excel(dataset_path, sheet_name=["Protein", "PPI"])
proteome = dataset_sheets["Protein"]  # "Protein" sheet of the workbook
interactome = dataset_sheets["PPI"]  # "PPI" sheet: one row per protein pair


In [4]:
# Inspect the PPI sheet; pandas truncates the rendered table for long frames.
interactome

Unnamed: 0,Protein A,Protein B,Taxon,UniProt evidence,STRING score,IntAct score,IntAct method,GS complex evidence,CF-MS score,CF-MS ID A,CF-MS ID B,PPI index
0,O05161,P16954,Synechococcus elongatus (strain PCC 7942 / FAC...,,,,,,0.651,1G6TR,1G1IW,0.651
1,O05161,Q31KN7,Synechococcus elongatus (strain PCC 7942 / FAC...,,,,,,0.595,1G6TR,1G152,0.595
2,O05161,Q31LJ5,Synechococcus elongatus (strain PCC 7942 / FAC...,,,,,,0.694,1G6TR,1G2EH,0.694
3,O05161,Q31LM9,Synechococcus elongatus (strain PCC 7942 / FAC...,,,,,,0.639,1G6TR,1G4ZP,0.639
4,O05161,Q31N38,Synechococcus elongatus (strain PCC 7942 / FAC...,,,,,,0.647,1G6TR,1GCIT,0.647
...,...,...,...,...,...,...,...,...,...,...,...,...
4529,Q8GMR7,Q99QJ5,Synechococcus elongatus (strain PCC 7942 / FAC...,,,,,,0.598,1G0PI,1G030,0.598
4530,Q8GMT0,Q935Z3,Synechococcus elongatus (strain PCC 7942 / FAC...,,,,,,0.849,1G6TX,1G1IA,0.849
4531,Q8KPQ0,Q9L4P3,Synechococcus elongatus (strain PCC 7942 / FAC...,,,,,,0.675,1G0ND,1G126,0.675
4532,Q8KPU9,Q8L1E5,Synechococcus elongatus (strain PCC 7942 / FAC...,,,,,,0.696,1G15V,1G1X2,0.696


In [5]:
# Write the interactome as a weighted edge list for ClusterONE:
# one line per pair -> protein A, protein B, PPI index (used as the edge weight).
cl1_input = interactome[["Protein A", "Protein B", "PPI index"]]
input_file = work_dir + "/data/interactome/ClusterONE_input.tsv"
# `index` and `header` are boolean switches; pass explicit False rather than
# relying on None being falsy. The file has no header row and no index column.
cl1_input.to_csv(input_file, sep="\t", index=False, header=False)

In [6]:
import subprocess

# Location of the ClusterONE command-line jar.
CLUSTERONE_JAR = home + "/Dropbox/PNNL/PredPheno/SystemModeling/tools/cluster_one-1.2.jar"
# Options shared by every run: -s minimum cluster size, -d minimum cluster
# density, -f input format (the edge list written in the previous cell).
CLUSTERONE_OPTIONS = ["-s", "3", "-d", "0.5", "-f", "edge_list"]


def run_clusterone(edge_list_path, result_path, output_format):
    """Run ClusterONE on an edge-list file and store its stdout in ``result_path``.

    Parameters
    ----------
    edge_list_path : str
        Path of the edge-list file passed to ClusterONE.
    result_path : str
        File that receives ClusterONE's standard output (overwritten).
    output_format : str
        Value for ClusterONE's ``-F`` option (this notebook uses "plain" and "csv").

    Raises
    ------
    subprocess.CalledProcessError
        If the Java process exits with a non-zero status, so a failed run is
        not mistaken for an empty clustering result.
    """
    command = [
        "java", "-jar", CLUSTERONE_JAR,
        *CLUSTERONE_OPTIONS,
        "-F", output_format,
        edge_list_path,
    ]
    with open(result_path, "w") as result:
        # Only stdout is redirected; ClusterONE's progress messages still show
        # up in the cell output.
        subprocess.run(command, stdout=result, check=True)


output_file = work_dir + "/data/interactome/ClusterONE_output.tsv"
output_file_csv = work_dir + "/data/interactome/ClusterONE_output.csv"
run_clusterone(input_file, output_file, "plain")
run_clusterone(input_file, output_file_csv, "csv")

Loaded graph with 679 nodes and 4534 edges
Detected 80 complexes
Loaded graph with 679 nodes and 4534 edges
Detected 80 complexes


In [7]:
# Make the project-local helper modules importable.
# NOTE(review): this path is relative to the kernel's working directory, unlike
# the absolute `work_dir` used elsewhere; confirm the notebook is always
# started from the project root.
helper_module_dir = "./data/interactome"
# Guard the append so re-running the cell does not pile up duplicate entries.
if helper_module_dir not in sys.path:
    sys.path.append(helper_module_dir)
import utils3 as util3

In [8]:
import GoldStandard as GS
# Load the ClusterONE predictions (plain-format output file) into the
# project's Clusters container for evaluation.
# NOTE(review): the meaning of the positional `False` argument is defined in
# GoldStandard.Clusters, which is not visible here; confirm it there.
pred_clusters = GS.Clusters(False)
# The cell output reports the average predicted complex size, presumably
# printed by read_file.
pred_clusters.read_file(output_file)


Average size of predicted complexes is: 7.5


In [9]:
# Read back the header-less, tab-separated edge list that was handed to ClusterONE.
network_df = pd.read_csv(input_file, delimiter="\t", header=None)
# Show (number of edges, number of columns).
network_df.shape

(4534, 3)

In [10]:
# Name the three edge-list columns: the two interacting proteins and the edge score.
edge_list_columns = ["ProtA", "ProtB", "Score"]
network_df.columns = edge_list_columns

In [11]:
# Annotate every edge with the predicted complexes that contain BOTH of its
# endpoints. An edge can fall into several overlapping complexes.
clusters = pred_clusters

# Build each complex's member set once (instead of once per edge) so that the
# membership tests below are O(1).
# Assumes the members are hashable protein IDs (TODO confirm).
complex_members = {
    complex_id: set(clusters.complexes[complex_id])
    for complex_id in clusters.complexes
}

# For each edge, the ids of all complexes containing both proteins, in the
# iteration order of `clusters.complexes`.
complex_idx = [
    [
        complex_id
        for complex_id, members in complex_members.items()
        if prot_a in members and prot_b in members
    ]
    for prot_a, prot_b in zip(network_df.iloc[:, 0], network_df.iloc[:, 1])
]
# Number of complexes each edge belongs to (0 = edge is outside every complex).
complex_num = [len(edge_complexes) for edge_complexes in complex_idx]

# Work on a copy so the raw edge list in `network_df` stays untouched.
complexes_df = network_df.copy()
complexes_df["Complex idx"] = complex_idx
complexes_df["Complex num"] = complex_num
complexes_df

Unnamed: 0,ProtA,ProtB,Score,Complex idx,Complex num
0,O05161,P16954,0.651,[],0
1,O05161,Q31KN7,0.595,[],0
2,O05161,Q31LJ5,0.694,[],0
3,O05161,Q31LM9,0.639,[],0
4,O05161,Q31N38,0.647,[],0
...,...,...,...,...,...
4529,Q8GMR7,Q99QJ5,0.598,[],0
4530,Q8GMT0,Q935Z3,0.849,[],0
4531,Q8KPQ0,Q9L4P3,0.675,[],0
4532,Q8KPU9,Q8L1E5,0.696,[50],1


In [12]:
# Distinct values of "Complex num", i.e. how many complexes a single edge can belong to.
set(complexes_df["Complex num"].tolist())

{0, 1, 2, 3, 4}

In [13]:
# Keep only the edges that fall inside at least one predicted complex.
in_some_complex = complexes_df["Complex num"].gt(0)
complexes = complexes_df.loc[in_some_complex].copy()
print(complexes.shape)
# One row per (edge, complex) pair: edges shared by several complexes are repeated.
complexes = complexes.explode("Complex idx")
complexes.head(20)

(2071, 5)


Unnamed: 0,ProtA,ProtB,Score,Complex idx,Complex num
11,O06865,P43087,0.811,14,2
11,O06865,P43087,0.811,21,2
12,O06865,P50021,0.795,21,1
13,O06865,Q31KE9,0.716,14,2
13,O06865,Q31KE9,0.716,21,2
27,O06865,Q59984,0.835,14,2
27,O06865,Q59984,0.835,21,2
28,O06865,Q79N42,0.618,14,2
28,O06865,Q79N42,0.618,21,2
29,O06865,Q7X4K8,0.714,14,2


In [14]:
# Build complex-specific node labels ("<protein>@Complex_<idx>") so that a
# protein shared by several complexes becomes a separate node in each of them.
complex_suffix = "@Complex_" + complexes["Complex idx"].astype(str)
complexes["Source"] = complexes["ProtA"] + complex_suffix
complexes["Target"] = complexes["ProtB"] + complex_suffix

In [15]:
# Show the per-complex edge table including the Source/Target node labels.
complexes

Unnamed: 0,ProtA,ProtB,Score,Complex idx,Complex num,Source,Target
11,O06865,P43087,0.811,14,2,O06865@Complex_14,P43087@Complex_14
11,O06865,P43087,0.811,21,2,O06865@Complex_21,P43087@Complex_21
12,O06865,P50021,0.795,21,1,O06865@Complex_21,P50021@Complex_21
13,O06865,Q31KE9,0.716,14,2,O06865@Complex_14,Q31KE9@Complex_14
13,O06865,Q31KE9,0.716,21,2,O06865@Complex_21,Q31KE9@Complex_21
...,...,...,...,...,...,...,...
4524,Q79PF6,Q79PF6,0.998,56,1,Q79PF6@Complex_56,Q79PF6@Complex_56
4525,Q79PF6,Q8GLI4,1.000,56,1,Q79PF6@Complex_56,Q8GLI4@Complex_56
4526,Q8GJM0,Q9Z3G5,0.707,2,1,Q8GJM0@Complex_2,Q9Z3G5@Complex_2
4532,Q8KPU9,Q8L1E5,0.696,50,1,Q8KPU9@Complex_50,Q8L1E5@Complex_50


In [16]:
import networkx as nx

In [17]:
# Build the graph of per-complex edges; each edge carries its "Score" as an attribute.
G = nx.from_pandas_edgelist(
    complexes,
    source="Source",
    target="Target",
    edge_attr="Score",
)


In [18]:
# G.nodes

In [19]:
import random
import matplotlib.pyplot as plt
plt.style.use('default')
plt.rcParams['figure.facecolor'] = 'white'

# Dedicated, seeded RNG so the component colours (and therefore the saved
# figure) are identical on every Restart & Run All, without touching the
# global `random` state.
COMPONENT_COLOR_SEED = 0
color_rng = random.Random(COMPONENT_COLOR_SEED)

plt.figure(1, figsize=(8, 8))
# Lay out the whole graph once with graphviz "neato"; every component is drawn
# from this shared position table.
pos = nx.nx_agraph.graphviz_layout(G, prog="neato")
# Draw each connected component with a single colour value in [0, 1) shared by
# all of its nodes; vmin/vmax pin the colour scale to that range.
for component_nodes in nx.connected_components(G):
    component = G.subgraph(component_nodes)
    component_colors = [color_rng.random()] * nx.number_of_nodes(component)
    nx.draw(component, pos, node_size=40, node_color=component_colors, vmin=0.0, vmax=1.0, with_labels=False)
plt.savefig(work_dir + '/data/interactome/complexes.pdf', bbox_inches='tight')

In [25]:
# Import only the widget class that is used: a star import hides where names
# come from and can silently shadow existing notebook variables.
from ipycytoscape import CytoscapeWidget

# Empty interactive Cytoscape widget; the graph is added in the next cell.
cyto = CytoscapeWidget()

In [26]:
# Feed the widget one connected component of G at a time.
for component_nodes in nx.connected_components(G):
    cyto.graph.add_graph_from_networkx(G.subgraph(component_nodes))

In [27]:
# Render the interactive Cytoscape widget in the notebook.
display(cyto)

CytoscapeWidget(cytoscape_layout={'name': 'cola'}, cytoscape_style=[{'selector': 'node', 'css': {'background-c…

In [None]:
# Default node appearance: green nodes labelled with their "name" data field.
node_style = {
    'selector': 'node',
    'css': {
        'content': 'data(name)',
        'text-valign': 'center',
        'color': 'white',
        'text-outline-width': 2,
        'text-outline-color': 'green',
        'background-color': 'green',
    },
}
# Appearance of selected elements: everything switches to black.
selected_style = {
    'selector': ':selected',
    'css': {
        'background-color': 'black',
        'line-color': 'black',
        'target-arrow-color': 'black',
        'source-arrow-color': 'black',
        'text-outline-color': 'black',
    },
}
cyto.set_style([node_style, selected_style])

In [None]:
# Render the widget again so the stylesheet applied via cyto.set_style is visible.
display(cyto)

CytoscapeWidget(cytoscape_layout={'name': 'cola'}, cytoscape_style=[{'selector': 'node', 'css': {'content': 'd…