In [3]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib
import functools
import itertools
import community as community_louvain
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from functools import reduce
from networkx.algorithms import community
# Print the versions of Python, pandas and pandas' dependencies so that the
# environment this notebook was run in is recorded next to the results.
pd.show_versions()


INSTALLED VERSIONS
------------------
commit           : b5958ee1999e9aead1938c0bba2b674378807b3d
python           : 3.8.3.final.0
python-bits      : 64
OS               : Windows
OS-release       : 10
Version          : 10.0.19041
machine          : AMD64
processor        : Intel64 Family 6 Model 78 Stepping 3, GenuineIntel
byteorder        : little
LC_ALL           : None
LANG             : None
LOCALE           : Dutch_Belgium.1252

pandas           : 1.1.5
numpy            : 1.18.5
pytz             : 2020.1
dateutil         : 2.8.1
pip              : 20.1.1
setuptools       : 49.2.0.post20200714
Cython           : 0.29.14
pytest           : 5.4.3
hypothesis       : None
sphinx           : 3.1.2
blosc            : None
feather          : None
xlsxwriter       : 1.2.9
lxml.etree       : 4.5.2
html5lib         : 1.1
pymysql          : None
psycopg2         : None
jinja2           : 2.11.2
IPython          : 7.16.1
pandas_datareader: None
bs4              : 4.9.1
bottleneck       : 1.

# Protein Association Network

## Download the data

In [2]:
# Protein-pair association table, downloaded from the Tabloid Proteome public
# data folder (tab-separated text).
ass = pd.read_table(
    "http://genesis.ugent.be/uvpublicdata/Tabloid_Proteome/TPSignificantPairInfoHuman.txt",
    sep='\t',
)

# Per-protein metadata table. Despite the .csv extension the file is read as
# gzip-compressed, tab-separated text, hence the explicit arguments.
# NOTE(review): the path is a raw string, so the doubled backslashes are part of
# the path itself - presumably tolerated by Windows; confirm before reusing elsewhere.
uniprot_ids = pd.read_csv(
    r'D:\\Jana De Coster\\Documents\\Ugent\\2de master\\Master thesis\\Network\\Data interaction network\\Metadata\\Filtered metadata\\Metadata.csv',
    sep='\t',
    compression='gzip',
    encoding='utf-8',
    low_memory=False,
)

In [3]:
# Keep a local, tab-separated copy of the association table (row index not written).
ppa_original_path = 'D:\\Jana De Coster\\Documents\\Ugent\\2de master\\Master thesis\\Network\\Data interaction network\\PPA data 22_02_2021\\PPA_original.csv'
ass.to_csv(ppa_original_path, sep='\t', index=False, encoding='utf-8')

In [4]:
# (number of rows, number of columns) of the association table
ass.shape

(1904, 17)

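The association data has been collected based upon the co-occurrence of protein pairs across projects and assays; this is summarised per pair in the `JaccardCoefficient`, `CommonProjectsCount` and `CommonAssayCount` columns.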

## Update the metadata to match the PPI metadata

In [5]:
# Keep only the two protein identifiers and the association statistics.
ass = ass[['UniprotAccessionA',
           'UniprotAccessionB',
           'JaccardCoefficient',
           'CommonProjectsCount',
           'CommonProjectNames',
           'CommonAssayCount']]

# Attach the metadata of protein A. A left join keeps every pair, also when the
# accession has no entry in the metadata table (its metadata is then missing).
pa = pd.merge(ass, uniprot_ids,
              left_on='UniprotAccessionA', right_on='UniprotAccession',
              how='left')

# Attach the metadata of protein B. Every metadata column now occurs twice, so the
# two copies are distinguished with explicit suffixes: '_A' for the columns already
# present (protein A) and '_B' for the newly joined ones (protein B).
# Passing the suffixes to merge replaces the former rename of pandas' default
# '_x'/'_y' through a substring replace, which would also have rewritten any
# column name that merely contains '_x' or '_y'.
pa = pd.merge(pa, uniprot_ids,
              left_on='UniprotAccessionB', right_on='UniprotAccession',
              how='left', suffixes=('_A', '_B'))

# Cast every column to text; missing values become the literal string 'nan'.
pa = pa.astype(str)

In [6]:
# Column layout of the final table: identifiers of protein A, identifiers of
# protein B, the pair statistics, then all annotation columns of protein A
# followed by the same annotation columns of protein B.
identifiers_a = ['UniprotAccession_A', 'Protein_name_A', 'Gene_names_A']
identifiers_b = ['UniprotAccession_B', 'Protein_name_B', 'Gene_names_B']
pair_statistics = ['JaccardCoefficient', 'CommonProjectsCount', 'CommonProjectNames', 'CommonAssayCount']
annotations_a = [
    'GO_biological_process_A', 'GO_cellular_component_A', 'GO_molecular_function_A',
    'Reactome_ID_A', 'Reactome_name_A',
    'HPA_gene_A', 'HPA_gene_description_A', 'HPA_Subcellular_location_A',
    'CORUM_complexes_A', 'CORUM_subunit_IDs_A',
    'DisGeNet_disease_name_A', 'DisGeNet_disease_ID_A', 'DisGeNet_class_A',
    'DisGeNet_gene_A', 'DisGeNet_score_A',
    'DisProt_name_A', 'DisProt_ID_A',
]
annotations_b = [
    'GO_biological_process_B', 'GO_cellular_component_B', 'GO_molecular_function_B',
    'Reactome_ID_B', 'Reactome_name_B',
    'HPA_gene_B', 'HPA_gene_description_B', 'HPA_Subcellular_location_B',
    'CORUM_complexes_B', 'CORUM_subunit_IDs_B',
    'DisGeNet_disease_name_B', 'DisGeNet_disease_ID_B', 'DisGeNet_class_B',
    'DisGeNet_gene_B', 'DisGeNet_score_B',
    'DisProt_name_B', 'DisProt_ID_B',
]

# Select the columns in that order (all other columns are dropped) and keep
# everything as text.
ordered_columns = identifiers_a + identifiers_b + pair_statistics + annotations_a + annotations_b
pa = pa[ordered_columns].astype(str)

In [7]:
# Every accession that occurs on either side of a pair becomes a node.
nodes = list(set(pa.UniprotAccession_A) | set(pa.UniprotAccession_B))

# One (protein A, protein B) tuple per row of the table. The two columns are
# zipped positionally, so this does not depend on the row labels being 0..n-1
# and avoids a label lookup per row.
edges = list(zip(pa.UniprotAccession_A, pa.UniprotAccession_B))

# Undirected graph: a pair that occurs more than once (in either orientation)
# is stored as a single edge.
G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)

print(G.number_of_nodes())
print(G.number_of_edges())

729
1904


In [8]:
# Write the annotated association table twice, once with a .csv and once with a
# .txt extension. Both files have the same content: gzip-compressed,
# tab-separated text without the row index. (Only `pa` is written here.)
pa_output_paths = [
    'D:\\Jana De Coster\\Documents\\Ugent\\2de master\\Master thesis\\Network\\Data interaction network\\PPI data 20_08_2020\\PPI with Uniprot IDs\\PA_nodes.csv',
    'D:\\Jana De Coster\\Documents\\Ugent\\2de master\\Master thesis\\Network\\Data interaction network\\PPI data 20_08_2020\\PPI with Uniprot IDs\\PA_nodes.txt',
]
for output_path in pa_output_paths:
    pa.to_csv(output_path, sep='\t', index=False, encoding='utf-8', compression='gzip')

# Second protein association network (latest version)

In [14]:
# Latest version of the protein-pair association table (local tab-separated file).
ass2 = pd.read_csv(
    r'D:\\Jana De Coster\\Documents\\Ugent\\2de master\\Master thesis\\Network\\Data interaction network\\PPA data 22_02_2021\\FinalPair_ParaCompxBGGOKrIg0.4Jaccard20Jan21.txt',
    sep='\t',
)

# Per-protein metadata table; read as gzip-compressed, tab-separated text
# despite the .csv extension.
uniprot_ids = pd.read_csv(
    r'D:\\Jana De Coster\\Documents\\Ugent\\2de master\\Master thesis\\Network\\Data interaction network\\Metadata\\Filtered metadata\\Metadata.csv',
    sep='\t',
    compression='gzip',
    encoding='utf-8',
    low_memory=False,
)

In [15]:
# Keep only the two protein identifiers, the assay columns and the association statistics.
ass2 = ass2[['protein1',
             'protein2',
             'Jaccard',
             'assay1',
             'assay2',
             'CommAssay']]

# Attach the metadata of the first protein. A left join keeps every pair, also
# when the accession has no entry in the metadata table.
pa2 = pd.merge(ass2, uniprot_ids,
               left_on='protein1', right_on='UniprotAccession',
               how='left')

# Attach the metadata of the second protein. Every metadata column now occurs
# twice, so the copies are distinguished with explicit suffixes: '_A' for the
# columns already present (first protein) and '_B' for the newly joined ones.
# Passing the suffixes to merge replaces the former rename of pandas' default
# '_x'/'_y' through a substring replace, which would also have rewritten any
# column name that merely contains '_x' or '_y'.
pa2 = pd.merge(pa2, uniprot_ids,
               left_on='protein2', right_on='UniprotAccession',
               how='left', suffixes=('_A', '_B'))

# Cast every column to text; missing values become the literal string 'nan'.
pa2 = pa2.astype(str)

# Use the same column names as the first association network.
pa2 = pa2.rename(columns={'Jaccard': 'JaccardCoefficient',
                          'assay1': 'Assay_A',
                          'assay2': 'Assay_B',
                          'CommAssay': 'CommonAssayCount'})

In [16]:
# Column layout of the final table: identifiers of protein A, identifiers of
# protein B, the pair statistics, then all annotation columns of protein A
# followed by the same annotation columns of protein B.
identifiers_a = ['UniprotAccession_A', 'Protein_name_A', 'Gene_names_A']
identifiers_b = ['UniprotAccession_B', 'Protein_name_B', 'Gene_names_B']
pair_statistics = ['JaccardCoefficient', 'CommonAssayCount']
annotations_a = [
    'GO_biological_process_A', 'GO_cellular_component_A', 'GO_molecular_function_A',
    'Reactome_ID_A', 'Reactome_name_A',
    'HPA_gene_A', 'HPA_gene_description_A', 'HPA_Subcellular_location_A',
    'CORUM_complexes_A', 'CORUM_subunit_IDs_A',
    'DisGeNet_disease_name_A', 'DisGeNet_disease_ID_A', 'DisGeNet_class_A',
    'DisGeNet_gene_A', 'DisGeNet_score_A',
    'DisProt_name_A', 'DisProt_ID_A',
]
annotations_b = [
    'GO_biological_process_B', 'GO_cellular_component_B', 'GO_molecular_function_B',
    'Reactome_ID_B', 'Reactome_name_B',
    'HPA_gene_B', 'HPA_gene_description_B', 'HPA_Subcellular_location_B',
    'CORUM_complexes_B', 'CORUM_subunit_IDs_B',
    'DisGeNet_disease_name_B', 'DisGeNet_disease_ID_B', 'DisGeNet_class_B',
    'DisGeNet_gene_B', 'DisGeNet_score_B',
    'DisProt_name_B', 'DisProt_ID_B',
]

# Select the columns in that order; all other columns are dropped.
ordered_columns = identifiers_a + identifiers_b + pair_statistics + annotations_a + annotations_b
pa2 = pa2[ordered_columns]

In [17]:
# Remove the rows (protein pairs) in which at least one of the two UniProt
# accessions is missing. After the cast to text a missing value is the literal
# string 'nan', which is what the filter tests for.
pa2 = pa2.astype(str)

# A single boolean mask replaces dropping the rows one at a time inside a loop:
# every individual drop copied the whole table, and the loop looked rows up by
# label, so it was only correct while the index was exactly 0..n-1.
has_both_accessions = (pa2.UniprotAccession_A != 'nan') & (pa2.UniprotAccession_B != 'nan')
pa2 = pa2[has_both_accessions].reset_index(drop=True)

In [18]:
# Every accession that occurs on either side of a pair becomes a node.
nodes = list(set(pa2.UniprotAccession_A) | set(pa2.UniprotAccession_B))

# One (protein A, protein B) tuple per row of the table. The two columns are
# zipped positionally, so this does not depend on the row labels being 0..n-1
# and avoids a label lookup per row.
edges = list(zip(pa2.UniprotAccession_A, pa2.UniprotAccession_B))

# Undirected graph: a pair that occurs more than once (in either orientation)
# is stored as a single edge.
G2 = nx.Graph()
G2.add_nodes_from(nodes)
G2.add_edges_from(edges)

print(G2.number_of_nodes())
print(G2.number_of_edges())

3650
152800


In [13]:
# Write the annotated association table twice, once with a .csv and once with a
# .txt extension. Both files have the same content: gzip-compressed,
# tab-separated text without the row index. (Only `pa2` is written here.)
pa2_output_paths = [
    'D:\\Jana De Coster\\Documents\\Ugent\\2de master\\Master thesis\\Network\\Data interaction network\\PPI data 20_08_2020\\PPI with Uniprot IDs\\PA_latest_nodes.csv',
    'D:\\Jana De Coster\\Documents\\Ugent\\2de master\\Master thesis\\Network\\Data interaction network\\PPI data 20_08_2020\\PPI with Uniprot IDs\\PA_latest_nodes.txt',
]
for output_path in pa2_output_paths:
    pa2.to_csv(output_path, sep='\t', index=False, encoding='utf-8', compression='gzip')