In [1]:
import networkx as nx
import pandas as pd
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import csv
import os

In [2]:
# Path to the STRING interaction list; each line is split on "," into one edge.
string_addr = "../data/9606.protein.links.v11.5.noscores.threshold400.txt"

with open(string_addr) as network_f:
    # Iterate the file line by line, stripping surrounding whitespace
    # (including the newline) before splitting into the edge's endpoints.
    string_network_edges = [
        raw_line.strip().split(',')
        for raw_line in network_f
    ]

# Build the undirected protein-protein interaction graph from the edge list.
G = nx.Graph(string_network_edges)

In [3]:
def import_and_filter(filepath, G, column="x"):
    """Return the proteins listed in a CSV file that are also nodes of ``G``.

    Parameters
    ----------
    filepath : str or path-like
        CSV file readable by ``pandas.read_csv``.
    G : graph
        Object whose ``nodes`` attribute supports membership tests
        (e.g. a ``networkx.Graph``).
    column : str, default "x"
        Name of the CSV column that holds the protein identifiers.

    Returns
    -------
    list
        The unique values of ``column`` that are in ``G.nodes``, in order of
        first appearance in the file.

    Raises
    ------
    KeyError
        If ``column`` is not a column of the file.
    """
    data = pd.read_csv(filepath)
    nodes = G.nodes
    # dict.fromkeys de-duplicates while keeping file order; iterating a set of
    # strings instead would give an order that can differ between kernel
    # sessions, making the returned list non-reproducible.
    unique_proteins = dict.fromkeys(data[column])
    return [protein for protein in unique_proteins if protein in nodes]

In [4]:
# Folder with the signature CSV files; list its entries and sort the names
# (plain string ordering, so upper-case names come before lower-case ones).
directory_path = "../data/CDAGs"
sorted_files = os.listdir(directory_path)
sorted_files.sort()
print(sorted_files)

['B_cells_markers.csv', 'CAGs.csv', 'CD4_markers.csv', 'CD8_markers.csv', 'Clark_CNV_GAIN.csv', 'Clark_CNV_LOSS.csv', 'Clark_Phospho_DOWN.csv', 'Clark_Phospho_UP.csv', 'Clark_Prot_DOWN.csv', 'Clark_Prot_UP.csv', 'Clark_RNA_DOWN.csv', 'Clark_RNA_UP.csv', 'Common_essentials.csv', 'Endo_markers.csv', 'Epi_markers.csv', 'FPGs.csv', 'Fibro_markers.csv', 'Macro_markers.csv', 'Mast_markers.csv', 'Mono_markers.csv', 'NK_markers.csv', 'PT_PRAP_markers.csv', 'Plasma_markers.csv', 'RCC_TFs_DOWN.csv', 'RCC_TFs_UP.csv', 'RCC_vs_PRAP_DOWN.csv', 'RCC_vs_PRAP_UP.csv', 'Treg_markers.csv', 'Tumour_markers.csv', 'UPGs.csv', 'cDC1_markers.csv', 'cDC2_markers.csv', 'pDC_markers.csv']


In [5]:
# Map each signature name (the file name up to its first ".") to the proteins
# from that file that are present in the network G.
sets = {}
for filename in sorted_files:
    # Skip directory entries that are not CSV files.
    if not filename.endswith(".csv"):
        continue
    set_name = filename.partition(".")[0]
    full_path = os.path.join(directory_path, filename)
    sets[set_name] = import_and_filter(full_path, G)

In [6]:
# The DTG list is stored outside the CDAGs folder, so it is loaded separately
# and registered under its own key.
DTGs_path = "../data/DTGs/DTGs_class1_202Prot.csv"
dtg_proteins = import_and_filter(DTGs_path, G)
sets["DTGs"] = dtg_proteins

In [7]:
# Proteins in G that belong to none of the named sets are collected under "Others".
# "Others" itself is left out of the union: otherwise re-running this cell would
# find every protein already assigned and overwrite "Others" with an empty list.
all_proteins_from_sets = set().union(
    *(proteins for set_name, proteins in sets.items() if set_name != "Others")
)
remaining_proteins_in_G = [node for node in G.nodes() if node not in all_proteins_from_sets]
print(len(remaining_proteins_in_G))
sets["Others"] = remaining_proteins_in_G

11108


In [8]:
# Calculate the mean and minimum shortest-path distance from every gene in G
# to the proteins of each signature, and write one CSV row per gene.
output_path = 'outputs/network_distances.csv'
# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

fieldnames = ['Gene'] + [f"Mean_{set_name}" for set_name in sets] + [f"Min_{set_name}" for set_name in sets]
with open(output_path, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for gene in G.nodes():
        # A single breadth-first search from `gene` gives the distance to every
        # node reachable from it. This replaces a has_path + shortest_path_length
        # call (two graph searches) for every (gene, protein) pair.
        reachable = nx.single_source_shortest_path_length(G, gene)

        row = {'Gene': gene}
        for set_name, proteins in sets.items():
            # Proteins in a different connected component are absent from
            # `reachable` and are ignored. The gene itself is included with
            # distance 0 when it is a member of the set.
            distances = [reachable[p] for p in proteins if p in reachable]
            if distances:
                row[f"Mean_{set_name}"] = np.mean(distances)
                row[f"Min_{set_name}"] = np.min(distances)
            else:
                # No protein of this set is reachable from `gene`.
                row[f"Mean_{set_name}"] = np.inf
                row[f"Min_{set_name}"] = np.inf
        writer.writerow(row)