In [1]:
import networkx as nx
import pandas as pd
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import csv
import os

In [2]:
# Location of the STRING edge list; each line is split on "," into the fields of one edge.
string_addr = "../data/9606.protein.links.v11.5.noscores.threshold400.txt"
with open(string_addr) as network_f:
    # Iterate the handle line by line; surrounding whitespace/newlines are stripped first.
    string_network_edges = [line.strip().split(',') for line in network_f]

# Build the undirected network from the parsed edge list
G = nx.Graph(string_network_edges)

In [3]:
def import_and_filter(filepath, G):
    """Load a protein list from a CSV file and keep only proteins present in ``G``.

    Parameters
    ----------
    filepath : str or path-like
        CSV file readable by ``pandas.read_csv``; it must contain a column
        named ``"x"`` holding the protein identifiers.
    G : graph
        Object whose ``nodes`` attribute supports ``in`` (e.g. a
        ``networkx.Graph``).

    Returns
    -------
    list
        The distinct values of column ``"x"`` that are nodes of ``G``, in the
        order of their first appearance in the file.

    Raises
    ------
    KeyError
        If the file has no ``"x"`` column.
    """
    data = pd.read_csv(filepath)
    # dict.fromkeys de-duplicates like a set but keeps first-seen order, so the
    # returned list is identical from one kernel session to the next (iteration
    # order of a set of strings depends on per-process hash randomisation).
    unique_proteins = dict.fromkeys(data["x"])
    return [protein for protein in unique_proteins if protein in G.nodes]

In [4]:
# Folder whose CSV files are loaded as gene sets further down.
directory_path = "../data/CDAGs"
# List the folder, then sort in place so the files are processed in a stable order.
sorted_files = os.listdir(directory_path)
sorted_files.sort()
print(sorted_files)

['B_cells_markers.csv', 'CAGs.csv', 'CD4_markers.csv', 'CD8_markers.csv', 'Clark_CNV_GAIN.csv', 'Clark_CNV_LOSS.csv', 'Clark_Phospho_DOWN.csv', 'Clark_Phospho_UP.csv', 'Clark_Prot_DOWN.csv', 'Clark_Prot_UP.csv', 'Clark_RNA_DOWN.csv', 'Clark_RNA_UP.csv', 'Common_essentials.csv', 'Endo_markers.csv', 'Epi_markers.csv', 'FPGs.csv', 'Fibro_markers.csv', 'Macro_markers.csv', 'Mast_markers.csv', 'Mono_markers.csv', 'NK_markers.csv', 'PT_PRAP_markers.csv', 'Plasma_markers.csv', 'RCC_TFs_DOWN.csv', 'RCC_TFs_UP.csv', 'RCC_vs_PRAP_DOWN.csv', 'RCC_vs_PRAP_UP.csv', 'Treg_markers.csv', 'Tumour_markers.csv', 'UPGs.csv', 'cDC1_markers.csv', 'cDC2_markers.csv', 'pDC_markers.csv']


In [5]:
# Map each gene-set name (the file name up to its first ".") to the proteins
# of that file that are present in the network.
genes = dict()
for filename in sorted_files:
    if not filename.endswith(".csv"):
        continue  # only CSV files are treated as gene sets
    set_name = filename.partition(".")[0]
    full_path = os.path.join(directory_path, filename)
    genes[set_name] = import_and_filter(full_path, G)

In [6]:
# The DTG list lives in its own folder; register it as one more gene set,
# restricted to proteins that are nodes of G.
DTGs_path = "../data/DTGs/DTGs_class1_202Prot.csv"
genes.update(DTGs=import_and_filter(DTGs_path, G))

In [7]:
# Proteins that belong to at least one named gene set.  "Others" is left out
# of the union on purpose: if this cell is re-run, genes already contains the
# previous "Others" entry, and including it would make every node look
# assigned and silently turn "Others" into an empty list.
all_proteins_from_sets = set().union(
    *(members for set_name, members in genes.items() if set_name != "Others")
)
# Network nodes that are in none of the named sets, in graph node order.
remaining_proteins_in_G = [node for node in G.nodes() if node not in all_proteins_from_sets]
print(len(remaining_proteins_in_G))
genes["Others"] = remaining_proteins_in_G

11108


In [8]:
def compute_ratios(gene, G, gene_sets):
    """Return, for each gene set, the fraction of ``gene``'s neighbours in that set.

    Parameters
    ----------
    gene : hashable
        Node of ``G`` whose neighbourhood is profiled.
    G : graph
        Object exposing ``neighbors(node)`` (e.g. a ``networkx.Graph``).
    gene_sets : dict
        Maps a set name to an iterable of hashable members (list, set,
        frozenset, ...).

    Returns
    -------
    dict
        ``{set_name: matching_neighbours / number_of_neighbours}`` with the
        keys in the same order as ``gene_sets``.  Every value is ``0`` when
        ``gene`` has no neighbours.
    """
    neighbours = list(G.neighbors(gene))
    total_neighbours = len(neighbours)
    if total_neighbours == 0:
        # Isolated node: report zeros instead of dividing by zero.
        return {set_name: 0 for set_name in gene_sets.keys()}

    # Hash the neighbourhood once and intersect it with each gene set, instead
    # of scanning every (possibly list-valued) gene set once per neighbour.
    # Assumes G.neighbors yields each neighbour once, as networkx graphs do.
    neighbour_set = set(neighbours)
    return {
        set_name: len(neighbour_set.intersection(genes_in_set)) / total_neighbours
        for set_name, genes_in_set in gene_sets.items()
    }



# Hash-based copies of the gene sets: membership tests inside compute_ratios
# then cost O(1) instead of a linear scan through a list.  Key order (and so
# the CSV column order) is the same as in `genes`.
gene_set_lookup = {set_name: frozenset(members) for set_name, members in genes.items()}

# Neighbourhood composition of every node, keyed in graph node order.
ratios_for_all_genes = {}

for gene in G.nodes():
    ratios_for_all_genes[gene] = compute_ratios(gene, G, gene_set_lookup)

# Write the results to file
output_path = 'outputs/network_neighbourhood.csv'
# Create the output folder on a fresh checkout; open() would otherwise fail.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', newline='') as csvfile:
    fieldnames = ['Gene'] + list(genes.keys())
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # compute_ratios returns a value for every gene set, so each row is just
    # the gene identifier plus its ratios; no zero pre-fill is needed.
    for gene, ratios in ratios_for_all_genes.items():
        writer.writerow({'Gene': gene, **ratios})