In [1]:
import gzip
import pandas as pd

# Placeholders declared by this cell; nothing below fills them.
l1000_gene_ids = []
l1000_string_ids = []

# STRING identifiers: second column of the L1000 -> STRING mapping table.
l1000_to_string_table = pd.read_csv('gene_mapping/L1000_to_STRING.tab', sep='\t')
string_ids = l1000_to_string_table.iloc[:, 1].tolist()

# (protein_a, protein_b) -> last column of the STRING links file as a float
# (presumably the combined score -- TODO confirm against the file header).
string_interactions = {}

# download: https://stringdb-static.org/download/protein.links.detailed.v12.0/9606.protein.links.detailed.v12.0.txt.gz
with gzip.open('9606.protein.links.detailed.v12.0.txt.gz', 'rt') as f:
    # Consume the header row so the loop below only sees data rows.
    f.readline()

    # Set membership keeps the per-row filter cheap.
    _string_ids = set(string_ids)

    for row in f:
        fields = row.strip().split(' ')

        # Keep a link only when both endpoints are mapped L1000 genes.
        # The second column is inspected only after the first one matched.
        protein_a = fields[0]
        if protein_a not in _string_ids:
            continue
        protein_b = fields[1]
        if protein_b not in _string_ids:
            continue

        pair = (protein_a, protein_b)
        if pair in string_interactions:
            raise ValueError(f'Duplicate interaction: {protein_a} - {protein_b}')
        string_interactions[pair] = float(fields[-1])



In [2]:
import networkx as nx
from collections import defaultdict

# (interactor_1, interactor_2) -> number of BioGRID rows listing that ordered pair.
biogrid_interactions = defaultdict(int)
# download: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.230/BIOGRID-ORGANISM-4.4.230.tab3.zip
with open('BIOGRID-ORGANISM-Homo_sapiens-4.4.230.tab3.txt', 'r') as f:
    # Skip the header row.
    f.readline()

    # Column positions of the two interactor identifiers used as keys
    # (presumably the official gene symbols -- TODO confirm against the tab3 spec).
    g1, g2 = 7, 8
    # score = 18
    for record in f:
        fields = record.split('\t')

        # The two trailing columns hold the organism of each interactor;
        # only rows where both are human are counted.
        trailing_organisms = (fields[-1].strip(), fields[-2].strip())
        if trailing_organisms != ('Homo sapiens', 'Homo sapiens'):
            continue

        pair = (fields[g1].strip(), fields[g2].strip())
        biogrid_interactions[pair] += 1



# Undirected graph with one edge per distinct interactor pair.
biogrid_graph = nx.Graph()
biogrid_graph.add_edges_from(biogrid_interactions)

In [None]:
# First column (used as the index) -> next column of the L1000/STRING mapping table.
l1000_string_map = pd.read_csv('gene_mapping/L1000_to_STRING.tab', sep='\t', index_col=0)
gene_id_to_string_id = l1000_string_map.iloc[:, 0].to_dict()

# Gene symbol -> L1000 gene id, taken from the L1000 annotation table.
l1000_annotations = pd.read_csv('gene_mapping/L1000.tab', sep='\t')
gene_name_to_id = l1000_annotations.set_index('pr_gene_symbol')['pr_gene_id'].to_dict()

def get_string_evidence_code(tcga_project: str, interaction_type: str):
    """Add a binary ``string_evidence`` column to a top-hits table.

    Reads ``results/top_hits/{tcga_project}_{interaction_type}.csv``, marks
    each ``feature1``/``feature2`` pair with 1 when ``string_interactions``
    holds an entry for it (in either orientation) and 0 otherwise, then
    writes the table back to the same path.

    Args:
        tcga_project: Project prefix of the CSV file name, e.g. ``'TCGA-BRCA'``.
        interaction_type: Suffix of the CSV file name, e.g. ``'additive'``.

    Returns:
        None. The CSV file is overwritten in place.
    """
    path = f'results/top_hits/{tcga_project}_{interaction_type}.csv'
    df = pd.read_csv(path, sep=',')

    def evidence(g1, g2):
        """Return 1 if the two gene symbols are linked in STRING, else 0."""
        try:
            # Translate into separate names so the message below always
            # reports the symbols from the table, even when only the second
            # lookup fails.
            string_id1 = gene_id_to_string_id[gene_name_to_id[g1]]
            string_id2 = gene_id_to_string_id[gene_name_to_id[g2]]
        except KeyError:
            print(f'Gene not found: {g1} or {g2}')
            return 0

        # The pair is treated as unordered, matching the BioGRID lookup,
        # so a link stored in only one orientation is still found.
        score = (string_interactions.get((string_id1, string_id2), 0)
                 or string_interactions.get((string_id2, string_id1), 0))

        return 1 if score else 0

    # Iterate the two columns directly instead of DataFrame.apply(axis=1):
    # apply special-cases empty frames (it probes the callback with a dummy
    # row and may hand back a DataFrame instead of a Series), which makes a
    # table without rows fail or print a bogus "Gene not found" message.
    df['string_evidence'] = [
        evidence(gene1, gene2)
        for gene1, gene2 in zip(df['feature1'], df['feature2'])
    ]

    df.to_csv(path, sep=',', index=False)


def get_biogrid_evidence(tcga_project: str, interaction_type: str):
    """Add a binary ``biogrid_evidence`` column to a top-hits table.

    Reads ``results/top_hits/{tcga_project}_{interaction_type}.csv``, marks
    each ``feature1``/``feature2`` pair with 1 when ``biogrid_interactions``
    counts at least one row for it in either orientation and 0 otherwise,
    then writes the table to
    ``../analysis/results/top_hits/{tcga_project}_{interaction_type}.csv``.

    Args:
        tcga_project: Project prefix of the CSV file name, e.g. ``'TCGA-BRCA'``.
        interaction_type: Suffix of the CSV file name, e.g. ``'additive'``.

    Returns:
        None. The annotated table is written to disk.
    """
    df = pd.read_csv(f'results/top_hits/{tcga_project}_{interaction_type}.csv', sep=',')

    def evidence(g1, g2):
        """Return 1 if BioGRID lists the pair in either orientation, else 0."""
        hits = biogrid_interactions.get((g1, g2), 0) + biogrid_interactions.get((g2, g1), 0)
        return 1 if hits else 0

    # Iterate the two columns directly instead of DataFrame.apply(axis=1):
    # apply special-cases empty frames and may return a DataFrame instead of
    # a Series, which breaks the column assignment for a table without rows.
    df['biogrid_evidence'] = [
        evidence(gene1, gene2)
        for gene1, gene2 in zip(df['feature1'], df['feature2'])
    ]

    # NOTE(review): input comes from results/top_hits/ while output goes to
    # ../analysis/results/top_hits/. These are the same directory only when
    # the notebook runs from analysis/ -- confirm before unifying the paths.
    df.to_csv(f'../analysis/results/top_hits/{tcga_project}_{interaction_type}.csv', sep=',', index=False)


# TCGA projects whose top-hit tables get annotated.
tcga_projects = [
    'TCGA-BLCA', 'TCGA-BRCA', 'TCGA-COAD',
    'TCGA-GBM', 'TCGA-HNSC', 'TCGA-KIRC',
    'TCGA-LGG', 'TCGA-LIHC', 'TCGA-LUAD',
    'TCGA-LUSC', 'TCGA-OV', 'TCGA-SKCM',
    'TCGA-STAD',
]

# File-name suffixes of the per-project top-hit tables.
interaction_types = ('additive', 'competing', 'xor')

for tcga in tcga_projects:
    # All STRING annotations for a project run before its BioGRID
    # annotations, in the same order as the original explicit calls.
    for interaction_type in interaction_types:
        get_string_evidence_code(tcga, interaction_type)

    for interaction_type in interaction_types:
        get_biogrid_evidence(tcga, interaction_type)



In [3]:
for tcga in tcga_projects:
    # Load the three annotated tables of this project up front.
    tables = {
        kind: pd.read_csv(f'../analysis/results/top_hits/{tcga}_{kind}.csv', sep=',')
        for kind in ('additive', 'competing', 'xor')
    }

    # A pair counts once when at least one of the two evidence flags is non-zero.
    counts = {
        kind: (table["string_evidence"] + table["biogrid_evidence"]).astype(bool).sum()
        for kind, table in tables.items()
    }

    print(f'{tcga} - additive: {counts["additive"]}, competing: {counts["competing"]}, xor: {counts["xor"]}')
    

TCGA-BLCA - additive: 2, competing: 11, xor: 1
TCGA-BRCA - additive: 2, competing: 8, xor: 4
TCGA-COAD - additive: 0, competing: 3, xor: 0
TCGA-GBM - additive: 0, competing: 4, xor: 1
TCGA-HNSC - additive: 4, competing: 28, xor: 6
TCGA-KIRC - additive: 11, competing: 271, xor: 26
TCGA-LGG - additive: 22, competing: 200, xor: 1
TCGA-LIHC - additive: 6, competing: 8, xor: 1
TCGA-LUAD - additive: 7, competing: 7, xor: 2
TCGA-LUSC - additive: 0, competing: 1, xor: 0
TCGA-OV - additive: 1, competing: 3, xor: 1
TCGA-SKCM - additive: 3, competing: 11, xor: 1
TCGA-STAD - additive: 1, competing: 2, xor: 0


# Check literature evidence for random pairs of genes

In [4]:
import random
import pandas as pd

# Background estimate: how many randomly drawn L1000 gene pairs have STRING
# or BioGRID evidence. Each replicate draws as many pairs as there are genes.

l1000_genes = []
string_ids = []

with open('gene_mapping/L1000_to_STRING.tab', 'r') as file:
    for line in file:
        gene, string_id = line.rstrip().split('\t')
        try:
            int(gene)
        except ValueError:
            # Other cells read this file with pandas and treat its first line
            # as a header. A non-numeric id would make int(gene) fail inside
            # the sampling loop, so such rows are dropped here.
            continue
        l1000_genes.append(gene)
        string_ids.append(string_id)

gene_to_stringId = dict(zip(l1000_genes, string_ids))


df_l1000 = pd.read_csv('gene_mapping/L1000.tab', sep='\t')
gene_id_to_symbol = df_l1000.set_index("pr_gene_id")["pr_gene_symbol"].to_dict()

# Fixed seed on a private generator: the distribution is reproducible on
# every run and the global `random` state is left untouched.
RANDOM_SEED = 0
N_REPLICATES = 10_000
rng = random.Random(RANDOM_SEED)

random_interactions_dist = []           # pairs with evidence, per replicate
expected_random_interactions_dist = []  # same value as a percentage of drawn pairs

n_pairs = len(l1000_genes)
random_interactions = []

for _ in range(N_REPLICATES):
    random_interactions = []

    for _ in range(n_pairs):
        gene1, gene2 = rng.sample(l1000_genes, 2)

        stringId1 = gene_to_stringId[gene1]
        stringId2 = gene_to_stringId[gene2]
        gene1Symbol = gene_id_to_symbol[int(gene1)]
        gene2Symbol = gene_id_to_symbol[int(gene2)]

        string_interaction_evidence = string_interactions.get((stringId1, stringId2), None)
        biogrid_interaction_evidence = biogrid_interactions.get((gene1Symbol, gene2Symbol), 0) + biogrid_interactions.get((gene2Symbol, gene1Symbol), 0)

        random_interactions.append((f'{gene1Symbol}-{gene2Symbol}',string_interaction_evidence,biogrid_interaction_evidence))

    # Plain sets instead of a per-replicate DataFrame: same membership rules
    # (STRING entry present, BioGRID count above zero) without building and
    # filtering a frame 10,000 times.
    df_string_interactions = {
        label for label, string_score, _biogrid_count in random_interactions
        if string_score is not None
    }
    df_biogrid_interactions = {
        label for label, _string_score, biogrid_count in random_interactions
        if biogrid_count > 0
    }

    total = df_string_interactions.union(df_biogrid_interactions)
    random_interactions_dist.append(len(total))
    expected_random_interactions_dist.append((len(total) / n_pairs) * 100)

# Tabular view of the final replicate only, kept under its original name
# for inspection.
df = pd.DataFrame(random_interactions, columns=['interaction', 'string_interaction_evidence', 'biogrid_interaction_evidence'])