In [None]:
import json
import time
import gzip
import requests
import pandas as pd


# BioMart export, addressed by column position. Judging by the mapping names
# below, column 2 is the gene ID, column 3 the gene symbol and column 5 the
# UniProt accession -- TODO confirm against the export header.
df = pd.read_csv('gene_mapping/biomart_export.tab', sep='\t')
gene_symbol_to_gene_id = {
    symbol: gene_id for gene_id, symbol in zip(df.iloc[:, 2], df.iloc[:, 3])
}
gene_id_to_gene_symbol = {
    gene_id: symbol for gene_id, symbol in zip(df.iloc[:, 2], df.iloc[:, 3])
}
gene_id_to_uniprot_id = {
    gene_id: accession for gene_id, accession in zip(df.iloc[:, 2], df.iloc[:, 5])
}


# L1000 table: values of column 1 are mapped to the values of column 0.
l1000 = pd.read_csv('gene_mapping/L1000.tab', sep='\t')
l1000_to_gene_id = {
    name: gene_id for gene_id, name in zip(l1000.iloc[:, 0], l1000.iloc[:, 1])
}

def symbol_to_gene_id(gene: str):
    """Look up the gene ID for ``gene``.

    ``l1000_to_gene_id`` is consulted first; ``gene_symbol_to_gene_id`` is
    only used when the first table has no (non-``None``) entry.

    Returns:
        The mapped gene ID, or ``None`` when neither table knows ``gene``.
    """
    for lookup in (l1000_to_gene_id, gene_symbol_to_gene_id):
        gene_id = lookup.get(gene)
        if gene_id is not None:
            return gene_id
    return None


# Individual entries can also be fetched one at a time from the UniProt REST API:
# response = requests.get(f'https://rest.uniprot.org/uniprotkb/{uniprotkb_assension}.json')
# protein_data = response.json()

# For faster access, all entries are read from a local gzipped JSON dump
# instead and indexed by their primary accession.
with gzip.open('uniprotkb_reviewed_true_AND_organism_id_2024_02_15.json.gz', 'rt') as f:
    uniprot_data = {
        entry['primaryAccession']: entry
        for entry in json.load(f)['results']
    }

def get_uniprot_data(uniprotkb_assension):
    """Summarise one UniProt entry from the in-memory ``uniprot_data`` dump.

    Args:
        uniprotkb_assension: Primary accession used as key into ``uniprot_data``.

    Returns:
        dict with the keys 'Function', 'Tissue Specificity' and
        'Subunit Interactions' (strings, '' when absent) and
        'Binary Interactions', 'Biological Process', 'Molecular Function' and
        'Cellular Component' (lists, empty when absent).

    Raises:
        KeyError: if the accession is not in ``uniprot_data``, or a GO
            cross-reference carries an aspect letter other than P, F or C.
    """
    protein_data = uniprot_data[uniprotkb_assension]

    context_data = {
        'Function': '',
        'Tissue Specificity': '',
        'Subunit Interactions': '',
        'Binary Interactions': [],
        'Biological Process': [],
        'Molecular Function': [],
        'Cellular Component': []
    }

    # Free-text comment types and the output field each one fills.
    text_comment_fields = {
        'FUNCTION': 'Function',
        'SUBUNIT': 'Subunit Interactions',
        'TISSUE SPECIFICITY': 'Tissue Specificity',
    }

    for comment in protein_data.get('comments', []):
        comment_type = comment['commentType']
        if comment_type in text_comment_fields:
            # Only the first text of a comment is used; if an entry has several
            # comments of the same type, the last one wins.
            context_data[text_comment_fields[comment_type]] = comment['texts'][0]['value']
        elif comment_type == 'INTERACTION':
            # Interaction partners without a gene name are left out.
            context_data['Binary Interactions'] = [
                interaction['interactantTwo']['geneName']
                for interaction in comment['interactions']
                if 'geneName' in interaction['interactantTwo']
            ]

    go_aspect_mapper = {'P': 'Biological Process', 'F': 'Molecular Function', 'C': 'Cellular Component'}

    # Entries without cross-references are treated like entries without
    # comments: the GO lists simply stay empty instead of raising KeyError.
    for ref in protein_data.get('uniProtKBCrossReferences', []):
        if ref['database'] != 'GO':
            continue
        # The first property value has the form '<aspect letter>:<term>';
        # split once so that colons inside the term are preserved.
        aspect_short, term = ref['properties'][0]['value'].split(':', maxsplit=1)
        context_data[go_aspect_mapper[aspect_short]].append(term)

    return context_data

In [2]:
import networkx as nx
import gzip
import pandas as pd

from collections import defaultdict

# Number of rows reported for each (column 7, column 8) pair, restricted to
# rows whose last two columns both read 'Homo sapiens'.
biogrid_interactions = defaultdict(int)
# download: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.230/BIOGRID-ORGANISM-4.4.230.tab3.zip
with open('BIOGRID-ORGANISM-Homo_sapiens-4.4.230.tab3.txt', 'r') as f:
    next(f, None)  # discard the header row

    symbol_a_col, symbol_b_col = 7, 8
    for row in f:
        fields = row.split('\t')
        trailing_organisms = (fields[-1].strip(), fields[-2].strip())
        if trailing_organisms != ('Homo sapiens', 'Homo sapiens'):
            continue

        pair = (fields[symbol_a_col].strip(), fields[symbol_b_col].strip())
        biogrid_interactions[pair] += 1


# Undirected graph with one edge per distinct pair; the counts are not used.
biogrid_graph = nx.Graph()
biogrid_graph.add_edges_from(biogrid_interactions)


# Mapping table: column 1 holds the STRING ID, column 0 the gene ID it maps to.
df_string_ids = pd.read_csv('gene_mapping/gene_id_to_STRING_id.tab', sep='\t')
string_id_to_gene_id = {
    string_id: gene_id
    for gene_id, string_id in zip(df_string_ids.iloc[:, 0], df_string_ids.iloc[:, 1])
}

# (symbol A, symbol B) -> value of the last column of the links file.
string_interactions = {}
# download: https://stringdb-static.org/download/protein.links.detailed.v12.0/9606.protein.links.detailed.v12.0.txt.gz
with gzip.open('9606.protein.links.detailed.v12.0.txt.gz', 'rt') as f:
    next(f, None)  # discard the header row

    # Set of known STRING IDs for constant-time membership tests.
    _string_ids = set(df_string_ids.iloc[:, 1].tolist())

    for line in f:
        columns = line.strip().split(' ')
        gene_a_string, gene_b_string = columns[0], columns[1]

        # Keep a link only when both endpoints appear in the mapping table.
        if gene_a_string not in _string_ids or gene_b_string not in _string_ids:
            continue

        gene_a_id = string_id_to_gene_id[gene_a_string]
        gene_b_id = string_id_to_gene_id[gene_b_string]

        symbol_pair = (gene_id_to_gene_symbol[gene_a_id], gene_id_to_gene_symbol[gene_b_id])
        string_interactions[symbol_pair] = float(columns[-1])

# Undirected graph over gene symbols; the stored values are not used as weights.
string_graph = nx.Graph()
string_graph.add_edges_from(string_interactions)


In [None]:
import tiktoken
from openai import OpenAI

# Credentials must not be stored in the notebook. Called without `api_key`,
# the OpenAI client reads the key from the OPENAI_API_KEY environment variable.
client = OpenAI()

def make_request_to_llm(**kwargs):
    """Forward all keyword arguments to ``client.chat.completions.create`` and return its result."""
    completions_api = client.chat.completions
    return completions_api.create(**kwargs)


# System prompt template. It contains exactly six `%s` placeholders, filled
# with (gene_a, gene_b) three times over via the `%` operator. The text is
# runtime data sent to the model and is kept verbatim (including the
# 'hypotesis' output key); it is only laid out as adjacent literals so that no
# string literal spans a physical line break.
system_prompt = (
    "[TASK INSTRUCTIONS]\n"
    "You are a helpful domain expert with a background in biology. You know the biology of all genes known in the literature. You are able to either explain or reason about how two genes might be functionally connected, even if there is no evidence of direct physical interaction.\n\n"
    "You will be given:\n<Context in JSON>\n<Instruction: Hypothesize that %s and %s interact ...>\n\n"
    "Assume that %s and %s interact. Given their function in a cell and their neighbourhood of physical interactions provide the most likely scenarion of these two genes in a functional interaction. This can either be physical interaction, effects on molecular pathways or biological processes in the context of cancer.\n\n"
    "Given your hypotesis, provide a detailed justification that is experimentaly validated and factual. In you justification be very speficic in how those two genes might be associated, here is one such example: If up-regulation of gene A promotes process X by activating gene C and up-regulation of gene B inhibits process X by inhibiting gene C we would consider genes A and B to be in a interaction. Use this as an example, there might be numerous different scenarios.\n\n"
    " You will end up with a probability (a number between 0 and 100). First, assume that there is no interaction between %s and %s (likelihood 0). After considering your answer, how would you rate the likelihood of this interaction, a number between 0 and 100. "
    "\n\n[CONTEXT FORMAT]\n"
    "The JSON objects will structure the information as follows for each gene:\n"
    "{\n"
    "  \"GENE A\": {\n"
    "    \"Function\": \"General function(s) of a protein\",\n"
    "    \"Tissue Specificity\": “Information on the expression of a gene at the mRNA or protein level in cells or in tissues of multicellular organisms\",\n"
    "    \"Subunit Interactions\": “Information about the protein quaternary structure and interaction(s) with other proteins or protein complexes\",\n"
    "    \"Binary Interactions\": [“List of binary protein-protein interactions with other proteins],\n"
    "    \"Biological Process\": [\"List of biological processes the gene is involved in\"],\n"
    "    \"Molecular Function\": [\"List of the molecular functions the gene performs\"],\n"
    "    \"Cellular Component\": [\"List of cellular components the gene is associated with\"]\n"
    "  },\n"
    "  \"GENE B\": {….},\n"
    "   …\n"
    "  GENE C: {….}\n"
    "}\n\n"
    "[OUTPUT FORMAT]\n"
    "{\n"
    "'hypotesis': 'Write a consise hypotesis',\n"
    "'justification': 'Write a consise justification',\n"
    "'likelihood': 'Likelihood between 0 and 100'\n"
    "}"
)


# Selects the top-hit table to explain: results/top_hits/<project>_<type>.csv
tcga_project, cancer_type = 'TCGA-LGG', 'competing'
file_path = f'results/top_hits/{tcga_project}_{cancer_type}.csv'
df = pd.read_csv(file_path)  # NOTE: rebinds `df`, replacing the BioMart frame loaded earlier

# Collected answers, keyed '<gene_a>-<gene_b>'.
explanations = dict()


# The tokenizer depends only on the model, so it is built once, not per pair.
llm_model = "gpt-4-turbo-preview"
enc = tiktoken.encoding_for_model(llm_model)

max_pairs = 10       # number of leading (feature1, feature2) pairs sent to the LLM
max_neighbors = 20   # cap on the genes taken from each neighbour source


def _shared_neighbors(graph, gene_a, gene_b):
    """Return the common neighbours of two genes in ``graph``; [] if either is not a node."""
    # nx.common_neighbors raises when a node is missing; test membership up
    # front instead of swallowing every exception with a bare `except`.
    if gene_a not in graph or gene_b not in graph:
        return []
    return list(nx.common_neighbors(graph, gene_a, gene_b))


def _uniprot_context(genes):
    """Map each gene symbol to its UniProt summary, skipping symbols without a gene ID."""
    context = {}
    for gene in genes:
        gene_id = symbol_to_gene_id(gene)
        if gene_id is None:
            continue
        context[gene] = get_uniprot_data(gene_id_to_uniprot_id[gene_id])
    return context


# For each gene pair: gather UniProt context for the pair and for their
# neighbours (BioGRID, STRING, UniProt binary interactions), ask the LLM for a
# hypothesis, and store the parsed JSON answer in `explanations`.
for gene_a, gene_b in list(zip(df['feature1'], df['feature2']))[:max_pairs]:
    # Resolve the IDs *before* printing them; printing first would show stale
    # values left over from an earlier cell or from the previous iteration.
    gene_a_id = symbol_to_gene_id(gene_a)
    gene_b_id = symbol_to_gene_id(gene_b)
    print('Started:', f'{gene_a} ({gene_a_id})', f'{gene_b} ({gene_b_id})')

    if gene_a_id is None or gene_b_id is None:
        # Same policy as for neighbour genes: an unmapped symbol is skipped
        # rather than aborting the whole run with a KeyError.
        print(f'Skipped {gene_a}-{gene_b}: no gene ID for at least one symbol\n')
        continue

    gene_a_data = get_uniprot_data(gene_id_to_uniprot_id[gene_a_id])
    gene_b_data = get_uniprot_data(gene_id_to_uniprot_id[gene_b_id])

    biogrid_common_neighbors = _shared_neighbors(biogrid_graph, gene_a, gene_b)
    biogrid_context = _uniprot_context(biogrid_common_neighbors[:max_neighbors])

    string_common_neighbors = _shared_neighbors(string_graph, gene_a, gene_b)
    string_context = _uniprot_context(string_common_neighbors[:max_neighbors])

    # Ordered de-duplication (instead of a set) keeps the prompt content in a
    # stable order from run to run.
    uniprot_interactions = list(dict.fromkeys(
        gene_a_data['Binary Interactions'][:max_neighbors]
        + gene_b_data['Binary Interactions'][:max_neighbors]
    ))
    binary_interactions_context = _uniprot_context(uniprot_interactions)

    context = {gene_a: gene_a_data, gene_b: gene_b_data}
    context.update(biogrid_context)
    context.update(string_context)
    context.update(binary_interactions_context)
    prompt = f'{json.dumps(context)}\n\nExplain the interaction between gene {gene_a} and gene {gene_b}:'

    # Token count covers the user prompt only, not the system prompt.
    print(f'Tokens sent: {len(enc.encode(prompt))},', f'Biogid genes {len(biogrid_context)},',f'String genes {len(string_context)},', f'Binary interactions {len(binary_interactions_context)}, Total {len(context)}\n')

    response = make_request_to_llm(
        model=llm_model,
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "system",
                "content": system_prompt % (gene_a, gene_b, gene_a, gene_b, gene_a, gene_b)
            },
            {
                "role": "user",
                "content": prompt
            },
        ],
        temperature=1,
        max_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    raw_answer = response.choices[0].message.content
    try:
        explanations[f'{gene_a}-{gene_b}'] = json.loads(raw_answer)
    except (json.JSONDecodeError, TypeError) as err:
        # A reply cut off at max_tokens (or an empty reply) is not valid JSON;
        # report it and keep the answers collected so far.
        print(f'Skipped {gene_a}-{gene_b}: response was not valid JSON ({err})')
    time.sleep(1)


# Persist every collected explanation as indented JSON, one file per
# project / cancer-type combination.
output_path = f"explanations/{tcga_project}_{cancer_type}.json"
with open(output_path, "w") as out_file:
    json.dump(explanations, out_file, indent=4)
