In [None]:
import pandas as pd
import numpy as np
import json
# Rank the Rummagene x RummaGEO overlaps: keep the first row seen for each pmc_id,
# then order by ascending p-value, breaking ties by descending odds.
df = (
    pd.read_csv("./data/rummagenexrummageo.csv")
    .drop_duplicates(subset='pmc_id', keep='first')
    .sort_values(by=["p-value", "odds"], ascending=[True, False])
)
# Re-label the rows 1..N in ranked order so the index doubles as the rank.
df.index = np.arange(1, len(df)+1)
# Only the 1000 best-ranked rows are used by the rest of the notebook.
df = df.head(1000)
df

In [None]:
# Rummagene term -> [rummagene-desc, pmc_id]; if a term repeats, its last row wins.
df_rummagene_map = {
    term: [description, pmc_id]
    for term, description, pmc_id in zip(df["rummagene"], df["rummagene-desc"], df["pmc_id"])
}
# RummaGEO term -> [geo_gse, geo_gsei, species]; same last-row-wins rule.
df_rummageo_map = {
    term: [gse, gsei, species]
    for term, gse, gsei, species in zip(df["rummageo"], df["geo_gse"], df["geo_gsei"], df["species"])
}


In [None]:
def _read_json(path):
    """Parse the JSON file at ``path`` and return the decoded object."""
    with open(path, "r") as handle:
        return json.load(handle)


# Lookup tables consumed by pmc_again / gse_again further down.
gse_info = _read_json("../data/gse_results.json")
pmc_sum = _read_json("../data/title_abs.json")
gse_human_info = _read_json("../data/human-gse-processed-meta.json")
gse_mouse_info = _read_json("../data/mouse-gse-processed-meta.json")

In [None]:
import re

def extract_after_extension(termid):
    """Return the text that follows the leading ``<name>.<ext>-`` part of ``termid``.

    The prefix is the shortest leading text that ends in a dot plus word
    characters and is immediately followed by one or more hyphens; everything
    after those hyphens is returned.

    Args:
        termid: Term identifier string to split.

    Returns:
        The remainder after the hyphen run (possibly an empty string), or
        ``None`` when ``termid`` does not have that shape.
    """
    found = re.match(r'^(.+?\.\w+)-+(.*)$', termid)
    return found.group(2) if found else None


In [None]:
def pmc_again(pmcids):
    """Collect the paper details attached to each Rummagene term.

    Args:
        pmcids: Mapping of term -> sequence whose item 0 is the term's
            description and whose item 1 is the key of its paper in ``pmc_sum``.

    Returns:
        dict mapping every term to ``(title, abstract, table_name, description)``
        where ``title``/``abstract`` come from ``pmc_sum`` and ``table_name`` is
        ``extract_after_extension(term)``.
    """
    details = {}
    for term, entry in pmcids.items():
        paper = pmc_sum[entry[1]]
        details[term] = (
            paper["title"],
            paper["abstract"],
            extract_after_extension(term),
            entry[0],
        )
    return details


# Resolve every Rummagene term in df_rummagene_map to its
# (title, abstract, table name, description) tuple, then display the mapping.
fin_pmc = pmc_again(df_rummagene_map)
fin_pmc


In [None]:
def gse_again(gseids):
    """Collect study and condition details for each RummaGEO term.

    Args:
        gseids: Mapping of term -> sequence whose item 0 is the key into
            ``gse_info``, item 1 the key into the per-species metadata, and
            item 2 the species (``"human"`` selects ``gse_human_info``; any
            other value selects ``gse_mouse_info``).

    Returns:
        dict mapping every term to ``(title, summary, cond1, cond2, direction)``.
        ``cond1``/``cond2`` are the ``"titles"`` entries for the 2nd and 4th
        hyphen-separated pieces of the term, and ``direction`` is the first
        space-separated word of the 5th piece.
    """
    signatures = {}
    for term, entry in gseids.items():
        species_meta = gse_human_info if entry[2] == "human" else gse_mouse_info
        study = gse_info[entry[0]]
        study_title = study["title"]
        study_summary = study["summary"][0]
        sample_titles = species_meta[entry[1]]["titles"]
        pieces = term.split("-")
        first_condition = sample_titles[pieces[1]]
        second_condition = sample_titles[pieces[3]]
        direction = pieces[4].split(" ")[0]
        signatures[term] = (study_title, study_summary, first_condition, second_condition, direction)
    return signatures

# Resolve every RummaGEO term in df_rummageo_map to its
# (title, summary, cond1, cond2, direction) tuple, then display the mapping.
fin_gse = gse_again(df_rummageo_map)
fin_gse


In [None]:
import requests
import time

def get_enrichr_terms(genes, libraries=None, top_n=3, delay=0.5, timeout=30):
    """Submit a gene list to Enrichr and collect the top enriched terms per library.

    Args:
        genes: Gene symbols in one string, separated by ``;``. Single quotes are
            removed and empty entries (e.g. from a trailing ``;``) are ignored.
        libraries: Enrichr library names to query. Defaults to the four
            libraries this function has always queried.
        top_n: Number of leading result rows kept per library.
        delay: Seconds to pause after each library query (``0`` disables it).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        ``(enriched_terms, enrichr_stats)`` on success. ``enriched_terms`` maps
        library name -> list of term names; ``enrichr_stats`` maps term name ->
        a copy of the Enrichr result row with the library name appended as its
        last item. Returns ``None`` (after printing the reason) when there are
        no genes, a request fails, or a response is not shaped as expected.
    """
    ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/'
    if libraries is None:
        libraries = ['WikiPathway_2023_Human', 'GWAS_Catalog_2023',
                     'GO_Biological_Process_2023', 'MGI_Mammalian_Phenotype_Level_4_2021']

    gene_symbols = [symbol for symbol in (token.replace("'", '') for token in genes.split(";")) if symbol]
    if not gene_symbols:
        print("Error: no genes to submit to Enrichr")
        return None
    genes_string = '\n'.join(gene_symbols)

    try:
        response = requests.post(
            ENRICHR_URL + 'addList',
            files={'list': (None, genes_string), 'description': (None, '')},
            timeout=timeout,  # without a timeout a stalled connection blocks forever
        )
        response.raise_for_status()
        user_list_id = response.json()['userListId']
    except (requests.exceptions.RequestException, ValueError, KeyError) as error:
        # ValueError/KeyError: body was not JSON or lacked 'userListId'.
        print(f"Error: {error}")
        return None

    enriched_terms = {}
    enrichr_stats = {}

    for enrichr_library in libraries:
        try:
            response = requests.get(
                ENRICHR_URL + 'enrich',
                params={'userListId': user_list_id, 'backgroundType': enrichr_library},
                headers={'Accept': 'application/json'},
                timeout=timeout,
            )
            response.raise_for_status()
            rows = response.json()[enrichr_library][:top_n]
        except (requests.exceptions.RequestException, ValueError, KeyError) as error:
            print(f"Error: {error}")
            return None

        enriched_terms[enrichr_library] = []
        for row in rows:
            term_name = row[1]
            enriched_terms[enrichr_library].append(term_name)
            # Copy instead of appending in place so the parsed response is not mutated.
            enrichr_stats[term_name] = [*row, enrichr_library]

        if delay:
            time.sleep(delay)  # Delay between requests

    return enriched_terms, enrichr_stats


# OPENAI

In [None]:
import requests
import os
import json

def fetch_hypothesis(pm_title, user_desc: str, desct, desc, gse_title, gse_summary: str, cond1, cond2, species, term1: str,term2: str, enriched_terms: dict, enriched_stats: dict, model: str = "gpt-4o", timeout: float = 120) -> str:
    """Ask the OpenAI chat completions API why a Rummagene and a RummaGEO gene set overlap.

    Args:
        pm_title: Title of the paper behind the Rummagene gene set.
        user_desc: Abstract of that paper.
        desct: Name of the paper table the gene set comes from.
        desc: Description of that table.
        gse_title: Title of the GEO study behind the RummaGEO gene set.
        gse_summary: Summary of that study.
        cond1: First condition of the RummaGEO signature.
        cond2: Second condition of the RummaGEO signature.
        species: Species of the RummaGEO signature.
        term1: Rummagene term name.
        term2: RummaGEO term name.
        enriched_terms: Mapping of library name -> list of enriched term names.
        enriched_stats: Mapping of term name -> stats row; items 0, 2, 3 and 9
            are read as rank, p-value, odds ratio and library.
        model: Chat model to query.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The model's hypothesis, with every enriched term that appears in it
        followed by its statistics in parentheses.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset.
        requests.exceptions.RequestException: If the request fails or the API
            answers with an HTTP error status.
    """
    import re  # local import so this cell does not depend on an earlier cell's import

    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        # Fail before sending "Bearer None", which only surfaces as a confusing KeyError later.
        raise RuntimeError("OPENAI_API_KEY environment variable is not set")

    # Define the system prompt
    system_prompt = (
        "You are an AI hypothesis generator for RummagenexRummaGEO (gene sets from crossing Rummagene sets with RummaGEO sets). "
        "You should act as a biologist in hypothesizing why a high overlap may exist between a Rummagene set (which are sets from PubMed papers) and a RummaGEO set (automatically generated signatures from Gene Expression Omnibus)."
    )

    # Build the enriched terms string: one "library: term, term" line per library
    enriched_terms_string = "".join(
        f"{library}: {', '.join(terms)}\n" for library, terms in enriched_terms.items()
    )

    # Define the main prompt to be sent to the AI
    prompt = (
        "Here are two gene sets that highly overlap. The first is from a Rummagene set. "
        "The second is a gene set automatically computed between two conditions in a study from the Gene Expression Omnibus (GEO). "
        "Based upon the term name (formatted as condition 1 vs. condition 2) and the abstract of the GEO gene set, "
        "and the abstract of the Rummagene gene set, please hypothesize about why these two gene sets have a significant high overlap. "
        "You should mention both the summary of the RummaGEO gene set and the description of the Rummagene gene set in your hypothesis. "
        "You will also be provided with enrichment results from the Enrichr database to help you generate your hypothesis, which shows "
        "significantly overlapping functional terms from the overlapping genes of the two sets. "
        "For each enrichment term that appears in your response, the term should appear in the exact form it was given to you "
        "(do not exclude any words or characters from a term. For example, Complement And Coagulation Cascades WP558 should appear as "
        "Complement And Coagulation Cascades WP558, not Complement And Coagulation Cascades). Also, please don't use quotes around the enriched term names. "
        f"Gene set term 1 from RummaGEO: {term2}\n"
        f'"up" or "dn" in this {term2} name indicates if the genes were upregulated or downregulated in {cond1} vs {cond2} conditions in the signature for species {species}. Please make sure to include this detail in your hypothesis.\n'
        f"title of study for gene set term 1: {gse_title}\n"
        f"summary of study for gene set term 1: {gse_summary}\n"
        f"Gene set term 2 from Rummagene: {term1}\n"
        f"title of paper for gene set term 2: {pm_title}\n"
        f"abstract of paper for gene set term 2: {user_desc}\n"
        f"{desct} is the name of the table from the paper which the gene set comes from and {desc} is its description. Please include this detail if relevant\n"
        "Enriched Terms from overlapping genes of the two sets:\n"
        f"{enriched_terms_string}"
    )

    # Send the request to the OpenAI API
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
        json={
            "model": model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ],
            "max_tokens": 1000,
            "temperature": 0
        },
        timeout=timeout,
    )
    # Surface HTTP errors (bad key, rate limit, ...) instead of a KeyError on 'choices'.
    response.raise_for_status()

    # Parse the response from the API
    hypothesis = response.json()['choices'][0]['message']['content']

    # Annotate enriched terms in a single pass. Replacing term by term would
    # re-match a shorter term inside a longer one and inside inserted details.
    mentioned = [term for term in enriched_stats if term and term in hypothesis]
    if mentioned:
        # Longest first so the alternation prefers the full term over a shorter prefix.
        pattern = re.compile("|".join(re.escape(term) for term in sorted(mentioned, key=len, reverse=True)))

        def _with_details(match):
            term = match.group(0)
            stats = enriched_stats[term]
            details = (
                f"Term: {term}\tLibrary: {stats[9]}\tRank: {stats[0]}\tP-value: {float(stats[2]):.2e}\tOdds Ratio: {float(stats[3]):.4f}\n"
            )
            return f"{term} ({details})"

        hypothesis = pattern.sub(_with_details, hypothesis)

    return hypothesis


# GEMINI

In [1]:
import os
import requests
import json
from google.auth import jwt
from google.auth.transport.requests import Request

def fetch_hypothesis(pm_title, user_desc: str, desct, desc, gse_title, gse_summary: str, cond1, cond2, species, term1: str, term2: str, enriched_terms: dict, enriched_stats: dict, service_account_file: str = None, timeout: float = 120) -> str:
    """Ask a Gemini endpoint why a Rummagene and a RummaGEO gene set overlap.

    This definition has the same name as the OpenAI-based ``fetch_hypothesis``
    defined earlier in the file, so whichever of the two cells ran last is the
    one later cells call.

    NOTE(review): the endpoint URL, the ``gemini-model`` name, the token
    audience and the OpenAI-style request/response shape used below could not
    be confirmed from this file -- verify them against the current Gemini API
    documentation before relying on this function.

    Args:
        pm_title: Title of the paper behind the Rummagene gene set.
        user_desc: Abstract of that paper.
        desct: Name of the paper table the gene set comes from.
        desc: Description of that table.
        gse_title: Title of the GEO study behind the RummaGEO gene set.
        gse_summary: Summary of that study.
        cond1: First condition of the RummaGEO signature.
        cond2: Second condition of the RummaGEO signature.
        species: Species of the RummaGEO signature.
        term1: Rummagene term name.
        term2: RummaGEO term name.
        enriched_terms: Mapping of library name -> list of enriched term names.
        enriched_stats: Mapping of term name -> stats row; items 0, 2, 3 and 9
            are read as rank, p-value, odds ratio and library.
        service_account_file: Path of the service-account JSON key. When
            omitted, ``GOOGLE_APPLICATION_CREDENTIALS`` is used if set, else
            the original placeholder path.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The model's hypothesis with every enriched term that appears in it
        followed by its statistics in parentheses, or ``''`` when the response
        carries no message content.
    """
    import re  # local import so this cell does not depend on an earlier cell's import

    # Define the system prompt
    system_prompt = (
        "You are an AI hypothesis generator for RummagenexRummaGEO (gene sets from crossing Rummagene sets with RummaGEO sets). "
        "You should act as a biologist in hypothesizing why a high overlap may exist between a Rummagene set (which are sets from PubMed papers) and a RummaGEO set (automatically generated signatures from Gene Expression Omnibus)."
    )

    # Build the enriched terms string: one "library: term, term" line per library
    enriched_terms_string = "".join(
        f"{library}: {', '.join(terms)}\n" for library, terms in enriched_terms.items()
    )

    # Define the main prompt to be sent to the AI
    prompt = (
        "Here are two gene sets that highly overlap. The first is from a Rummagene set. "
        "The second is a gene set automatically computed between two conditions in a study from the Gene Expression Omnibus (GEO). "
        "Based upon the term name (formatted as condition 1 vs. condition 2) and the abstract of the GEO gene set, "
        "and the abstract of the Rummagene gene set, please hypothesize about why these two gene sets have a significant high overlap. "
        "You should mention both the summary of the RummaGEO gene set and the description of the Rummagene gene set in your hypothesis. "
        "You will also be provided with enrichment results from the Enrichr database to help you generate your hypothesis, which shows "
        "significantly overlapping functional terms from the overlapping genes of the two sets. "
        "For each enrichment term that appears in your response, the term should appear in the exact form it was given to you "
        "(do not exclude any words or characters from a term. For example, Complement And Coagulation Cascades WP558 should appear as "
        "Complement And Coagulation Cascades WP558, not Complement And Coagulation Cascades). Also, please don't use quotes around the enriched term names. "
        f"Gene set term 1 from RummaGEO: {term2}\n"
        f'"up" or "dn" in this {term2} name indicates if the genes were upregulated or downregulated in {cond1} vs {cond2} conditions in the signature for species {species}. Please make sure to include this detail in your hypothesis.\n'
        f"title of study for gene set term 1: {gse_title}\n"
        f"summary of study for gene set term 1: {gse_summary}\n"
        f"Gene set term 2 from Rummagene: {term1}\n"
        f"title of paper for gene set term 2: {pm_title}\n"
        f"abstract of paper for gene set term 2: {user_desc}\n"
        f"{desct} is the name of the table from the paper which the gene set comes from and {desc} is its description. Please include this detail if relevant\n"
        "Enriched Terms from overlapping genes of the two sets:\n"
        f"{enriched_terms_string}"
    )

    # Authenticate with the Gemini API
    credentials_path = service_account_file or os.getenv(
        "GOOGLE_APPLICATION_CREDENTIALS", "path/to/service_account.json"
    )
    credentials = jwt.Credentials.from_service_account_file(
        credentials_path,
        audience="https://gemini.googleapis.com",
    )
    auth_request = Request()
    credentials.refresh(auth_request)

    # Send the request to the Gemini API
    response = requests.post(
        "https://gemini.googleapis.com/v1/models/gemini-model/complete",
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {credentials.token}",
        },
        json={
            "model": "gemini-model",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ],
            "max_tokens": 1000,
            "temperature": 0
        },
        timeout=timeout,
    )

    # Parse the response from the API. Parsing stays best-effort (missing
    # content yields ''), but an empty 'choices' list no longer raises IndexError.
    response_json = response.json()
    choices = response_json.get('choices') or [{}]
    hypothesis = choices[0].get('message', {}).get('content', '') or ''

    # Annotate enriched terms in a single pass. Replacing term by term would
    # re-match a shorter term inside a longer one and inside inserted details.
    mentioned = [term for term in enriched_stats if term and term in hypothesis]
    if mentioned:
        # Longest first so the alternation prefers the full term over a shorter prefix.
        pattern = re.compile("|".join(re.escape(term) for term in sorted(mentioned, key=len, reverse=True)))

        def _with_details(match):
            term = match.group(0)
            stats = enriched_stats[term]
            details = (
                f"Term: {term}\tLibrary: {stats[9]}\tRank: {stats[0]}\tP-value: {float(stats[2]):.2e}\tOdds Ratio: {float(stats[3]):.4f}\n"
            )
            return f"{term} ({details})"

        hypothesis = pattern.sub(_with_details, hypothesis)

    return hypothesis


ModuleNotFoundError: No module named 'google'

In [None]:
# One (rummagene term, rummageo term, overlaps) tuple per row of df, in row order.
tuples_list = list(
    df[["rummagene", "rummageo", "overlaps"]].itertuples(index=False, name=None)
)
# Generated hypotheses keyed by "<rummagene term>;<rummageo term>".
hypotheses = {}


In [None]:
# Generate one hypothesis per (Rummagene term, RummaGEO term) pair. The results
# file is rewritten after every success so an interrupted run keeps what it has.
for i, (pmcid, geo_id, ov) in enumerate(tuples_list):
    term = f"{pmcid};{geo_id}"
    if term in hypotheses:
        continue  # already generated by an earlier run of this cell
    try:
        enrichment = get_enrichr_terms(ov)
        if enrichment is None:
            # get_enrichr_terms prints the request error and returns None on failure.
            raise RuntimeError("Enrichr enrichment failed")
        terms, stats = enrichment
        pmc_entry = fin_pmc[pmcid]
        gse_entry = fin_gse[geo_id]
        hypothesis = fetch_hypothesis(
            pm_title=pmc_entry[0],
            user_desc=pmc_entry[1],
            desct=pmc_entry[2],
            desc=pmc_entry[3],
            gse_title=gse_entry[0],
            gse_summary=gse_entry[1],
            cond1=gse_entry[2],
            cond2=gse_entry[3],
            # Item 4 of the fin_gse tuple is the up/dn direction, not the species;
            # the species is item 2 of the df_rummageo_map entry.
            species=df_rummageo_map[geo_id][2],
            term1=pmcid,
            term2=geo_id,
            enriched_terms=terms,
            enriched_stats=stats,
        )
        hypotheses[term] = hypothesis
        with open('data_hyp.json', 'w') as json_file:
            json.dump(hypotheses, json_file, indent=4)
    except Exception as error:
        # Exception (not a bare except) so Ctrl-C / kernel interrupt still stops the loop.
        print(term)
        print(i)
        print(f"Error: {error!r}")

   
