In [None]:
import pandas as pd
import numpy as np
import json


In [None]:
# Load the full Rummagene x RummaGEO crossing table; the bare `df` renders it as the cell output.
df = pd.read_csv(filepath_or_buffer="data/rummagenexrummageo.csv")
df

In [None]:
# Keep one row per paper, rank by p-value (ascending) then odds (descending),
# re-index with a 1-based rank and keep the first 1000 rows.
# NOTE(review): duplicates are dropped BEFORE sorting, so the row kept for each
# pmc_id is the first one in file order, not necessarily its most significant one.
ranked = (
    df.drop_duplicates(subset="pmc_id", keep="first")
    .sort_values(by=["p-value", "odds"], ascending=[True, False])
)
ranked.index = np.arange(1, len(ranked) + 1)
df = ranked.head(1000)
df

In [None]:
# Attach the gene list of every Rummagene / RummaGEO set referenced in `df`.
rummagene = pd.read_csv("data/rummagene.csv")
rummageo_comb = pd.concat(
    [pd.read_csv("data/rummageo_human.csv"), pd.read_csv("data/rummageo_mouse.csv")],
    ignore_index=True,
)

# Select the referenced identifiers with a vectorised `isin` instead of rebuilding
# `df[...].tolist()` and scanning it linearly for every row of the lookup tables.
# As with the original dict comprehension, the last row wins for a repeated identifier.
rummagene_hits = rummagene[rummagene["identifier"].isin(set(df["rummagene"]))]
rummagene_dict = dict(zip(rummagene_hits["identifier"], rummagene_hits["genes"]))

rummageo_hits = rummageo_comb[rummageo_comb["identifier"].isin(set(df["rummageo"]))]
rummageo_dict = dict(zip(rummageo_hits["identifier"], rummageo_hits["genes"]))

# `assign` builds a new frame, so the columns are not written into a `head()` slice
# of an earlier frame (which can trigger pandas' SettingWithCopyWarning).
df = df.assign(
    rummagene_genes=df["rummagene"].map(rummagene_dict),
    rummageo_genes=df["rummageo"].map(rummageo_dict),
)
df

In [None]:
# Write the current table to disk without the row index so later cells can reload it.
df.to_csv(path_or_buf="data/rummagenexrummageo_1k.csv", index=False)


In [None]:
# Drop the two large lookup tables, then reload the saved table from disk.
del rummageo_comb, rummagene
df = pd.read_csv(filepath_or_buffer="data/rummagenexrummageo_1k.csv")

df


In [None]:
# Prefix of the RummaGEO identifier before its first "-" (used later as a key
# into the per-species GEO metadata).
df["geo_gsei"] = df["rummageo"].str.split("-").str[0]

# Rummagene identifier -> [table description, PMC id]
df_rummagene_map = {
    record["rummagene"]: [record["rummagene-desc"], record["pmc_id"]]
    for _idx, record in df.iterrows()
}

# RummaGEO identifier -> [geo_gse, geo_gsei, species]
df_rummageo_map = {
    record["rummageo"]: [record["geo_gse"], record["geo_gsei"], record["species"]]
    for _idx, record in df.iterrows()
}


In [None]:
# Load the JSON metadata used by later cells: GEO study records (title/summary),
# paper titles and abstracts, and the per-species processed GEO metadata.
with open("../data/gse_results.json") as f:
    gse_info = json.load(f)

with open("../data/title_abs.json") as f:
    pmc_sum = json.load(f)

with open("../data/human-gse-processed-meta.json", "r") as f:
    gse_human_info = json.load(f)

with open("../data/mouse-gse-processed-meta.json", "r") as f:
    gse_mouse_info = json.load(f)

In [None]:
import re

def extract_after_extension(termid):
    """Return the text that follows a ``<name>.<ext>-`` prefix in ``termid``.

    The prefix is the shortest leading span that ends in ``.`` plus word
    characters and is followed by one or more dashes. Everything after those
    dashes is returned (possibly an empty string). ``None`` is returned when
    ``termid`` has no such prefix.
    """
    found = re.match(r'^(.+?\.\w+)-+(.*)$', termid)
    return found.group(2) if found else None


In [None]:
def pmc_again(pmcids):
    """Build the per-Rummagene-set description tuples.

    ``pmcids`` maps a Rummagene term to a sequence whose item 0 is the table
    description and item 1 is the key used to look the paper up in ``pmc_sum``.

    Returns a dict mapping each term to
    ``(paper title, paper abstract, extract_after_extension(term), table description)``.
    """
    pmc_dict = {}
    for term in pmcids:
        entry = pmcids[term]
        paper = pmc_sum[entry[1]]
        pmc_dict[term] = (
            paper["title"],
            paper["abstract"],
            extract_after_extension(term),
            entry[0],
        )
    return pmc_dict


fin_pmc = pmc_again(df_rummagene_map)
fin_pmc


In [None]:
def gse_again(gseids):
    """Build the per-RummaGEO-set description tuples.

    ``gseids`` maps a RummaGEO term to a sequence ``[gse_info key, metadata key, species]``.
    The human metadata is used when species is ``"human"``, the mouse metadata otherwise.

    The term is split on ``"-"``: pieces 1 and 3 index the ``"titles"`` mapping of
    the metadata entry (the two conditions), and the first space-separated token
    of piece 4 is taken as the direction.
    # NOTE(review): presumably terms look like "<id>-<cond1>-vs-<cond2>-<up|dn> ..." — confirm.

    Returns a dict mapping each term to
    ``(study title, first summary entry, condition 1, condition 2, direction)``.
    """
    gse_dict = {}
    for term in gseids:
        info = gseids[term]
        meta = gse_human_info if info[2] == "human" else gse_mouse_info

        study = gse_info[info[0]]
        title = study["title"]
        summary = study["summary"][0]

        pieces = term.split("-")
        condition_titles = meta[info[1]]["titles"]
        cond1 = condition_titles[pieces[1]]
        cond2 = meta[info[1]]["titles"][pieces[3]]
        direction = pieces[4].split(" ")[0]

        gse_dict[term] = (title, summary, cond1, cond2, direction)
    return gse_dict

fin_gse = gse_again(df_rummageo_map)
fin_gse


In [None]:
import requests
import time

def get_enrichr_terms(genes, desc, timeout=60):
    """Submit a gene list to Enrichr and collect the top 3 terms per library.

    Parameters
    ----------
    genes : str
        Gene symbols separated by ``";"``; single quotes are stripped.
    desc : str
        Description stored with the uploaded list.
    timeout : float, optional
        Seconds to wait for each HTTP request. Without a timeout a stalled
        connection would block the notebook indefinitely; a timeout is reported
        like any other request error.

    Returns
    -------
    tuple or None
        ``(enriched_terms, enrichr_stats, enrichr_link, enrichr_page_link)`` where
        ``enriched_terms`` maps library -> list of term names, and
        ``enrichr_stats`` maps term name -> the raw Enrichr result row with the
        library name appended as its last element. ``None`` is returned (after
        printing the error) when any request fails.
    """
    ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/'
    enrichr_libraries = ['WikiPathway_2023_Human', 'GWAS_Catalog_2023', 'GO_Biological_Process_2023', 'MGI_Mammalian_Phenotype_Level_4_2024']

    # One gene per line, as sent in the 'list' form field.
    genes_string = '\n'.join(genes.split(";")).replace("'", '')

    try:
        response = requests.post(
            ENRICHR_URL + 'addList',
            files={'list': (None, genes_string), 'description': (None, desc)},
            timeout=timeout,
        )
        response.raise_for_status()
        data = response.json()
        user_list_id = data['userListId']
        short_id = data['shortId']
    except requests.exceptions.RequestException as error:
        print(f"Error: {error}")
        return

    enriched_terms = {}
    enrichr_stats = {}
    enrichr_link = f'{ENRICHR_URL}view?userListId={user_list_id}'
    enrichr_page_link = f"https://maayanlab.cloud/Enrichr/enrich?dataset={short_id}"

    for enrichr_library in enrichr_libraries:
        query_string = f'enrich?userListId={user_list_id}&backgroundType={enrichr_library}'
        try:
            response = requests.get(
                ENRICHR_URL + query_string,
                headers={'Accept': 'application/json'},
                timeout=timeout,
            )
            response.raise_for_status()
            data = response.json()

            enriched_terms[enrichr_library] = []
            for term in data[enrichr_library][:3]:  # Limit to top 3 results
                term_name = term[1]
                enriched_terms[enrichr_library].append(term_name)
                # Keep the raw result row and tag it with the library it came from.
                enrichr_stats[term_name] = term
                enrichr_stats[term_name].append(enrichr_library)

            time.sleep(0.5)  # Delay between requests

        except requests.exceptions.RequestException as error:
            print(f"Error: {error}")
            return

    return enriched_terms, enrichr_stats, enrichr_link, enrichr_page_link


## OPENAI

In [None]:
import requests
import os
import json
import matplotlib.pyplot as plt
import numpy as np


def _format_enriched_terms(enriched: dict) -> str:
    """Render ``{library: [term, ...]}`` as one ``library: term, term`` line per library."""
    return "".join(f"{library}: {', '.join(terms)}\n" for library, terms in enriched.items())


def _openai_chat(messages: list, max_tokens: int) -> str:
    """POST ``messages`` to the OpenAI chat-completions endpoint and return the reply text.

    The API key is read from the ``OPENAI_API_KEY`` environment variable.
    """
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
        },
        json={
            "model": "gpt-4o",
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": 0
        }
    )
    return response.json()['choices'][0]['message']['content']


def fetch_hypothesis(pm_title, user_desc: str, desct, desc, gse_title, gse_summary: str, cond1, cond2, species,  term2: str, enriched_terms: dict,  enriched_termsi: dict, enriched_termseo: dict,ova) -> tuple:
    """Ask the OpenAI chat API for a hypothesis explaining a gene-set overlap, plus a title.

    Parameters
    ----------
    pm_title, user_desc : title and abstract of the paper behind the Rummagene set.
    desct, desc : name and description of the source table of the Rummagene set.
    gse_title, gse_summary : title and summary of the GEO study behind the RummaGEO set.
    cond1, cond2, species : the two compared conditions and the species of the signature.
    term2 : the RummaGEO term name (carries the "up"/"dn" direction).
    enriched_terms : ``{library: [terms]}`` for the overlapping genes.
    enriched_termsi : ``{library: [terms]}`` for the Rummagene genes.
    enriched_termseo : ``{library: [terms]}`` for the RummaGEO genes.
    ova : number of overlapping genes.

    Returns
    -------
    tuple
        ``(hypothesis, title)``; the title is generated from the hypothesis by a
        second API call and stripped of surrounding whitespace.
    """
    system_prompt = (
        "You are an AI hypothesis generator for RummagenexRummaGEO (gene sets from crossing Rummagene sets with RummaGEO sets). "
        "You should act as a biologist in hypothesizing why a high overlap may exist between a Rummagene set (which are sets from PubMed papers) and a RummaGEO set (automatically generated signatures from Gene Expression Omnibus)."
    )

    # Each source gets its OWN string. Previously all three loops appended to
    # enriched_terms_string, so the RummaGEO/Rummagene strings reached the prompt
    # empty and the "overlap" list was polluted with the other two sets' terms.
    enriched_terms_string = _format_enriched_terms(enriched_terms)
    enriched_termseo_string = _format_enriched_terms(enriched_termseo)
    enriched_termsi_string = _format_enriched_terms(enriched_termsi)

    # Generate the hypothesis
    prompt = (
    f"Here are two gene sets that highly overlap. The first is from a a gene set automatically computed between two conditions in a study from the Gene Expression Omnibus (GEO)."
    f"The second is a Rummagene set, which comes from a supplementary table in a published biomedical paper."
    f"Based on the term name (formatted as condition 1 vs. condition 2) and the abstract of the GEO gene set, "
    f"as well as the abstract of the Rummagene gene set, hypothesize why these two gene sets have significant overlap. "
    f"In your hypothesis, summarize the RummaGEO gene set and describe the Rummagene gene set. "
    f"You will also be provided with enrichment results from the Enrichr database. These results show "
    f"significantly overlapping functional terms from the overlapping genes of the two sets, as well as the terms from the source genes. "
    f"For each enrichment term you mention, include it exactly as given (e.g., Complement And Coagulation Cascades WP558 should appear "
    f"as Complement And Coagulation Cascades WP558, not as Complement And Coagulation Cascades). Do not use quotes around enrichment term names. "
    f"\n\nGene set term 1 from RummaGEO:\n"
    f'"up" or "dn" in this {term2} name indicates whether the genes were upregulated or downregulated in {cond1} vs. {cond2} conditions '
    f"in the signature for species {species}. Make sure to include this detail in your hypothesis.\n"
    f"Title of study for gene set term 1: {gse_title}\n"
    f"Summary of study for gene set term 1: {gse_summary}\n"
    f"Please mention these details: The enriched terms from the RummaGEO genes are: {enriched_termseo_string}\n\n"
    f"Gene set term 2 from Rummagene:\n"
    f"Title of paper for gene set term 2: {pm_title}\n"
    f"Abstract of paper for gene set term 2: {user_desc}\n"
    f"Please mention these details: The enriched terms from the Rummagene genes are: {enriched_termsi_string}\n"
    f"{desct} is the name of the table from the paper where this gene set originates, and {desc} is its description. Include this detail if relevant.\n\n"
    f"There are {ova} overlapping genes.\n"
    f"Enriched Terms from overlapping genes of the two sets:\n"
    f"{enriched_terms_string}\n\n"
    f"Please compare the enriched terms from the overlapping genes ({enriched_terms_string}) to the enriched terms from the RummaGEO set "
    f"({enriched_termseo_string}) and the Rummagene set ({enriched_termsi_string}). "
    f"Use this comparison to support your hypothesis about why these gene sets overlap significantly, considering shared pathways or conditions."
)

    # First call: the hypothesis itself.
    hypothesis = _openai_chat(
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1000,
    )

    # Second call: a short title derived from the hypothesis text.
    title_prompt = (
        f"Based on the following hypothesis, generate a concise and descriptive title:\n\n{hypothesis}"
    )
    title = _openai_chat([{"role": "user", "content": title_prompt}], max_tokens=50).strip()

    return hypothesis, title

## GEMINI

In [None]:
# Working subset: at most the first 1000 rows of the table.
dfe = df.head(n=1000)

In [None]:
def load_json_cache(path):
    """Return the parsed JSON content of ``path``, or an empty dict if the file is missing.

    The previous inline checks were inverted: they tried to read the cache only
    when it did NOT exist (raising FileNotFoundError) and discarded it when it did.
    """
    if not os.path.exists(path):
        return {}
    with open(path, "r") as f:
        return json.load(f)


# Previously generated hypotheses (empty on a first run).
hypotheses = load_json_cache('../data/data_hyp.json')

# Cached Enrichr results (empty on a first run).
# NOTE(review): the original checked '../data/data_enrichr2.json' but opened
# 'data/data_enrichr2.json'; the checked path is used for both here, matching
# the hypotheses cache — confirm which directory actually holds the file.
enrichr_stuff = load_json_cache('../data/data_enrichr2.json')


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import io
import textwrap
from collections import defaultdict

def generate_bar(enriched_stats, enriched_statseo, enriched_statsi, show=False):
    """Draw a stacked horizontal bar chart of enriched terms and return it as an image buffer.

    Each argument maps a term name to a result row in which index 2 is the
    p-value and index 9 is the library name.
    # NOTE(review): presumably these are the rows produced by get_enrichr_terms /
    # get_enrichr_tuple (raw Enrichr row + appended library) — confirm.

    Parameters
    ----------
    enriched_stats : dict
        Terms of the overlapping genes (blue segment, "Overlap").
    enriched_statseo : dict
        Terms of the RummaGEO set (red segment, "RummaGEO").
    enriched_statsi : dict
        Terms of the Rummagene set (orange segment, "Rummagene").
    show : bool
        Also display the figure via ``plt.show()``.

    Returns
    -------
    io.BytesIO
        The rendered figure, positioned at the start of the buffer.
    """
    # Segment order inside each stacked bar: overlap, Rummagene, RummaGEO.
    ordered_sources = (enriched_stats, enriched_statsi, enriched_statseo)
    # Smallest positive float: keeps -log10 finite when a p-value underflowed to 0.
    smallest_p = np.nextafter(0, 1)

    combined_data = defaultdict(lambda: {"p_values": [0, 0, 0], "sources": []})
    for idx, data in enumerate(ordered_sources):
        for term, values in data.items():
            p_value = max(float(values[2]), smallest_p)
            combined_data[term]["p_values"][idx] = -np.log10(p_value)
            combined_data[term]["sources"].append(values[9])

    # One label per term, listing its libraries; sorted so labels are
    # reproducible (iteration order of a set of strings is not).
    all_terms = [
        f"{term} ({', '.join(sorted(set(info['sources'])))})"
        for term, info in combined_data.items()
    ]
    all_p_values = [info["p_values"] for info in combined_data.values()]

    # Sort by the total stacked -log10(p) in descending order
    total_p_values = [sum(pvals) for pvals in all_p_values]
    sorted_indices = np.argsort(total_p_values)[::-1]
    sorted_terms = [all_terms[i] for i in sorted_indices]
    sorted_p_values = [all_p_values[i] for i in sorted_indices]

    # Wrap long labels
    wrapped_labels = ['\n'.join(textwrap.wrap(label, width=70)) for label in sorted_terms]

    overlap = [p[0] for p in sorted_p_values]
    rummagene = [p[1] for p in sorted_p_values]
    rummageo = [p[2] for p in sorted_p_values]

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        bar_width = 0.5
        y_pos = list(range(len(sorted_terms)))

        # Stack the three segments left to right
        ax.barh(y_pos, overlap, height=bar_width, color='blue', label='Overlap')
        ax.barh(y_pos, rummagene, height=bar_width, left=overlap, color='orange', label='Rummagene')
        ax.barh(y_pos, rummageo, height=bar_width, left=[a + b for a, b in zip(overlap, rummagene)], color='red', label='RummaGEO')

        ax.set_yticks(y_pos)
        ax.set_yticklabels(wrapped_labels, fontsize=11)
        ax.set_xlabel('-log10(P-value)', fontsize=11)
        ax.set_title('Enriched Terms (Stacked by Source)', fontsize=12, fontweight="bold")
        ax.legend(
            loc='center left',
            bbox_to_anchor=(1.05, 0.5),
            fontsize=10,
            markerscale=0.5,
            borderpad=0.5,
            frameon=True
        )

        ax.invert_yaxis()  # highest total at the top
        fig.tight_layout()

        buffer = io.BytesIO()
        fig.savefig(buffer, bbox_inches="tight", dpi=200)
        buffer.seek(0)  # hand the buffer back ready to be read

        if show:
            plt.show()
    finally:
        # This function is called once per hypothesis; without closing, every
        # figure stays registered with pyplot and memory grows with each call.
        plt.close(fig)
    return buffer


In [None]:
import requests
import time

def get_enrichr_tuple(tup, delay=10, timeout=60):
    """Fetch the top 3 enriched terms per library for an existing Enrichr list.

    Parameters
    ----------
    tup : str
        Enrichr URL whose query string carries the list id; the text after the
        first ``"="`` of the query string is used as ``userListId``.
    delay : float, optional
        Seconds to sleep after each library request (default 10, the value
        that was previously hard-coded).
    timeout : float, optional
        Seconds to wait for each HTTP request, so a stalled connection cannot
        hang the run; a timeout is reported like any other request error.

    Returns
    -------
    dict or None
        Term name -> raw Enrichr result row with the library name appended as
        its last element. ``None`` (after printing the error) if a request fails.
    """
    ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/'
    ele = tup.split("?")[1]
    user_list_id = ele.split("=")[1]
    enrichr_libraries = ['WikiPathway_2023_Human', 'GWAS_Catalog_2023', 'GO_Biological_Process_2023', 'MGI_Mammalian_Phenotype_Level_4_2024']
    enrichr_stats = {}
    for enrichr_library in enrichr_libraries:
        query_string = f'enrich?userListId={user_list_id}&backgroundType={enrichr_library}'
        try:
            response = requests.get(
                ENRICHR_URL + query_string,
                headers={'Accept': 'application/json'},
                timeout=timeout,
            )
            response.raise_for_status()
            data = response.json()

            for term in data[enrichr_library][:3]:  # Limit to top 3 results
                term_name = term[1]
                # Keep the raw result row and tag it with the library it came from.
                enrichr_stats[term_name] = term
                enrichr_stats[term_name].append(enrichr_library)

            time.sleep(delay)  # Delay between requests

        except requests.exceptions.RequestException as error:
            print(f"Error: {error}")
            return

    return enrichr_stats


In [None]:
from fpdf import FPDF
from PIL import Image
from fpdf.enums import XPos, YPos


def write_markdown_hypotheses_to_pdf(hypotheses, enricher, output_filename):
    """
    Write Markdown-formatted hypotheses to a PDF, starting each one on a new page.

    Parameters
    ----------
    hypotheses : dict
        Maps ``"<rummagene term>;<rummageo term>"`` to a dict with the keys
        ``"title"``, ``"hypothesis"``, ``"pval"`` and ``"Sizes"``
        (``Sizes[0]`` overlap size, ``Sizes[1]`` RummaGEO size, ``Sizes[2]`` Rummagene size).
    enricher : dict
        Maps the same keys to a dict whose ``"enrich view"`` entry is a sequence
        of link pairs; item 0 of each pair is passed to ``get_enrichr_tuple`` and
        item 1 of the first pair is used as the "Enrichr Link" target.
        # NOTE(review): the three pairs are assumed to be ordered
        # overlap, RummaGEO, Rummagene — confirm against the code that builds them.
    output_filename : str
        Path of the PDF to write.

    The enrichment bar chart is inserted after the third paragraph of each
    hypothesis; hypotheses with fewer than three paragraphs get no chart.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_left_margin(15)
    pdf.set_right_margin(15)
    #need to download fonts
    pdf.add_font('Arial', '', 'Arial-Unicode-Regular.TTF')
    pdf.add_font('Arial', 'B', 'Arial-Unicode-Bold.TTF')
    pdf.add_font('Arial', 'I', 'Arial-Unicode-Italic.TTF')
    pdf.set_font('Arial', size=12)
    for idx, (name, entry) in enumerate(hypotheses.items()):
            title = entry["title"]
            hypothesis = entry["hypothesis"]
            links = enricher[name]["enrich view"]
            # Generate bar image 
            enrich = links[0][1]
            pval = entry["pval"]
            gene_size = entry["Sizes"][2]
            geo_size = entry["Sizes"][1]
            ova_size = entry["Sizes"][0]
            pdf.add_page()
            pdf.set_font_size(size=12)
            pdf.multi_cell(0, 5, f"**Hypothesis {idx + 1}: {title}**", new_x=XPos.LMARGIN, new_y=YPos.NEXT, align='C', markdown=True)
            pdf.ln(1)
            plain_text = hypothesis
            parts = plain_text.split("\n\n")
            para_count = 0
            gene, geo = name.split(";")
            pmcid = gene.split("-")[0]
            geoid = geo.split("-")[0]
            geoid = geoid.split(",")[0]
            value = str(pval)
            if pval == 0:
                # A stored 0 means the p-value underflowed double precision.
                value = "<1e-324"
            pdf.set_font_size(size=11)
            prefix_text = "**Rummagene set:** "
            hyperlink_text = f"{gene}"
            pdf.cell(0, 5, prefix_text, markdown=True)
            pdf.set_text_color(0, 0, 255)  # Blue
            pdf.set_font(style="U")  # Underline
            pdf.set_x(50)
            pdf.multi_cell(0, 5, hyperlink_text, new_x=XPos.LMARGIN, new_y=YPos.NEXT, link=f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}")
            pdf.set_text_color(0, 0, 0)  
            prefix_text = "**RummaGEO set:**"
            hyperlink_text = f"{geo}"
            pdf.set_font('Arial')
            pdf.cell(0, 5, prefix_text, markdown=True)
            pdf.set_x(50)
            pdf.set_text_color(0, 0, 255)  # Blue
            pdf.set_font(style="U")  # Underline
            pdf.cell(0, 5, hyperlink_text, new_x=XPos.LMARGIN, new_y=YPos.NEXT, link=f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={geoid}")
            pdf.set_text_color(0, 0, 0) 
            #sizes
            pdf.set_font('Arial')
            pdf.multi_cell(0, 5, f"**Rummagene set size**: {int(gene_size)}; **RummaGEO set size**: {int(geo_size)}" ,new_x=XPos.LMARGIN, new_y=YPos.NEXT, markdown=True)
            pdf.cell(0, 5, f"**Overlap set size**: {int(ova_size)};  **p-value**: {value};  ", markdown=True)
            #enrichr
            pdf.set_x(92)
            hyperlink_text = "Enrichr Link"
            pdf.set_text_color(0, 0, 255)  # Blue
            pdf.set_font(style="U")  # Underline
            pdf.cell(0, 5, hyperlink_text, new_x=XPos.LMARGIN, new_y=YPos.NEXT, link=f"{enrich}")
            pdf.set_text_color(0, 0, 0) 
            pdf.ln(1)
            pdf.set_text_color(0, 0, 0)
            pdf.set_font('Arial', size=11)
            for ele in parts:
                # str.replace returns a new string; the result was previously
                # discarded, so em dashes were never actually replaced.
                ele = ele.replace("—", "-")
                pdf.multi_cell(0, 5, f"{ele}", new_x=XPos.LMARGIN, new_y=YPos.NEXT, align="J", markdown=True)
                pdf.ln(2)
                para_count += 1
                if para_count == 3:
                    bar_data = [get_enrichr_tuple(link[0]) for link in links]
                    if any(stats is None for stats in bar_data):
                        # get_enrichr_tuple returns None after a failed request.
                        # Skip this figure instead of crashing and losing the
                        # whole (not yet written) PDF.
                        print(f"Skipping enrichment figure for {name}: Enrichr statistics unavailable")
                        continue
                    base64image = generate_bar(
                        enriched_stats=bar_data[0], 
                        enriched_statseo=bar_data[1], 
                        enriched_statsi=bar_data[2], 
                        show=False
                    )
                    pdf.image(base64image, w=pdf.epw)  
                    base64image.close()
                    pdf.ln(1)
                    pdf.set_font('Arial',size=9)
                    # Colour names match generate_bar (the Rummagene segment is drawn in orange).
                    legend = "**Figure 1.** Stacked Bar Plot depicting the distribution of significantly enriched terms among the Rummagene set(orange), the RummaGEO set(red) and the overlapping set(blue). The top 3 enriched terms from these Enrichr Libraries were used for each set: WikiPathway_2023_Human, GWAS_Catalog_2023, GO_Biological_Process_2023 and MGI_Mammalian_Phenotype_Level_4_2024"
                    pdf.multi_cell(0, 5, legend, new_x=XPos.LMARGIN, new_y=YPos.NEXT, align="L", markdown=True)
                    pdf.set_font('Arial', size=11)
                    pdf.ln(1)



    pdf.output(output_filename)
write_markdown_hypotheses_to_pdf(hypotheses,enrichr_stuff, "data/Hypotheses.pdf")