In [None]:
import re
import pathlib
from pprint import pprint
import os

from pathlib import Path
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import pyLDAvis
import pyLDAvis.gensim

import gensim
import gensim.corpora as corpora

from sklearn.manifold import TSNE
from bokeh.plotting import figure

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer

In [None]:
# Absolute path of the directory one level above the current working directory.
parent_dir = os.path.abspath(os.pardir)
parent_dir

In [None]:
from common import data_dir 
import json

In [None]:

# Load the Enrichr link table. The path is built with os.path.join because
# os.path.abspath() returns no trailing separator, so plain concatenation would
# glue the file name onto the directory name.
with open(os.path.join(parent_dir, "data_enrichr2.json"), "r") as c:
    enrichr = json.load(c)

# Each entry's "enrich view" value is indexed as three sequences; the first
# element of each one becomes a link column.
df = pd.DataFrame(
    [
        (
            name,
            stuff["enrich view"][0][0],
            stuff["enrich view"][1][0],
            stuff["enrich view"][2][0],
        )
        for name, stuff in enrichr.items()
    ],
    columns=["Name", "Enrichr View Link", "Enrichr RummaGEO Link", "Enrichr Rummagene Link"],
)

# "Name" is split on ';' into exactly two parts, which become the merge keys.
# NOTE(review): presumably the format is "<rummagene>;<rummageo>" - confirm.
df[["rummagene", "rummageo"]] = df["Name"].str.split(";", expand=True)

df


In [None]:
# Read the pair-level table, then keep only the link rows (capped at the first
# 1,000,000) whose (rummagene, rummageo) pair also appears in it.
dfe = pd.read_csv(f"{parent_dir}/rummagenexrummageo.csv")
df = df.head(1000000).merge(dfe, on=["rummagene", "rummageo"])
df

In [None]:
import requests
import time

def get_enrichr_terms(tup, timeout=30):
    """Fetch the top enriched terms for one Enrichr list from several libraries.

    Parameters
    ----------
    tup : str
        Enrichr link. The text after the first ``?`` is split on ``=`` and the
        second piece is sent to the API as ``userListId``.
    timeout : float, optional
        Value passed to ``requests.get(timeout=...)`` for every request so a
        stalled connection cannot block the notebook forever. Defaults to 30.

    Returns
    -------
    tuple of (dict, dict) or None
        ``(enriched_terms, enrichr_stats)``. ``enriched_terms`` maps each
        library name to at most three ``"<term> (<library>)"`` labels.
        ``enrichr_stats`` maps each term name to a copy of its result row with
        the library name appended; a term name that occurs in several libraries
        keeps only the last one seen. ``None`` is returned (after printing a
        message) when no list id can be read from the link, a request fails,
        or a response lacks the requested library.
    """
    ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/'
    enrichr_libraries = ['WikiPathway_2023_Human', 'GWAS_Catalog_2023', 'GO_Biological_Process_2023', 'MGI_Mammalian_Phenotype_Level_4_2024']

    # A link without '?'/'=' or a non-string cell (e.g. NaN) yields no id; report
    # it in the same "print and return None" way as a failed request.
    try:
        ele = tup.split("?")[1]
        user_list_id = ele.split("=")[1]
    except (AttributeError, IndexError):
        print(f"Error: could not read a list id from link {tup!r}")
        return None

    enriched_terms = {}
    enrichr_stats = {}
    for enrichr_library in enrichr_libraries:
        query_string = f'enrich?userListId={user_list_id}&backgroundType={enrichr_library}'
        try:
            response = requests.get(
                ENRICHR_URL + query_string,
                headers={'Accept': 'application/json'},
                timeout=timeout,
            )
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as error:
            # ValueError covers a body that is not valid JSON on requests
            # versions whose decode error is not a RequestException.
            print(f"Error: {error}")
            return None

        if not isinstance(data, dict) or enrichr_library not in data:
            print(f"Error: response contains no results for {enrichr_library}")
            return None

        enriched_terms[enrichr_library] = []
        for term in data[enrichr_library][:3]:  # Limit to top 3 results
            term_name = term[1]
            enriched_terms[enrichr_library].append(f"{term_name} ({enrichr_library})")
            # Store a copy so the parsed response row is not mutated.
            enrichr_stats[term_name] = [*term, enrichr_library]
        time.sleep(2)  # Delay between requests

    return enriched_terms, enrichr_stats


In [None]:
import time

# Accumulators for the enrichment loop: `results` collects one record per
# processed row and `indices` remembers which row labels are already done,
# so the loop cell can be re-run without repeating them.
results, stats, indices = [], [], []


In [None]:

def _join_terms(terms_by_library):
    """Flatten the per-library term lists into one ';'-separated string."""
    return ";".join(label for labels in terms_by_library.values() for label in labels)


for index, row in df.iterrows():
    # Rows already recorded in `indices` are skipped, so this cell can be re-run
    # after an interruption without repeating finished requests.
    if index in indices:
        continue

    enrichment_overlap = get_enrichr_terms(row["Enrichr View Link"])
    enrichment_geo = get_enrichr_terms(row["Enrichr RummaGEO Link"])
    enrichment_gene = get_enrichr_terms(row["Enrichr Rummagene Link"])

    # get_enrichr_terms returns None on failure; a row is recorded only when
    # all three lookups succeeded, otherwise it stays eligible for a retry.
    if enrichment_overlap and enrichment_geo and enrichment_gene:
        # The overlap stats get their own name so the module-level `stats`
        # list is not overwritten by the unpacking.
        terms, stats_overlap = enrichment_overlap
        termseo, statseo = enrichment_geo
        termsgene, statsgene = enrichment_gene

        results.append({
            "GeneSet": row["Name"],
            "Overlap Terms": _join_terms(terms),
            # Each label now matches the link its terms were fetched from
            # (the two values were previously swapped).
            "Rummagene Terms": _join_terms(termsgene),
            "RummaGEO Terms": _join_terms(termseo),
        })
        indices.append(index)
    time.sleep(2)

In [None]:
# Show the first collected record as a sanity check (raises IndexError when
# nothing has been collected yet).
results[0]

In [None]:
# Turn the collected records into a table: one row per record, with the
# record keys as column names.
results_df = pd.DataFrame(results)
results_df

In [None]:
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder (relative to the current working directory) exists first.
os.makedirs("data", exist_ok=True)
results_df.to_csv("data/enriched_terms_overall.csv", index=False)

In [None]:
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

def _split_terms(column):
    """Flatten a column of ';'-joined strings into a single list of phrases."""
    flattened = []
    for joined in column:
        flattened.extend(joined.split(";"))
    return flattened


# Every individual phrase from each of the three term columns.
all_phrases = _split_terms(results_df["Overlap Terms"])
all_phraseseo = _split_terms(results_df["RummaGEO Terms"])
all_phrasesgene = _split_terms(results_df["Rummagene Terms"])

# How often each phrase occurs.
phrase_counts = Counter(all_phrases)
phrase_countseo = Counter(all_phraseseo)
phrase_countsgene = Counter(all_phrasesgene)

# The ten most frequent phrases per column, as (phrase, count) pairs.
most_common_phrases = phrase_counts.most_common(10)
most_common_phraseseo = phrase_countseo.most_common(10)
most_common_phrasesgene = phrase_countsgene.most_common(10)

# Only the overlap ranking is charted in this cell.
phrases, counts = zip(*most_common_phrases)

# Horizontal bars, most frequent phrase at the top.
plt.figure(figsize=(6, 10))
axes = plt.gca()
axes.barh(phrases, counts, color="black")
axes.set_xlabel("Frequency")
axes.set_title("Most Common Enriched Terms in the top 1000 Hypotheses")
axes.invert_yaxis()  # first (most common) entry on top
plt.show()
