In [None]:
from crossing import preprop_GMTs, cross_GMTs, cross_GMTs_rummageo_first
import pandas as pd
import time
from tqdm import tqdm 
import os
import numpy as np

In [None]:
# Preprocess each raw GMT file. The second argument is an output prefix;
# presumably preprop_GMTs writes "<prefix>.csv", since the next cell reads
# exactly those paths -- confirm against the crossing module.
for gmt_path, out_prefix in (
    ("data/latest.gmt", "data/rummagene"),
    ("data/human-geo-auto.gmt.gz", "data/rummageo_human"),
    ("data/mouse-geo-auto.gmt.gz", "data/rummageo_mouse"),
):
    preprop_GMTs(gmt_path, out_prefix)


In [None]:
# Load the three preprocessed gene-set tables.
rummagene = pd.read_csv("data/rummagene.csv")
rummageo_h = pd.read_csv("data/rummageo_human.csv")
rummageo_m = pd.read_csv("data/rummageo_mouse.csv")

# Tag each RummaGEO table with its species (in place, so the per-species
# frames keep the column too) and stack them into one frame.
for geo_frame, species_label in ((rummageo_h, "human"), (rummageo_m, "mouse")):
    geo_frame["species"] = species_label
rummageo_comb = pd.concat([rummageo_h, rummageo_m], ignore_index=True)

# The combined table is written twice: once into the working directory and
# once under data/. NOTE(review): the working-directory copy looks like a
# leftover -- confirm nothing reads it before removing.
for combined_path in ("rummageo_comb.csv", "data/rummageo_comb.csv"):
    rummageo_comb.to_csv(combined_path, index=False)

# Re-read from disk so the in-memory frame matches what later runs will load.
rummageo_comb = pd.read_csv("data/rummageo_comb.csv")


In [None]:
def rummagene_exists(df, file_path, column="identifier"):
    """Return the rows of ``df`` whose ``column`` value is not yet recorded.

    ``file_path`` is a plain-text record with one already-processed term per
    line (surrounding whitespace is ignored). If the file does not exist it
    is created empty, so the first call filters nothing out. Used to avoid
    re-running a crossing that is already done.
    """
    # Guarantee the record file exists before reading it.
    if not os.path.isfile(file_path):
        with open(file_path, 'w'):
            pass

    with open(file_path, 'r') as record:
        already_done = set(map(str.strip, record))

    is_recorded = df[column].isin(already_done)
    return df[~is_recorded]


def rummageo_exists(df, file_path, column="identifier"):
    """Drop from ``df`` every row whose ``column`` value is listed in ``file_path``.

    The file holds one finished term per line; each line is stripped of
    surrounding whitespace before comparison. A missing file is created
    empty, in which case ``df`` comes back unfiltered. This keeps a restarted
    run from repeating crossings that are already done.
    """
    file_missing = not os.path.isfile(file_path)
    if file_missing:
        open(file_path, 'w').close()

    finished_terms = set()
    with open(file_path, 'r') as record:
        for raw_line in record:
            finished_terms.add(raw_line.strip())

    pending_mask = ~df[column].isin(finished_terms)
    return df[pending_mask]

In [None]:
def cross_files(rumma_geo, output_prefix, record_file, batch_size=3000):
    """Cross the Rummagene table against one RummaGEO table, batch by batch.

    Iterates over the notebook-level ``rummagene`` DataFrame (read as a
    global; it must provide ``identifier``, ``genes`` and ``desc`` columns,
    with ``genes`` a ``;``-separated string) in slices of ``batch_size``
    rows. Rows whose identifier is already listed in ``record_file`` are
    dropped via ``rummagene_exists`` so an interrupted run can be restarted;
    the remaining rows are handed to ``cross_GMTs``.

    Parameters
    ----------
    rumma_geo : pandas.DataFrame
        RummaGEO table with ``identifier`` and ``genes`` columns.
    output_prefix : str
        Forwarded unchanged to ``cross_GMTs``.
    record_file : str
        Path of the text file listing identifiers that are already done.
        Forwarded to ``cross_GMTs`` as well -- presumably that function
        appends to it; confirm against the crossing module.
    batch_size : int, default 3000
        Number of Rummagene rows per batch.

    Raises
    ------
    ValueError
        If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Ceiling division: the former ``len // batch_size + 1`` scheduled a
    # spurious empty batch whenever the length was an exact multiple.
    num_batches = -(-len(rummagene) // batch_size)

    # Maps RummaGEO identifier -> set of genes; built once, reused per batch.
    rummageo_dict = {
        identifier: set(genes.split(";"))
        for identifier, genes in zip(rumma_geo["identifier"], rumma_geo["genes"])
    }

    for i in range(num_batches):
        start = time.time()
        start_idx = i * batch_size
        batch_n = rummagene[start_idx:start_idx + batch_size]
        # Skip rows already recorded as done by a previous (partial) run.
        batch = rummagene_exists(batch_n, record_file)
        if len(batch) != 0:
            print(f"This is {len(batch)} rows")
            main_dict = {}  # maps rummagene name to its gene set
            desc_dict = {}  # maps rummagene name to rummagene table description
            rows = zip(batch["identifier"], batch["genes"], batch["desc"])
            for identifier, genes, desc in tqdm(rows, total=batch.shape[0]):
                main_dict[identifier] = set(genes.split(";"))
                desc_dict[identifier] = desc
            cross_GMTs(main_dict, desc_dict, rummageo_dict, output_prefix, record_file)
        else:
            print("O batch")
        elapsed_time = time.time() - start
        print(f"batch_done:  {int(elapsed_time // 3600)}hr, {int((elapsed_time % 3600) // 60)}min, {int(elapsed_time % 60)}sec")
        print(i)
        print(num_batches)

        
# Run the Rummagene x RummaGEO crossing once per species, each with its own
# output prefix and its own record file of finished identifiers.
for geo_table, result_prefix, done_file in (
    (rummageo_h, "data/rummageogene_human", "data/rummagene_h.txt"),
    (rummageo_m, "data/rummageogene_mouse", "data/rummagene_m.txt"),
):
    cross_files(geo_table, result_prefix, done_file)


In [None]:
def cross_files_rummageo_first(rummagene, output_prefix, record_file, batch_size=800, start_batch=0):
    """Cross the combined RummaGEO table against Rummagene, batch by batch.

    Iterates over the notebook-level ``rummageo_comb`` DataFrame (read as a
    global; it must provide ``identifier`` and ``genes`` columns, with
    ``genes`` a ``;``-separated string) in slices of ``batch_size`` rows.
    Rows whose identifier is already listed in ``record_file`` are dropped
    via ``rummageo_exists``; the rest go to ``cross_GMTs_rummageo_first``.

    Parameters
    ----------
    rummagene : pandas.DataFrame
        Rummagene table with ``identifier``, ``genes`` and ``desc`` columns.
    output_prefix : str
        Forwarded unchanged to ``cross_GMTs_rummageo_first``.
    record_file : str
        Path of the text file listing identifiers that are already done.
        Forwarded to ``cross_GMTs_rummageo_first`` as well -- presumably that
        function appends to it; confirm against the crossing module.
    batch_size : int, default 800
        Number of RummaGEO rows per batch.
    start_batch : int, default 0
        Index of the first batch to process. This used to be hard-coded to
        358, which silently skipped the first 358 batches on a fresh run.
        The record file already makes restarts safe, so 0 is the correct
        default; pass a larger value only to skip re-checking early batches.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive or ``start_batch`` is negative.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if start_batch < 0:
        raise ValueError("start_batch must be non-negative")

    # Ceiling division: the former ``len // batch_size + 1`` scheduled a
    # spurious empty batch whenever the length was an exact multiple.
    num_batches = -(-len(rummageo_comb) // batch_size)
    print(num_batches)

    rummagene_dict = {}  # maps rummagene name to its gene set
    desc_dict = {}  # maps rummagene name to rummagene table description
    # One pass over the columns instead of two separate iterrows() passes.
    for identifier, genes, desc in zip(rummagene["identifier"], rummagene["genes"], rummagene["desc"]):
        rummagene_dict[identifier] = set(genes.split(";"))
        desc_dict[identifier] = desc

    for i in range(start_batch, num_batches):
        start = time.time()
        start_idx = i * batch_size
        batch_n = rummageo_comb[start_idx:start_idx + batch_size]
        # Skip rows already recorded as done by a previous (partial) run.
        batch = rummageo_exists(batch_n, record_file)
        if len(batch) != 0:
            print(f"This is {len(batch)} rows")
            main_dict = {}  # maps rummageo name to its gene set
            rows = zip(batch["identifier"], batch["genes"])
            for identifier, genes in tqdm(rows, total=batch.shape[0]):
                main_dict[identifier] = set(genes.split(";"))
            cross_GMTs_rummageo_first(main_dict, desc_dict, rummagene_dict, output_prefix, record_file)
        else:
            print("O batch")
        elapsed_time = time.time() - start
        print(f"batch_done:  {int(elapsed_time // 3600)}hr, {int((elapsed_time % 3600) // 60)}min, {int(elapsed_time % 60)}sec")
        print(i)
        print(num_batches)
        
# Run the reverse-direction crossing (combined RummaGEO table first).
cross_files_rummageo_first(
    rummagene=rummagene,
    output_prefix="data/rummageogene_rummageo_first",
    record_file="data/rummageo_comb.txt",
)

In [None]:
# Load the per-species crossing results and tag each with its species.
human = pd.read_csv("data/rummageogene_human.csv")
mouse = pd.read_csv("data/rummageogene_mouse.csv")
human["species"] = "human"
mouse["species"] = "mouse"
dfe = pd.read_csv("data/rummageogene_rummageo_first.csv")

# Stack all three result tables, keep the first occurrence of each
# (rummagene, rummageo) pair -- so the species-tagged rows win over the
# reverse-direction rows -- then rank by ascending p-value, descending odds.
df = (
    pd.concat([human, mouse, dfe], ignore_index=False)
    .drop_duplicates(subset=["rummagene", "rummageo"])
    .sort_values(by=["p-value", "odds"], ascending=[True, False])
)
# Replace the index with a 1-based rank.
df.index = np.arange(1, len(df.index) + 1)
df

In [None]:
# Persist the merged, ranked table; the 1-based rank index is not written.
df.to_csv(path_or_buf='data/rummagenexrummageo.csv', index=False)
