In [None]:
import pandas as pd
import numpy as np
import pyarrow.feather as feather
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# species whose signature tables get processed
species_list = ['human', 'mouse']

# (label, row mask) pairs: the mask picks the rows of a frame with an 'fc'
# column whose fold change is strictly positive ('up') or negative ('down')
direction = [
    ('up', lambda df: df["fc"] > 0),
    ('down', lambda df: df["fc"] < 0),
]

# maximum number of genes kept per signature and direction
top_count = 250

# token pattern that preserves dashes inside a word (tokens need >= 2 characters)
dash_regex = r"(?u)\b\w[\w-]*\w\b"

In [None]:
# Convert every per-species csv table to a feather file so that single
# signature columns can be read quickly when the gene sets are built.
# Feather files do not store the index, so it is named 'genes' and moved
# into a regular column before writing.
for species in species_list:
    for dat_type in ['fc', 'pval']:
        df = (
            pd.read_csv(f"all_{species}_{dat_type}.csv", index_col = 0)
            .rename_axis('genes')
            .reset_index()
        )
        feather.write_feather(df, f"all_{species}_{dat_type}.feather")
        print(species, dat_type, 'done')

In [None]:
def decapitalize(string):
    '''
    decapitalize the first word if it's not an acronym

    string: title to normalise; a value that is not a str is returned untouched
    returns: the title with its first space-delimited word lower-cased, unless
             that word has an uppercase letter somewhere other than at the start
             of a run of word characters (e.g. "RNA", "mRNA"); a string without
             any space is returned unchanged
    '''
    if not isinstance(string, str):
        return string
    first, sep, rest = string.partition(" ")
    if not sep:
        # single word (or empty string): nothing to split off, keep as is
        return string
    # raw string: \B and \w are regex escapes, not Python string escapes
    if re.search(r"\B\w*[A-Z]\w*", first) is None:
        first = first.lower()
    return f"{first} {rest}"

def tfidf_shorten_term(idx, vectorizer, term_mat, kept_words = 4):
    '''
    shorten term based on tf-idf score, keeping kept_words words

    idx: row of term_mat (one document) to shorten
    vectorizer: fitted vectorizer exposing get_feature_names_out()
                (or the legacy get_feature_names() of older scikit-learn)
    term_mat: sparse document x term matrix supporting getrow() and [row, col]
    kept_words: maximum number of terms to keep

    returns: numpy array of terms; when the row has at most kept_words non-zero
             terms all of them are returned in column order, otherwise the
             kept_words highest scoring terms in decreasing score order
    '''
    _, column = term_mat.getrow(idx).nonzero()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # accessor and fetch the vocabulary once instead of once per column
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    terms = np.array([feature_names[c] for c in column])
    if len(terms) <= kept_words:
        return terms
    tfidf_val = np.array([term_mat[idx, c] for c in column])
    sorted_val_idx = np.argsort(tfidf_val)[::-1][:kept_words]
    return terms[sorted_val_idx]

In [None]:
# Build the raw [human, mouse] x [up, down] gene set libraries: for every
# signature (one column of the score table) keep the top_count genes with the
# smallest p-value among the genes selected by each direction mask.
for species in species_list:
    # one list of gmt rows per direction label
    gmt_tuples = {d: [] for d, _ in direction}
    master_df = pd.read_csv(f"all_{species}_score.csv", index_col = 0)
    for n, col in enumerate(master_df.columns):
        if n % 500 == 0: print(n, species, 'done')
        # identifying top top_count up and down genes for each signature
        try:
            # column name is "{title} {gse id}_{signature number}"
            title, gse = col.rsplit(" ", 1)
            fc_df = feather.read_feather(f"all_{species}_fc.feather", columns = ["genes", col]).set_index('genes')
            pval_df = feather.read_feather(f"all_{species}_pval.feather", columns = ["genes", col]).set_index('genes')
            comb_df = pd.concat([fc_df, pval_df], axis=1)
            comb_df.columns = ['fc', 'pval']
            rows = {}
            for d, d_func in direction:                                                     # subsetting based on fc sign
                top_genes = list(comb_df[d_func].sort_values(by='pval').index[:top_count])  # sorting based on pval magnitude
                rows[d] = (gse, title, *top_genes)
        except Exception as err:
            # Exception (not a bare except) so that a kernel interrupt still stops
            # the loop; the signature is skipped and reported with the reason
            print(col, repr(err))
            continue
        # append only once every direction succeeded: the per-direction files
        # are paired by row position later on, so they must stay aligned
        for d, row in rows.items():
            gmt_tuples[d].append(row)

    # creating raw [human, mouse] x [up, down] gene set library gmt files
    # row: {gse id}_{signature number}\t{gse title}\tGENE_1\tGENE_2\t...\tGENE_{top_count}
    for d, _ in direction:
        pd.DataFrame(gmt_tuples[d]).to_csv(f"{species}_{d}_gene_set_raw.gmt", index = False, header = False, sep = "\t")
    print(species, 'done')

In [None]:
# Replace each signature's term name with "{tf-idf shortened title} {gse id}_{signature number}"
# and write the final [human, mouse] x [up, down] gene set libraries.
for species in species_list:
    print("starting", species)

    # the titles are read from column 1 of the raw "up" library
    gmt = pd.read_csv(f"{species}_up_gene_set_raw.gmt", sep = "\t", header = None, usecols = [1])
    gmt.columns = ["raw_titles"]
    # certain studies had their metadata privatized on GEO so no title
    gmt = gmt.fillna("")
    gmt["titles"] = gmt["raw_titles"].map(decapitalize)

    gmt_unique = pd.DataFrame({"unique_titles": gmt["titles"].unique()})
    # tokens of each unique title (lower-cased), dashes kept via dash_regex
    gmt_unique["unique_list"] = gmt_unique["unique_titles"].str.lower().str.findall(dash_regex)

    vectorizer = TfidfVectorizer(token_pattern = dash_regex, stop_words = 'english')
    term_mat = vectorizer.fit_transform(gmt_unique["unique_titles"])

    print('making raw_key_terms...')
    # top tf-idf terms of each unique title, as returned by tfidf_shorten_term
    gmt_unique["raw_key_terms"] = [
        tfidf_shorten_term(i, vectorizer, term_mat)
        for i in gmt_unique.index
    ]
    print('done making raw_key_terms')

    # same terms, reordered by first appearance in the title
    gmt_unique["key_terms_ordered"] = [
        sorted(terms, key = tokens.index)
        for terms, tokens in zip(gmt_unique["raw_key_terms"], gmt_unique["unique_list"])
    ]
    # space-joined ordered terms, used as the shortened title
    gmt_unique["key_terms"] = gmt_unique["key_terms_ordered"].map(" ".join)

    # one shortened title per row of the raw library, in the original row order
    matched_titles = pd.merge(
        gmt[["titles"]],
        gmt_unique[["unique_titles", "key_terms"]],
        how = 'left',
        left_on = "titles",
        right_on = "unique_titles",
    )

    for d, _ in direction:
        print(f"preparing {d} file for {species}...")
        new_gmt = pd.read_csv(f"{species}_{d}_gene_set_raw.gmt", sep = "\t", header = None)
        # rows are paired with matched_titles by position
        new_term_names = matched_titles["key_terms"] + " " + new_gmt[0]

        # creating modified [human, mouse] x [up, down] gene set library gmt files
        # row: {shortened_gse_titles {gse id}_{signature number}}\t{gse title}\tGENE_1\tGENE_2\t...\tGENE_n
        pd.concat(
            [new_term_names, new_gmt.iloc[:, 1:]],
            axis = 1,
        ).to_csv(f"{species}_{d}_gene_set.gmt", sep = "\t", index = False, header = False)
        print(f"done preparing {d} file for {species}")

    # computed from the term names of the last direction processed above
    print(f"term name length distribution for {species}:")
    print(new_term_names.map(len).value_counts(bins=range(0, 110, 10)).sort_index())
    print('done with', species)