In [4]:
import pandas as pd
import fsspec

def DataFile(x):
    """Return ``x`` unchanged (identity pass-through)."""
    return x

# Browser-like User-Agent string; _process_link sends it as a request header
# when downloading the files listed in DECAY_CONSTANT_LINKS.
CHROME_USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"

# Per-dataset parsing spec, unpacked by _process_link as
# (file_type, gene_col, hl_col, engine):
#   file_type -- 'tsv' (read with pd.read_csv, tab-separated) or 'excel' (pd.read_excel)
#   gene_col  -- column holding the gene identifiers; the special value "X1" means
#                the first column is read as the index and then named "X1"
#   hl_col    -- half-life column name, or a list of columns averaged row-wise
#   engine    -- engine passed to pd.read_excel (None for tsv files)
DECAY_CONSTANT_FILES = {'Neymotin2014': ('tsv', "Syst", "thalf", None),
                        'Chan2018': ('tsv', "gene_id", ["halflife_160412_r1", "halflife_160412_r2"], None),
                        'Geisberg2015': ('excel', "systematic name", "Half-Life           (in minutes)", 'openpyxl'),
                        'Munchel2011': ('excel', "Systematic Name", "Half-life [min]", 'xlrd'),
                        'Miller2011': ('tsv', "X1", "wt", None)}
                        
# Download URL for each dataset; _process_link looks up the same dataset key
# in both this dict and DECAY_CONSTANT_FILES, so the keys must match.
DECAY_CONSTANT_LINKS = {'Neymotin2014': "https://rnajournal.cshlp.org/content/suppl/2014/08/08/rna.045104.114.DC1/TableS5.xls",
                        'Chan2018': "https://cdn.elifesciences.org/articles/32536/elife-32536-fig1-data2-v4.txt",
                        'Geisberg2015': "https://www.cell.com/cms/10.1016/j.cell.2013.12.026/attachment/5d358c57-4ca0-4216-be37-3cc5c909b375/mmc1.xlsx",
                        'Munchel2011': "https://www.molbiolcell.org/doi/suppl/10.1091/mbc.e11-01-0028/suppl_file/mc-e11-01-0028-s10.xls",
                        'Miller2011': "https://www.embopress.org/action/downloadSupplement?doi=10.1038%2Fmsb.2010.112&file=msb2010112-sup-0001.txt"}
                        

In [5]:
def _process_all_links(genes):
    """Fetch every dataset in DECAY_CONSTANT_LINKS and join them column-wise.

    Parameters
    ----------
    genes : list-like
        Gene identifiers forwarded to ``_process_link`` for each dataset.

    Returns
    -------
    pandas.DataFrame
        The per-dataset frames returned by ``_process_link``, concatenated
        along ``axis=1`` in the iteration order of DECAY_CONSTANT_LINKS.
    """
    per_dataset = []
    for dataset_name in DECAY_CONSTANT_LINKS:
        per_dataset.append(_process_link(genes, dataset_name))

    return pd.concat(per_dataset, axis=1)

def _process_link(genes, dataset):
    """Download one dataset and return its half-life column for ``genes``.

    Parameters
    ----------
    genes : list-like
        Gene identifiers; passed to ``_process_df_hl``, which reindexes the
        result to them.
    dataset : str
        Key present in both DECAY_CONSTANT_FILES and DECAY_CONSTANT_LINKS.

    Returns
    -------
    pandas.DataFrame
        Single-column frame whose column is named ``dataset``.

    Raises
    ------
    KeyError
        If ``dataset`` is missing from either lookup dict.
    ValueError
        If the configured file type is neither ``'tsv'`` nor ``'excel'``.
    """
    spec = DECAY_CONSTANT_FILES[dataset]
    file_type, gene_col, hl_col, engine = spec
    request_opts = {'headers': {'User-Agent': CHROME_USERAGENT}}

    with fsspec.open(DECAY_CONSTANT_LINKS[dataset], client_kwargs=request_opts) as handle:
        if file_type == 'excel':
            raw = pd.read_excel(handle, engine=engine)
        elif file_type == 'tsv':
            # "X1" datasets keep their gene ids in the first column, which is
            # loaded as the index; _process_df_hl turns it back into a column.
            first_col_is_index = gene_col == "X1"
            raw = pd.read_csv(handle, sep="\t", index_col=0 if first_col_is_index else None)
        else:
            raise ValueError("Bad file_type")

    table, value_col = _process_df_hl(raw, genes, gene_col, hl_col)

    # Label the half-life column with the dataset name and keep only it
    return table.rename(columns={value_col: dataset})[[dataset]]


def _process_df_hl(df, genes, gene_col, hl_col):

    if gene_col == "X1":
        df.index.name = "X1"
        df.reset_index(inplace=True)

    if isinstance(hl_col, list):
        df['means'] = df[hl_col].mean(axis=1)
        hl_col = 'means'

    df = df[[gene_col, hl_col]].groupby(gene_col).agg('mean')
    df = df.reindex(genes, axis=0)
    
    return df, hl_col

In [6]:
# Spot-check an Excel-backed dataset (read with the openpyxl engine) on two genes
_process_link(["YPR036W-A", "YOR063W"], 'Geisberg2015')

  warn(msg)


Unnamed: 0_level_0,Geisberg2015
systematic name,Unnamed: 1_level_1
YPR036W-A,39.929172
YOR063W,13.747064


In [None]:
# Spot-check the dataset whose half-life is the row-wise mean of two columns
_process_link(["YPR036W-A", "YOR063W"], 'Chan2018')

In [None]:
# Spot-check the dataset whose gene ids are read from the index ("X1" path)
_process_link(["YPR036W-A", "YOR063W"], 'Miller2011')

In [None]:
# Load the raw Miller2011 table directly (no index_col, unlike _process_link)
# so that its layout can be inspected. Rebinds the globals `dataset` and `df`.
dataset = "Miller2011"
with fsspec.open(DECAY_CONSTANT_LINKS[dataset], client_kwargs = {'headers': {'User-Agent': CHROME_USERAGENT}}) as f:
    df = pd.read_csv(f, sep="\t")

In [None]:
# List the column labels of the raw table loaded in the previous cell
df.columns

In [None]:
# The file type is looked up from DECAY_CONSTANT_FILES inside _process_link,
# which accepts only (genes, dataset); passing file_type= raised TypeError.
_process_link(["YPR036W-A", "YOR063W"], 'Munchel2011')

In [None]:
# Fetch every configured dataset and show them side by side for two genes
_process_all_links(["YPR036W-A", "YOR063W"])