In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import s3fs
import os

# Remote data location: the bucket endpoint and the key prefix under which the
# per-species study folders are read.
endpoint = 'https://d2h2.s3.amazonaws.com/'
base_url = 'data'

# Unauthenticated (anon=True) filesystem handle bound to the endpoint above.
s3 = s3fs.S3FileSystem(
    anon=True,
    client_kwargs=dict(endpoint_url=endpoint),
)


def compute_vis(expr_df, meta_df, n_top_genes=5000):
    """Compute 2-D PCA, t-SNE and UMAP coordinates for the samples of a study.

    Parameters
    ----------
    expr_df : pandas.DataFrame
        Expression matrix with genes as rows (index) and samples as columns.
    meta_df : pandas.DataFrame
        Sample metadata containing a 'Sample_geo_accession' column whose
        values name the columns of ``expr_df``.
    n_top_genes : int, optional
        Number of highest-variance genes kept before the embeddings are
        computed (default 5000).

    Returns
    -------
    pandas.DataFrame
        A copy of the metadata rows whose sample occurs in ``expr_df``, with
        the added columns pca_x, pca_y, tsne_x, tsne_y, umap_x and umap_y.
        Numeric columns are rounded to two decimals; the row index of the
        input metadata is kept.
    """
    samples = expr_df.columns.values

    # Cast up front rather than through AnnData(dtype=...), which newer
    # anndata releases deprecate.
    adata = sc.AnnData(expr_df.T.astype(np.float32))
    adata.var['gene_names'] = expr_df.index.values
    adata.obs['samples'] = samples

    # Rank genes by decreasing variance (rank 0 = most variable). argsort()
    # yields the sorting order, so it has to be inverted to obtain each
    # gene's rank before thresholding.
    variances = np.var(adata.X, axis=0, dtype="float")
    order = np.argsort(-variances, kind="stable")
    var_rank = np.empty(order.shape[0], dtype=np.int64)
    var_rank[order] = np.arange(order.shape[0])
    adata.var['var_rank'] = var_rank
    # Materialise the subset so the steps below do not operate on a view.
    adata = adata[:, adata.var.var_rank < n_top_genes].copy()

    sc.pp.pca(adata, n_comps=2)
    sc.pp.neighbors(adata)
    sc.tl.tsne(adata, perplexity=adata.n_obs // 2)
    sc.tl.umap(adata, n_components=2)

    # Embedding rows follow the expression column order; index them by sample
    # name so they can be matched to the metadata by label, not by position.
    coords = pd.DataFrame(index=pd.Index(samples))
    for prefix, key in (('pca', 'X_pca'), ('tsne', 'X_tsne'), ('umap', 'X_umap')):
        coords[f'{prefix}_x'] = adata.obsm[key][:, 0]
        coords[f'{prefix}_y'] = adata.obsm[key][:, 1]

    # Work on a copy so neither the caller's frame nor a filtered slice of it
    # is mutated.
    keep = meta_df['Sample_geo_accession'].isin(samples)
    df_y = meta_df[keep].copy()
    aligned = coords.reindex(df_y['Sample_geo_accession'])
    for col in coords.columns:
        df_y[col] = aligned[col].to_numpy()

    # The original called set_index('Sample_geo_accession') here but discarded
    # the result; the index is deliberately left untouched so the returned
    # frame keeps the layout callers already receive.
    return df_y.round(decimals=2)

In [None]:
# Load the study metadata; the batch loop indexes it by species name.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# load a pickle that comes from a trusted source.
with open('../app/static/data/metadata-v2.pickle', 'rb') as f:
    gse_metadata = pickle.load(f)

In [None]:
def _completed_ids(species):
    """Return the study ids that already have an entry under data_coords/<species>.

    An absent output directory (first run) yields an empty set instead of
    raising.
    """
    out_dir = f'data_coords/{species}'
    if not os.path.isdir(out_dir):
        return set()
    # NOTE(review): entry names are cut at the first '-' (kept from the
    # original), while outputs are written to a folder named exactly `gse`;
    # confirm whether study ids can contain '-'.
    prefixes = (entry.split('-')[0] for entry in os.listdir(out_dir))
    return {name for name in prefixes if name != '.DS_Store'}


human_done = _completed_ids('human')
mouse_done = _completed_ids('mouse')
COORDS_computed = {'human': human_done, 'mouse': mouse_done}

species_list = ['human', 'mouse']

for species in species_list:
    for gse in tqdm(gse_metadata[species], desc=species):
        if gse in COORDS_computed[species]:
            continue

        metadata_file = f"{base_url}/{species}/{gse}/{gse}_Metadata.tsv"
        expr_file = f"{base_url}/{species}/{gse}/{gse}_Expression.tsv"
        try:
            # Context managers make sure the remote file handles are closed.
            with s3.open(metadata_file) as fh:
                meta_df = pd.read_csv(fh, sep='\t', index_col=0)
            with s3.open(expr_file) as fh:
                expr_df = pd.read_csv(fh, sep='\t', index_col=0)

            # Mask negative / infinite values to NaN, then drop every gene
            # (row) that has a NaN in any sample.
            expr_df = expr_df[(expr_df < np.inf) & (expr_df >= 0)].dropna().astype(float)

            # Only studies with more than two samples are embedded.
            if len(expr_df.columns) > 2:
                meta_df_new = compute_vis(expr_df, meta_df.reset_index())
                os.makedirs(f'data_coords/{species}/{gse}', exist_ok=True)
                meta_df_new.to_csv(f'data_coords/{species}/{gse}/{gse}_Metadata.tsv', sep='\t')
                expr_df.to_csv(f'data_coords/{species}/{gse}/{gse}_Expression.tsv', sep='\t')
        except Exception as e:
            # Report the failing study and move on; one bad dataset must not
            # stop the remaining studies of this species from being processed.
            print(gse, e)
            continue
               