# Import packages and data

In [None]:
import os
import numpy as np
import pandas as pd
import datetime
from pathlib import Path
import scanpy as sc
import re
from pprint import pprint

import matplotlib as mpl
import matplotlib.pyplot as plt
import cmocean
import seaborn as sns
from mpl_toolkits import mplot3d
%matplotlib inline  

# Set scanpy's logging verbosity to level 4.
# NOTE(review): presumably the most detailed (debug) level — confirm against the scanpy settings docs.
sc.settings.verbosity = 4

In [None]:
# Load the AnnData object used by every later cell from the local .h5ad file.
adata = sc.read_h5ad('COVID_dataset_scvi_1.h5ad')

# Get OR info per cell

In [None]:
def get_name(s):
    """Extract the ``gene_name`` value from a ``;``-separated attribute string.

    Args:
        s: Attribute string made of ``;``-separated fields, e.g.
            ``gene_id "X"; gene_name "Y";``.

    Returns:
        The last whitespace-separated token of the field that mentions
        ``gene_name``, with surrounding double quotes removed, or ``None``
        when no field mentions ``gene_name``.

    Raises:
        ValueError: If more than one field mentions ``gene_name``.
    """
    # str.split() without an argument splits on any run of whitespace and
    # ignores leading/trailing whitespace, so a field such as
    # ' gene_name "Y" ' yields '"Y"' instead of an empty trailing token
    # (which is what splitting on a single space produced).
    genes = [
        field.split()[-1].strip('"')
        for field in s.split(";")
        if "gene_name" in field
    ]
    if len(genes) > 1:
        raise ValueError("Found multiple genes")
    return genes[0] if genes else None

In [None]:
# Download the tab-separated annotation table: it has no header row, and
# anything following a "#" is treated as a comment and skipped.
gtf = pd.read_csv(
    "https://static-content.springer.com/esm/art%3A10.1186%2Fs12864-020-6583-3/MediaObjects/12864_2020_6583_MOESM5_ESM.txt",
    header=None,
    comment="#",
    sep="\t",
)
# Column 8 holds the attribute string that get_name() parses into a gene name.
gtf["gene"] = gtf[8].apply(get_name)
# Report how many distinct gene names were recovered.
print(gtf["gene"].nunique())

In [None]:
# Read the first sheet of the supplementary annotation workbook.
human_anno = pd.read_excel(
    "https://static-content.springer.com/esm/art%3A10.1186%2Fs12864-020-6583-3/MediaObjects/12864_2020_6583_MOESM2_ESM.xlsx",
    sheet_name=0,
)
# Normalise the headers: spaces become underscores and everything is lower-cased.
human_anno.columns = [
    header.replace(" ", "_").lower() for header in human_anno.columns
]

# One row per distinct combination of the selected columns, keyed by gene symbol.
# set_index() replaces the existing index, so no prior reset is required.
bm_human = (
    human_anno.loc[
        :, ["gene_symbol", "gene_name", "chromosome", "strand", "ensembl_gene_id"]
    ]
    .drop_duplicates()
    .set_index("gene_symbol")
)

# For every gene symbol, count the rows of each transcript biotype; biotypes
# that never occur for a symbol come out of unstack() as NaN and are set to 0.
ct = (
    human_anno.loc[human_anno["gene_symbol"].isin(bm_human.index)]
    .groupby("gene_symbol")["transcript_biotype"]
    .value_counts()
    .unstack()
    .replace(np.nan, 0)
)

# Attach the two biotype count columns of interest to the gene table.
bm_human = bm_human.join(ct.loc[:, ["protein_coding", "unprocessed_pseudogene"]])
print(bm_human["protein_coding"].value_counts())

# Two-column lookup of gene symbol -> Ensembl gene ID with short column names.
func_human = bm_human.reset_index()[["gene_symbol", "ensembl_gene_id"]]
func_human.columns = ["gene", "Ens"]

In [None]:
# Boolean mask over adata.var: True where the gene's ID is listed in
# bm_human.ensembl_gene_id.
is_olfr = adata.var.gene_ids.isin(bm_human.ensembl_gene_id)
print(is_olfr.sum())
# Row-wise maximum over the flagged gene columns, stored as one value per row
# of adata.obs. `.toarray()` is used instead of the `.A` shorthand because
# `.A` is not available on SciPy sparse arrays, whereas `.toarray()` works on
# both sparse matrices and sparse arrays.
# NOTE(review): assumes adata.X is a SciPy sparse container — confirm.
adata.obs["olfr_max"] = adata.X[:, is_olfr].max(1).toarray().flatten()

# Display the 10 flagged genes with the largest total_counts.
adata.var[is_olfr].nlargest(10, "total_counts")

In [None]:
# Combined mask: True where a gene is flagged in the `mito` column, in the
# `ribo` column, or by the `is_olfr` mask.
is_OR_mt_gene = adata.var["mito"] | adata.var["ribo"] | is_olfr

In [None]:
# Scatter plot of total_counts (x) against n_genes (y).
sc.pl.scatter(adata, y='n_genes', x='total_counts')
# Histogram of n_genes in 100 bins with a log-scaled count axis and no bar outlines.
plt.hist(adata.obs["n_genes"], bins=100, linewidth=0, log=True)
# Dashed red vertical guide line at n_genes = 250.
plt.axvline(250, linestyle="--", color="red")