# Prepare workspace

In [None]:
#Import relevant packages
import numpy as np
import pandas as pd
from matplotlib import rcParams
import os
import scanpy as sc
import sys
import anndata
from collections import defaultdict

import matplotlib as mpl
import matplotlib.pyplot as plt

#For nice color schemes
import cmocean

#For barplots 
import seaborn as sns

In [None]:
#Import rnasieve for deconvolution of bulk RNA-Seq 
from rnasieve.preprocessing import model_from_raw_counts

In [None]:
#For better graphing (altair); also import find_mixtures from rnasieve
import altair as alt
from rnasieve.algo import find_mixtures

In [None]:
#Set the working directory so the relative file names used below resolve
os.chdir(path='/hpc/group/goldsteinlab/Python')

In [None]:
#Use a default font size of 15 for all matplotlib figures
plt.rcParams['font.size'] = 15

In [None]:
#Pandas display settings: never truncate columns, show at most 50 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 50

# Load data

In [None]:
#Load the bulk RNA-Seq counts matrix from csv
#The first column holds the gene names and becomes the index
#Every remaining column is one bulk sample
df = pd.read_csv(filepath_or_buffer='19_ONB_plus_3_control_filtered.csv', index_col=0)

In [None]:
#Create obs (per-sample metadata) for the bulk AnnData object
#The tumor ID is taken as the second '_'-separated field of each column name
#NOTE(review): assumes every column name contains at least one '_'; otherwise the [1] lookup raises IndexError
l1 = [t.split('_')[1] for t in df.columns]

#Sanity check: number of samples per tumor ID
#Printed explicitly, because a notebook cell only displays its last expression
print(pd.Series(l1).value_counts())

#One row per bulk sample, indexed by the sample (column) names
#NOTE(review): every sample is labelled 'ONB', although the input file name mentions control samples - confirm this is intended
obs = pd.DataFrame(index=df.columns)
obs['Tumor_type'] = 'ONB'
obs['Tumor_ID'] = l1
obs

In [None]:
#Build var: a column-less dataframe whose index is the gene index of the counts matrix
df_genes = pd.DataFrame(index=df.index)

In [None]:
#Assemble the bulk anndata object
#The counts matrix is transposed so rows line up with obs and columns with var
adata_bulk = anndata.AnnData(
    X=df.to_numpy().T,
    obs=obs,
    var=df_genes,
)
adata_bulk

In [None]:
#Load the single cell reference data
#Expected to be an anndata object whose clusters are annotated by cell identity
adata_ref = anndata.read_h5ad(filename='All_cells_annotated_16_samples.h5ad')

In [None]:
#Check cluster names and the number of cells in each cluster
#for example
#size() counts the rows per group directly instead of calling len on every group through apply
adata_ref.obs.groupby('cluster_names').size()

The workspace is now ready to proceed to running the deconvolution model.