Reference: https://nbviewer.org/github/KrishnaswamyLab/MELD/blob/main/notebooks/Wagner2018_Chordin_Cas9_Mutagenesis.ipynb

In [None]:
# pip install --user meld phate magic-impute cmocean diffxpy seaborn

In [None]:
import pandas as pd
import numpy as np
import graphtools as gt
import phate
import magic
import scprep
import meld
import cmocean
import sklearn
import scipy
import seaborn as sns

# setting defaults for matplotlib font sizes
import matplotlib.pyplot as plt
plt.rc('font', size=14)

# making sure plots & clusters are reproducible
np.random.seed(42)

%load_ext autoreload
%autoreload 2

import diffxpy.api as de

In [None]:
# Re-seed NumPy's global RNG. NOTE: this replaces the seed of 42 that the
# setup cell above installs, so 0 is the seed actually in effect from here on.
np.random.seed(0)

### 1. Import zebrafish data
This data is available from GEO Series GSE112294.

In [None]:
# --- Disabled: original remote-loading path, kept for reference only. ---
# It streams the per-sample count matrices and cluster-ID files straight from
# the GEO FTP server via scprep. The next cell performs the same load from
# manually downloaded local copies instead.
#
# sample_info = [('GSM3067201', 'chd', 'A'), ('GSM3067202', 'chd', 'B'), ('GSM3067203', 'chd', 'C'),
#            ('GSM3067204', 'tyr', 'A'), ('GSM3067205', 'tyr', 'B'), ('GSM3067206', 'tyr', 'C')]
# counts_url = 'ftp://ftp.ncbi.nlm.nih.gov/geo/samples/' \
#              'GSM3067nnn/{accession}/suppl/{accession}_{genotype}{replicate}' \
#              '.csv.gz'
# clusters_url = 'ftp://ftp.ncbi.nlm.nih.gov/geo/samples/' \
#                'GSM3067nnn/{accession}/suppl/{accession}_{genotype}{replicate}_' \
#                'clustID.txt.gz'
# cluster_names_url = 'ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE112nnn/GSE112294/' \
#                     'suppl/GSE112294_ClusterNames.csv.gz'
# sparse = True
# counts_matrices = [] 
# batch_labels = []
# metadata = []
# for accession, genotype, replicate in sample_info:
#     curr_label = '{}{}'.format(genotype, replicate)
#     print('Downloading {accession}.{genotype}.{replicate} ...'.format(accession=accession, genotype=genotype, replicate=replicate))
    
#     batch_labels.append(curr_label)
    
#     data = scprep.io.load_csv(counts_url.format(accession=accession, genotype=genotype, replicate=replicate),
#                               sparse=sparse, cell_axis='column')
#     counts_matrices.append(data)
    
#     clusters = scprep.io.load_csv(clusters_url.format(accession=accession, genotype=genotype, replicate=replicate),
#                        cell_names=data.index, gene_names=['clusterID'], sparse=sparse)
#     metadata.append(clusters)

# data, sample_labels = scprep.utils.combine_batches(counts_matrices, batch_labels, append_to_cell_names=True)
# metadata, _ = scprep.utils.combine_batches(metadata, batch_labels, append_to_cell_names=True)
# data.head()

In [None]:
# Remote loading did not work, so the GEO files were downloaded manually and
# are read from local disk here.
#
# Produces two frames that share the same row index (one row per cell, with
# the sample label appended to each cell name):
#   data     - the transposed count matrices of all samples, stacked
#   metadata - clusterID, sample_labels and genotype for every cell
sample_info = [('GSM3067201', 'chd', 'A'), ('GSM3067202', 'chd', 'B'), ('GSM3067203', 'chd', 'C'),
               ('GSM3067204', 'tyr', 'A'), ('GSM3067205', 'tyr', 'B'), ('GSM3067206', 'tyr', 'C')]

# Directory holding the manually downloaded files (machine-specific path).
data_dir = '/Users/mhuang/code/python/abundance/data/zebrafish'

# Gather the per-sample frames and concatenate once after the loop. Calling
# pd.concat inside the loop re-copies everything accumulated so far on every
# iteration, and starting from an empty DataFrame is unnecessary.
counts_frames = []
metadata_frames = []

for accession, genotype, replicate in sample_info:
    sample_label = f'{genotype}{replicate}'
    counts_path = f'{data_dir}/{accession}_{sample_label}.csv.gz'
    clusterID_path = f'{data_dir}/{accession}_{sample_label}_clustID.txt.gz'

    ## 1. counts
    # The file is transposed after reading so that the CSV's columns become
    # the rows of `df`.
    df = pd.read_csv(counts_path, compression='gzip', sep=',', index_col='Row')
    df = df.T
    # Suffix row labels with the sample label so they stay distinguishable
    # once all samples are stacked.
    df.index = df.index + f'_{sample_label}'
    counts_frames.append(df)

    ## 2. metadata
    # The cluster-ID file has no header; its rows are matched to the count
    # rows purely by position (assigning the index raises on a length mismatch).
    meta = pd.read_csv(clusterID_path, compression='gzip', header=None)
    meta.index = df.index
    meta.columns = ['clusterID']
    meta['sample_labels'] = sample_label
    # Take the genotype directly from sample_info instead of re-deriving it
    # from the label prefix afterwards.
    meta['genotype'] = genotype
    metadata_frames.append(meta)

data = pd.concat(counts_frames)
metadata = pd.concat(metadata_frames)

In [None]:
# Lookup table from cluster ID to cluster name, indexed by the ClusterID column.
ClusterNamesMaps = pd.read_csv(
    "/Users/mhuang/code/python/abundance/data/zebrafish/GSE112294_ClusterNames.csv",
    index_col='ClusterID',
)
# Drop the first six characters of every name.
# NOTE(review): presumably a fixed-width prefix shared by all names — confirm against the file.
ClusterNamesMaps['ClusterName'] = ClusterNamesMaps['ClusterName'].str[6:]

In [None]:
# Look up the cluster name for every cell's clusterID (raises KeyError if an ID
# is missing from the lookup table) and attach the result, row for row, as the
# 'cluster' column of the metadata.
cluster_names = pd.Series(
    ClusterNamesMaps.loc[metadata['clusterID'], 'ClusterName'].to_numpy(),
    index=metadata.index,
    name='ClusterName',
)
metadata['cluster'] = cluster_names
metadata.head()

### 2. Preprocess

In [None]:
# Filtering, in the order the steps actually run:
#   1. remove genes that are expressed in relatively few cells
#   2. filter by library size: keep the cells below a cutoff of 15000
#   3. filter cells on their expression of LOC101885394 (cutoff 164)
#      NOTE(review): presumably this is the dead-cell removal step; which side
#      of the cutoff is kept is left to scprep's default — confirm.
data = scprep.filter.filter_rare_genes(data)

# Diagnostic plot for choosing the library-size cutoff:
# scprep.plot.plot_library_size(data, cutoff=15000);
data, metadata = scprep.filter.filter_library_size(data, metadata, cutoff=15000, keep_cells='below')

# Diagnostic plot for choosing the gene-expression cutoff:
# scprep.plot.plot_gene_set_expression(data, genes=['LOC101885394'], log='y', cutoff=164)
data, metadata = scprep.filter.filter_gene_set_expression(
    data,
    metadata,
    genes=['LOC101885394'],
    cutoff=164,
)

In [None]:
# Normalization: library-size normalize the filtered counts. With
# return_library_size=True the call also returns the library sizes, which are
# stored in the metadata as the 'library_size' column.
data_libnorm, libsize = scprep.normalize.library_size_normalize(data, return_library_size=True)
metadata['library_size'] = libsize

In [None]:
# Square-root transformation: element-wise square root of the
# library-size-normalized data.
data_sqrt = np.sqrt(data_libnorm)