# Preprocessing Xenopus AC data

## Loading

In [2]:
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#import scvelo as scv
from scipy import stats
import os
import matplotlib
import pickle
%matplotlib inline
import palantir
import harmony

import random
np.random.seed(0)
random.seed(0)

# Preprocessing Klein data

## Loading

In [3]:

exp = np.load('expression.npy')

In [4]:
anno = pd.read_csv('anno.tsv', sep = '\t', index_col = 0)

In [5]:

genes = pd.read_csv("genes.tsv", sep='\t', header = None)[0]
barcodes = pd.read_csv("barcodes.tsv", header = None)[0]

In [6]:
counts = pd.DataFrame(exp, index = genes, columns = barcodes)
counts = counts.T
norm_df = harmony.utils.normalize_counts(counts)
hvg_genes_klein = harmony.utils.hvg_genes(norm_df, no_genes=10000)

In [10]:
counts = norm_df.T

## Mapping to human

In [11]:
mapper = pd.read_csv('Klein_xen_to_human.txt', sep = '\t')
mapper = {x:h for x, h in zip(mapper['X. Tr 9.0 gene symbol'], mapper['Matched human gene symbol'])}



In [12]:
counts.index = counts.index.map(mapper).tolist()

In [13]:
counts = counts.loc[~counts.index.isna()]

In [14]:
counts = counts[~counts.index.duplicated(keep='first')]

In [15]:
counts

Unnamed: 0,GACGATTGAT-AAAGTCGG,AGTTTACGT-GGGAGGTA,CATCGCAG-CGAGATGT,GATTGATCTA-TAAGACGG,GATCGGTTTA-AAGATTGT,AATGCGGAT-CGGCTTAC,AAAAGTCGG-AATGAATG,TGACGTATCGG-CTTTAATC,GACCTGACAC-AGCAGAAC,GATGTTCCAG-CAGTCCCT,...,CAAGGTAC-TACAAACT,GTACGCTT-CGGACAAC,GGAAGTCC-TAGTCGCA,CCATATGA-GGCTACTA,GTCCGTCA-TTATCTGT,TAAACCGA-CGTGGATA,CGTCGAAT-TAGTGGAC,TAGTCTCT-CGGCTTAC,GTATACGT-ATATGCAT,TCTGATTT-CCTATTCA
42SP43,1.936296,0.588166,0.617665,5.111168,2.094241,2.370043,0.909091,4.637789,1.477323,0.772141,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000
42SP50,0.484074,0.000000,4.323657,0.000000,0.698080,1.580028,2.727273,1.855115,0.000000,2.316423,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000
LOC100145494,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000
MGC80885,0.968148,0.000000,0.000000,0.000000,0.698080,0.000000,0.000000,0.927558,0.000000,0.000000,...,0.000000,2.613696,4.110152,0.000000,0.000000,0.00000,0.000000,7.178751,0.000000,0.000000
MT-CO3,15.006293,12.939654,24.706609,66.445183,8.376963,7.900142,30.000000,13.913366,20.682523,21.619952,...,43.149946,5.227392,16.440608,8.257638,22.522523,55.27916,25.396825,0.000000,58.675608,12.330456
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZWILCH,0.000000,0.588166,0.617665,0.000000,0.000000,0.790014,0.000000,0.000000,0.000000,0.772141,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000
ZXDC,0.000000,0.000000,0.000000,0.638896,0.000000,0.000000,0.000000,0.000000,0.000000,4.632847,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,12.330456
ZYG-11,0.968148,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000
ZZEF1,0.000000,0.000000,0.000000,1.916688,0.000000,0.000000,0.909091,0.927558,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000


In [16]:
anno['Dev_group'] = anno.Developmental_stage.map(
{'Stage_8':'Blastula', 
 'Stage_10':'Blastula', 
 'Stage_11':'Blastula', 
 'Stage_12':'Blastula', 
 'Stage_13':'Neurula', 
 'Stage_14':'Neurula', 
 'Stage_16':'Neurula', 
 'Stage_18':'Neurula', 
 'Stage_20':'Neurula', 
 'Stage_22':'Early tailbud'})

In [17]:
adata = sc.AnnData(counts.T)

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


In [18]:
adata

AnnData object with n_obs × n_vars = 136966 × 19910

In [19]:
sc.pp.filter_genes(adata, min_cells=3)

  if not is_categorical(df_full[k]):
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


In [20]:
adata.obs = anno
anno

Unnamed: 0_level_0,InDrops_version,Replicate_name,Clutch_name,Library_name,Barcode_name,Developmental_stage,Cluster_name,Parent_cluster_name,Dev_group
Barcode_sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GACGATTGAT-AAAGTCGG,v2,Timeseries_1,Clutch_1,Library_1,bc001,Stage_8,S08-blastula,Root,Blastula
AGTTTACGT-GGGAGGTA,v2,Timeseries_1,Clutch_1,Library_1,bc002,Stage_8,S08-blastula,Root,Blastula
CATCGCAG-CGAGATGT,v2,Timeseries_1,Clutch_1,Library_1,bc003,Stage_8,S08-blastula,Root,Blastula
GATTGATCTA-TAAGACGG,v2,Timeseries_1,Clutch_1,Library_1,bc004,Stage_8,S08-blastula,Root,Blastula
GATCGGTTTA-AAGATTGT,v2,Timeseries_1,Clutch_1,Library_1,bc005,Stage_8,S08-blastula,Root,Blastula
...,...,...,...,...,...,...,...,...,...
TAAACCGA-CGTGGATA,v3,Timeseries_2,Clutch_6,Library_75,bcHBKV,Stage_22,S22-somite,S20-somite,Early tailbud
CGTCGAAT-TAGTGGAC,v3,Timeseries_2,Clutch_6,Library_75,bcEHFT,Stage_22,S22-presomitic mesoderm,S20-presomitic mesoderm,Early tailbud
TAGTCTCT-CGGCTTAC,v3,Timeseries_2,Clutch_6,Library_75,bcHLOY,Stage_22,S22-presomitic mesoderm,S20-presomitic mesoderm,Early tailbud
GTATACGT-ATATGCAT,v3,Timeseries_2,Clutch_6,Library_75,bcGOPS,Stage_22,S22-optic vesicle,S20-eye primordium,Early tailbud


In [21]:
#klein_anno.Cluster_name.unique()

nne = [
'S11-ionocyte', 'S12-ionocyte', 'S13-ionocyte', 'S14-ionocyte', 'S16-ionocyte', 'S18-ionocyte', 'S20-ionocyte', 'S22-alpha ionocyte', 'S22-beta ionocyte',



'S11-ciliated epidermal progenitor', 'S12-ciliated epidermal progenitor', 'S13-ciliated epidermal progenitor', 'S14-ciliated epidermal progenitor', 'S16-ciliated epidermal progenitor', 'S18-ciliated epidermal progenitor', 'S20-ciliated epidermal progenitor', 'S22-ciliated epidermal progenitor',

'S11-goblet cell', 'S12-goblet cell', 'S13-goblet cell', 'S14-goblet cell', 'S16-goblet cell', 'S18-goblet cell', 'S20-goblet cell', 'S22-goblet cell',

'S12-cement gland primordium', 'S13-cement gland primordium', 'S14-cement gland primordium', 'S16-cement gland primordium', 'S18-cement gland primordium', 'S20-cement gland primordium', 'S22-cement gland primordium',


'S14-hatching gland', 'S16-hatching gland', 'S18-hatching gland', 'S20-hatching gland', 'S22-hatching gland',

'S14-small secretory cells', 'S16-small secretory cells', 'S18-small secretory cells', 'S20-small secretory cells', 'S22-small secretory cells',

'S10-non-neural ectoderm', 'S11-non-neural ectoderm', 'S12-non-neural ectoderm', 'S13-non-neural ectoderm',

'S14-epidermal progenitor - tp63/ctbs', 'S16-epidermal progenitor - tp63/ctbs', 'S18-epidermal progenitor - tp63/ctbs', 'S20-epidermal progenitor - tp63/ctbs', 'S22-epidermal progenitor - tp63/ctbs',

'S14-epidermal progenitor - tp63/tll2', 'S16-epidermal progenitor - tp63/tll2', 'S18-epidermal progenitor - tp63/tll2', 'S20-epidermal progenitor - tp63/tll2', 'S22-epidermal progenitor - tp63/tll2',

'S18-epidermal - aqp3',

'S18-otic placode', 'S20-otic placode', 'S22-otic placode',

'S18-placodal neuron - eya2/neurog1/neurod1', 'S20-placodal neuron - eya2/neurog1/neurod1', 'S22-placodal neuron - eya2/neurog1/neurod1',

'S18-epibranchial and lateral line placodes', 'S20-epibranchial and lateral line placodes', 'S22-epibranchial and lateral line placodes',

'S18-posterior placodal area', 'S20-posterior placodal area', 'S22-posterior placodal area',

'S18-adenohypophyseal placode', 'S20-adenohypophyseal placode', 'S22-adenohypophyseal placode',

'S13-placodal area',

'S14-anterior placodal area', 'S16-anterior placodal area', 'S18-anterior placodal area', 'S20-anterior placodal area', 'S22-anterior placodal area',

'S18-trigeminal and profundal placodes', 'S20-trigeminal and profundal placodes', 'S22-trigeminal and profundal placodes',

'S18-olfactory placode', 'S20-olfactory placode', 'S22-olfactory placode',

'S18-lens placode', 'S20-lens placode', 'S22-lens placode'
]

In [22]:
anno['Cell-type'] = [x[4:] for x in anno['Cluster_name']]

In [23]:
klein_sub_anno = anno.loc[[x in nne for x in anno.Cluster_name]]

In [24]:
adata = adata[[x in nne for x in anno.Cluster_name]]

  if not is_categorical(df_full[k]):


In [25]:
adata.obs = klein_sub_anno

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


In [33]:
klein_sub_anno['Dataset'] = 'Klein'
klein_sub_anno['Species'] = 'Xenopus T'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [28]:
adata.to_df().to_csv('klein_nne_counts.tsv', sep = '\t')

In [34]:
klein_sub_anno.to_csv('klein_nne_meta.tsv', sep = '\t')